ch04

import%20marimo%0A%0A__generated_with%20%3D%20%220.18.4%22%0Aapp%20%3D%20marimo.App(width%3D%22medium%22)%0A%0Awith%20app.setup%3A%0A%20%20%20%20from%20importlib.metadata%20import%20version%0A%0A%20%20%20%20import%20marimo%20as%20mo%0A%20%20%20%20import%20matplotlib.pyplot%20as%20plt%0A%20%20%20%20import%20tiktoken%0A%20%20%20%20import%20torch%0A%20%20%20%20import%20torch.nn%20as%20nn%0A%20%20%20%20from%20ch03%20import%20MultiHeadAttention%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%20Chapter%204%3A%20Implementing%20a%20GPT%20model%20from%20Scratch%20To%20Generate%20Text%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%204.1%20Coding%20an%20LLM%20architecture%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20print(%22matplotlib%20version%3A%22%2C%20version(%22matplotlib%22))%0A%20%20%20%20print(%22torch%20version%3A%22%2C%20version(%22torch%22))%0A%20%20%20%20print(%22tiktoken%20version%3A%22%2C%20version(%22tiktoken%22))%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20This%20is%20hyperparameters%20of%20our%20GPT-2%20model.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20GPT_CONFIG_124M%20%3D%20%7B%0A%20%20%20%20%20%20%20%20%22vocab_size%22%3A%2050257%2C%20%20%20%20%23%20Vocabulary%20size%0A%20%20%20%20%20%20%20%20%22context_length%22%3A%201024%2C%20%23%20Context%20length%0A%20%20%20%20%20%20%20%20%22emb_dim%22%3A%20768%2C%20%20%20%20%20%20%20%20%20%23%20Embedding%20dimension%0A%20%20%20%20%20%20%20%20%22n_heads%22%3A%2012%2C%20%20%20%20%20%20%20%20%20%20%23%20Number%20of%20attention%20heads%0A%20%20%20%20%20%20%20%20%22n_layers%22%3A%2012%2C%20%20%20%20%20%20%20%20%20%23%20Number%20of%20layers%0A%20%20%20%20%20%20%20%20%22drop_rate%22%3A%200.1%2C%20%20%20%20%20%20%20%23%20Dropout%20rate%0A%20%20%20%20%20%20%20%20%22qkv_bias%22%3A%20False%20%20%20%20%20%20%20%23%20Query-Key-Value%20bias%0A%20%20%20%20%7D%0A%20%20%20%20return%20(GPT_CONFIG_124M%2C)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20This%20is%20placeholder%20architecture%20for%20our%20GPT2%20model.%20The%20placeholders%20will%20be%20replaced.%0A%0A%20%20%20%20The%20%22Dummy%22%20means%20simple%20model%20by%20using%20simple%20placeholders%20just%20to%20check%20forward%20path%20algorithms.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.class_definition%0Aclass%20DummyGPTModel(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20cfg)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20%23%20mapping%20from%20ID0~vocab_size-1%20to%20emb_dim%20dimension%20vectors%0A%20%20%20%20%20%20%20%20self.tok_emb%20%3D%20nn.Embedding(cfg%5B%22vocab_size%22%5D%2C%20cfg%5B%22emb_dim%22%5D)%0A%20%20%20%20%20%20%20%20self.pos_emb%20%3D%20nn.Embedding(cfg%5B%22context_length%22%5D%2C%20cfg%5B%22emb_dim%22%5D)%0A%20%20%20%20%20%20%20%20self.drop_emb%20%3D%20nn.Dropout(cfg%5B%22drop_rate%22%5D)%0A%0A%20%20%20%20%20%20%20%20%23%20Use%20a%20placeholder%20for%20TransformerBlock%0A%20%20%20%20%20%20%20%20self.trf_blocks%20%3D%20nn.Sequential(%0A%20%20%20%20%20%20%20%20%20%20%20%20*%5BDummyTransformerBlock(cfg)%20for%20_%20in%20range(cfg%5B%22n_layers%22%5D)%5D)%0A%0A%20%20%20%20%20%20%20%20%23%20Use%20a%20placeholder%20for%20LayerNorm%0A%20%20%20%20%20%20%20%20self.final_norm%20%3D%20DummyLayerNorm(cfg%5B%22emb_dim%22%5D)%0A%20%20%20%20%20%20%20%20self.out_head%20%3D%20nn.Linear(%0A%20%20%20%20%20%20%20%20%20%20%20%20cfg%5B%22emb_dim%22%5D%2C%20cfg%5B%22vocab_size%22%5D%2C%20bias%3DFalse%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20def%20forward(self%2C%20in_idx)%3A%0A%20%20%20%20%20%20%20%20batch_size%2C%20seq_len%20%3D%20in_idx.shape%0A%0A%20%20%20%20%20%20%20%20%23%20tokenize%0A%20%20%20%20%20%20%20%20tok_embeds%20%3D%20self.tok_emb(in_idx)%0A%0A%20%20%20%20%20%20%20%20%23%20positional%20embedding%0A%20%20%20%20%20%20%20%20pos_embeds%20%3D%20self.pos_emb(torch.arange(seq_len%2C%20device%3Din_idx.device))%0A%20%20%20%20%20%20%20%20x%20%3D%20tok_embeds%20%2B%20pos_embeds%0A%0A%20%20%20%20%20%20%20%20x%20%3D%20self.drop_emb(x)%20%20%20%20%23%20(batch_size%2C%20seq_len%2C%20emb_dim)%0A%20%20%20%20%20%20%20%20x%20%3D%20self.trf_blocks(x)%20%20%23%20(batch_size%2C%20seq_len%2C%20emb_dim)%0A%20%20%20%20%20%20%20%20x%20%3D%20self.final_norm(x)%20%20%23%20(batch_size%2C%20seq_len%2C%20emb_dim)%0A%20%20%20%20%20%20%20%20logits%20%3D%20self.out_head(x)%20%20%20%23%20(batch_size%2C%20seq_len%2C%20vocab_size)%0A%20%20%20%20%20%20%20%20return%20logits%0A%0A%0A%40app.class_definition%0Aclass%20DummyTransformerBlock(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20cfg)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20%23%20A%20simple%20placeholder%0A%0A%20%20%20%20def%20forward(self%2C%20x)%3A%0A%20%20%20%20%20%20%20%20%23%20This%20block%20does%20nothing%20and%20just%20returns%20its%20input.%0A%20%20%20%20%20%20%20%20return%20x%0A%0A%0A%40app.class_definition%0Aclass%20DummyLayerNorm(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20normalized_shape%2C%20eps%3D1e-5)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20%23%20The%20parameters%20here%20are%20just%20to%20mimic%20the%20LayerNorm%20interface.%0A%0A%20%20%20%20def%20forward(self%2C%20x)%3A%0A%20%20%20%20%20%20%20%20%23%20This%20layer%20does%20nothing%20and%20just%20returns%20its%20input.%0A%20%20%20%20%20%20%20%20return%20x%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Use%20GPT2%20embedding%20and%20create%20a%20batch%20contains%20two%20phrases%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20tokenizer%20%3D%20tiktoken.get_encoding(%22gpt2%22)%0A%0A%20%20%20%20batch%20%3D%20%5B%5D%0A%0A%20%20%20%20txt1%20%3D%20%22Every%20effort%20moves%20you%22%0A%20%20%20%20txt2%20%3D%20%22Every%20day%20holds%20a%22%0A%0A%20%20%20%20batch.append(torch.tensor(tokenizer.encode(txt1)))%0A%20%20%20%20batch.append(torch.tensor(tokenizer.encode(txt2)))%0A%20%20%20%20batch%20%3D%20torch.stack(batch%2C%20dim%3D0)%0A%20%20%20%20print(batch)%0A%20%20%20%20return%20batch%2C%20tokenizer%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Randomly%20initialize%20the%20GPT2%3A1.24b%20model%20and%20inference%20logits%20with%20untrained%20weights.%0A%0A%20%20%20%20The%20output%20is%20called%20as%20logits%2C%20and%20the%20shape%20is%20%60(batch_size%2Cseq_len%2Cvocab_size)%60.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(GPT_CONFIG_124M%2C%20batch)%3A%0A%20%20%20%20torch.manual_seed(123)%0A%20%20%20%20_model%20%3D%20DummyGPTModel(GPT_CONFIG_124M)%0A%0A%20%20%20%20_logits%20%3D%20_model(batch)%0A%20%20%20%20print(%22Output%20shape%3A%22%2C%20_logits.shape)%0A%20%20%20%20print(_logits)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%204.2%20Normalizing%20activations%20with%20layer%20normalization%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20To%20get%20efficient%20gradients%20for%20training%2C%20introduce%20layer%20normalization.%0A%0A%20%20%20%20Let's%20use%20the%20following%20usual%20linear%2BReLU%20layer%20to%20check%20the%20effect%20of%20layer%20normalization.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20torch.manual_seed(123)%0A%0A%20%20%20%20%23%20create%202%20training%20examples%20with%205%20dimensions%20(features)%20each%0A%20%20%20%20batch_example%20%3D%20torch.randn(2%2C%205)%20%0A%0A%20%20%20%20layer%20%3D%20nn.Sequential(nn.Linear(5%2C%206)%2C%20nn.ReLU())%0A%20%20%20%20out%20%3D%20layer(batch_example)%0A%20%20%20%20print(f%22%7Bbatch_example%3D%7D%22)%0A%20%20%20%20print(f%22%7Bout%3D%7D%22)%0A%20%20%20%20return%20batch_example%2C%20out%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Check%20statistics%20before%20normalization.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(out)%3A%0A%20%20%20%20mean%20%3D%20out.mean(dim%3D-1%2C%20keepdim%3DTrue)%0A%20%20%20%20var%20%3D%20out.var(dim%3D-1%2C%20keepdim%3DTrue)%0A%0A%20%20%20%20print(%22Mean%3A%5Cn%22%2C%20mean)%0A%20%20%20%20print(%22Variance%3A%5Cn%22%2C%20var)%0A%20%20%20%20return%20mean%2C%20var%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Apply%20normalization%20to%20that.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mean%2C%20out%2C%20var)%3A%0A%20%20%20%20out_norm%20%3D%20(out%20-%20mean)%20%2F%20torch.sqrt(var)%0A%20%20%20%20print(%22Normalized%20layer%20outputs%3A%5Cn%22%2C%20out_norm)%0A%0A%20%20%20%20mean_norm%20%3D%20out_norm.mean(dim%3D-1%2C%20keepdim%3DTrue)%0A%20%20%20%20var_norm%20%3D%20out_norm.var(dim%3D-1%2C%20keepdim%3DTrue)%0A%20%20%20%20print(%22Mean%3A%5Cn%22%2C%20mean_norm)%0A%20%20%20%20print(%22Variance%3A%5Cn%22%2C%20var_norm)%0A%20%20%20%20return%20mean_norm%2C%20var_norm%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Disable%20PyTorch%20scientific%20notation%20for%20readability%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(mean_norm%2C%20var_norm)%3A%0A%20%20%20%20torch.set_printoptions(sci_mode%3DFalse)%0A%20%20%20%20print(%22Mean%3A%22%2C%20mean_norm)%0A%20%20%20%20print(%22Variance%3A%22%2C%20var_norm)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20To%20replace%20the%20layer%20norm%20placeholder%2C%20implement%20%60LayerNorm%60%20class%20based%20on%20the%20above%20observations.%0A%0A%20%20%20%20The%20normalization%20is%20applied%20to%20the%20last%20dimension%20%60emb_dim%60%2C%20and%20scaling%20and%20shift%20parameters%20are%20introduced%20to%20learn%20suitable%20distribution%20for%20good%20gradients.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.class_definition%0Aclass%20LayerNorm(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20emb_dim)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20self.eps%20%3D%201e-5%0A%20%20%20%20%20%20%20%20self.scale%20%3D%20nn.Parameter(torch.ones(emb_dim))%0A%20%20%20%20%20%20%20%20self.shift%20%3D%20nn.Parameter(torch.zeros(emb_dim))%0A%0A%20%20%20%20def%20forward(self%2C%20x)%3A%0A%20%20%20%20%20%20%20%20mean%20%3D%20x.mean(dim%3D-1%2C%20keepdim%3DTrue)%0A%20%20%20%20%20%20%20%20%23%20difference%20of%20n%20and%20(n-1)%20is%20negligible%20for%20large%20n%20(unbaiased%3DFalse)%0A%20%20%20%20%20%20%20%20%23%20this%20configuration%20is%20compatible%20with%20original%20GPT2%20model%0A%20%20%20%20%20%20%20%20var%20%3D%20x.var(dim%3D-1%2C%20keepdim%3DTrue%2C%20unbiased%3DFalse)%0A%20%20%20%20%20%20%20%20norm_x%20%3D%20(x%20-%20mean)%20%2F%20torch.sqrt(var%20%2B%20self.eps)%0A%20%20%20%20%20%20%20%20return%20self.scale%20*%20norm_x%20%2B%20self.shift%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Try%20it.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(batch_example)%3A%0A%20%20%20%20ln%20%3D%20LayerNorm(emb_dim%3D5)%0A%20%20%20%20out_ln%20%3D%20ln(batch_example)%0A%0A%20%20%20%20mean_ln%20%3D%20out_ln.mean(dim%3D-1%2C%20keepdim%3DTrue)%0A%20%20%20%20var_ln%20%3D%20out_ln.var(dim%3D-1%2C%20unbiased%3DFalse%2C%20keepdim%3DTrue)%0A%0A%20%20%20%20print(%22Mean%3A%5Cn%22%2C%20mean_ln)%0A%20%20%20%20print(%22Variance%3A%5Cn%22%2C%20var_ln)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%204.3%20Implementing%20a%20feed%20forward%20network%20with%20GELU%20activations%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20We%20use%20GELU%20(Gaussian%20Error%20Linear%20Unit)%20as%20an%20activation%20function.%0A%20%20%20%20This%20is%20defined%20as%0A%20%20%20%20%24%24%0A%20%20%20%20%5Cmathrm%7BGELU%7D(x)%5Cequiv%20x%5Ccdot%5CPhi(x)%2C%5Cquad%0A%20%20%20%20%5CPhi(x)%5Cequiv%5Cint%20dx%5C%2C%5Cmathcal%7BN%7D(x%3B%5Cmu%3D0%2C%5Csigma%3D1)%0A%20%20%20%20%24%24%0A%0A%20%20%20%20Because%20of%20efficiency%2C%20we%20use%20the%20following%20approximation%20obtained%20by%20curve%20fitting.%0A%20%20%20%20%24%24%0A%20%20%20%20%5Ctext%7BGELU%7D(x)%20%5Capprox%200.5%20%5Ccdot%20x%20%5Ccdot%20%5Cleft(1%20%2B%20%5Ctanh%5Cleft%5B%5Csqrt%7B%5Cfrac%7B2%7D%7B%5Cpi%7D%7D%20%5Ccdot%20%5Cleft(x%20%2B%200.044715%20%5Ccdot%20x%5E3%5Cright)%5Cright%5D%5Cright)%0A%20%20%20%20%24%24%0A%0A%20%20%20%20This%20GELU%20is%20known%20as%20smoother%20gradient%20and%20better%20activation%20function%20than%20ReLU%20for%20learning.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.class_definition%0Aclass%20GELU(nn.Module)%3A%0A%20%20%20%20def%20__init__(self)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%0A%20%20%20%20def%20forward(self%2C%20x)%3A%0A%20%20%20%20%20%20%20%20return%200.5%20*%20x%20*%20(1%20%2B%20torch.tanh(%0A%20%20%20%20%20%20%20%20%20%20%20%20torch.sqrt(torch.tensor(2.0%20%2F%20torch.pi))%20*%20%0A%20%20%20%20%20%20%20%20%20%20%20%20(x%20%2B%200.044715%20*%20torch.pow(x%2C%203))%0A%20%20%20%20%20%20%20%20))%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Compare%20GELU%20with%20ReLU.%0A%20%20%20%20GERU%20has%20non-zero%20gradient%20except%20for%20certain%20point%2C%20so%20it%20avoids%20singular%20optimizations.%0A%20%20%20%20GELU%20also%20accepts%20minus%20%24x%24%20and%20such%20values%20also%20contribute%20the%20training.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20gelu%2C%20relu%20%3D%20GELU()%2C%20nn.ReLU()%0A%0A%20%20%20%20%23%20Some%20sample%20data%0A%20%20%20%20x%20%3D%20torch.linspace(-3%2C%203%2C%20100)%0A%20%20%20%20y_gelu%2C%20y_relu%20%3D%20gelu(x)%2C%20relu(x)%0A%0A%20%20%20%20plt.figure(figsize%3D(8%2C%203))%0A%20%20%20%20for%20i%2C%20(y%2C%20label)%20in%20enumerate(zip(%5By_gelu%2C%20y_relu%5D%2C%20%5B%22GELU%22%2C%20%22ReLU%22%5D)%2C%201)%3A%0A%20%20%20%20%20%20%20%20plt.subplot(1%2C%202%2C%20i)%0A%20%20%20%20%20%20%20%20plt.plot(x%2C%20y)%0A%20%20%20%20%20%20%20%20plt.title(f%22%7Blabel%7D%20activation%20function%22)%0A%20%20%20%20%20%20%20%20plt.xlabel(%22x%22)%0A%20%20%20%20%20%20%20%20plt.ylabel(f%22%7Blabel%7D(x)%22)%0A%20%20%20%20%20%20%20%20plt.grid(True)%0A%0A%20%20%20%20plt.tight_layout()%0A%20%20%20%20plt.show()%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20This%20is%20the%20class%20for%20FeedForward%20network%20using%20GELU.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.class_definition%0Aclass%20FeedForward(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20cfg)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20self.layers%20%3D%20nn.Sequential(%0A%20%20%20%20%20%20%20%20%20%20%20%20nn.Linear(cfg%5B%22emb_dim%22%5D%2C%204%20*%20cfg%5B%22emb_dim%22%5D)%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20GELU()%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20nn.Linear(4%20*%20cfg%5B%22emb_dim%22%5D%2C%20cfg%5B%22emb_dim%22%5D)%2C%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20def%20forward(self%2C%20x)%3A%0A%20%20%20%20%20%20%20%20return%20self.layers(x)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Remind%20embedding%20dimension%20of%20our%20GPT2.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(GPT_CONFIG_124M)%3A%0A%20%20%20%20print(GPT_CONFIG_124M%5B%22emb_dim%22%5D)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20This%20shows%20the%20output%20shape%20of%20the%20%60FeedForward%60%20network.%0A%20%20%20%20The%20first%20layer%20increses%20the%20dimension%20in%204%20times%2C%20then%2C%20the%20last%20layer%20decreases%20the%20dimension%20to%20the%20same%20with%20the%20input.%0A%0A%20%20%20%20This%20makes%20GPT2%20model%20deeper%20without%20considering%20input%2Foutput%20dimesion%20compatibilities.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(GPT_CONFIG_124M)%3A%0A%20%20%20%20ffn%20%3D%20FeedForward(GPT_CONFIG_124M)%0A%0A%20%20%20%20%23%20input%20shape%3A%20%5Bbatch_size%2C%20num_token%2C%20emb_size%5D%0A%20%20%20%20_x%20%3D%20torch.rand(2%2C%203%2C%20768)%20%0A%20%20%20%20print(ffn(_x).shape)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%204.4%20Adding%20shortcut%20connections%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20This%20is%20an%20example%20of%20skip%20connection%20like%20ResNet.%0A%20%20%20%20To%20avoid%20vanishing%20gradient%20problem%20of%20deep%20networks%2C%20this%20network%20predicts%20residuals%20in%20each%20steps.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.class_definition%0Aclass%20ExampleDeepNeuralNetwork(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20layer_sizes%2C%20use_shortcut)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20self.use_shortcut%20%3D%20use_shortcut%0A%20%20%20%20%20%20%20%20self.layers%20%3D%20nn.ModuleList(%5B%0A%20%20%20%20%20%20%20%20%20%20%20%20nn.Sequential(nn.Linear(layer_sizes%5B0%5D%2C%20layer_sizes%5B1%5D)%2C%20GELU())%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20nn.Sequential(nn.Linear(layer_sizes%5B1%5D%2C%20layer_sizes%5B2%5D)%2C%20GELU())%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20nn.Sequential(nn.Linear(layer_sizes%5B2%5D%2C%20layer_sizes%5B3%5D)%2C%20GELU())%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20nn.Sequential(nn.Linear(layer_sizes%5B3%5D%2C%20layer_sizes%5B4%5D)%2C%20GELU())%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20nn.Sequential(nn.Linear(layer_sizes%5B4%5D%2C%20layer_sizes%5B5%5D)%2C%20GELU())%0A%20%20%20%20%20%20%20%20%5D)%0A%0A%20%20%20%20def%20forward(self%2C%20x)%3A%0A%20%20%20%20%20%20%20%20for%20layer%20in%20self.layers%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Compute%20the%20output%20of%20the%20current%20layer%0A%20%20%20%20%20%20%20%20%20%20%20%20layer_output%20%3D%20layer(x)%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Check%20if%20shortcut%20can%20be%20applied%0A%20%20%20%20%20%20%20%20%20%20%20%20if%20self.use_shortcut%20and%20x.shape%20%3D%3D%20layer_output.shape%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20x%20%3D%20x%20%2B%20layer_output%0A%20%20%20%20%20%20%20%20%20%20%20%20else%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20x%20%3D%20layer_output%0A%20%20%20%20%20%20%20%20return%20x%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20To%20see%20the%20vanishing%20gradient%20problem%2C%20define%20the%20following%20function%20to%20see%20gradients.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.function%0Adef%20print_gradients(model%2C%20x)%3A%0A%20%20%20%20%23%20Forward%20pass%0A%20%20%20%20output%20%3D%20model(x)%0A%20%20%20%20target%20%3D%20torch.tensor(%5B%5B0.%5D%5D)%0A%0A%20%20%20%20%23%20Calculate%20loss%20based%20on%20how%20close%20the%20target%20and%20output%20are%0A%20%20%20%20loss%20%3D%20nn.MSELoss()%0A%20%20%20%20%23%20just%20taking%20MSE%20of%20the%20output%20values%0A%20%20%20%20loss%20%3D%20loss(output%2C%20target)%0A%0A%20%20%20%20%23%20Backward%20pass%20to%20calculate%20the%20gradients%0A%20%20%20%20loss.backward()%0A%0A%20%20%20%20for%20name%2C%20param%20in%20model.named_parameters()%3A%0A%20%20%20%20%20%20%20%20if%20'weight'%20in%20name%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20%23%20Print%20the%20mean%20absolute%20gradient%20of%20the%20weights%0A%20%20%20%20%20%20%20%20%20%20%20%20print(f%22%7Bname%7D%20has%20gradient%20mean%20of%20%7Bparam.grad.abs().mean().item()%7D%22)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20You%20can%20see%20vanishing%20gradients%20from%20the%20last%20layer%204%20to%20the%20initial%20layer%200.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20layer_sizes%20%3D%20%5B3%2C%203%2C%203%2C%203%2C%203%2C%201%5D%20%20%0A%0A%20%20%20%20sample_input%20%3D%20torch.tensor(%5B%5B1.%2C%200.%2C%20-1.%5D%5D)%0A%0A%20%20%20%20torch.manual_seed(123)%0A%20%20%20%20model_without_shortcut%20%3D%20ExampleDeepNeuralNetwork(%0A%20%20%20%20%20%20%20%20layer_sizes%2C%20use_shortcut%3DFalse%0A%20%20%20%20)%0A%20%20%20%20print_gradients(model_without_shortcut%2C%20sample_input)%0A%20%20%20%20return%20layer_sizes%2C%20sample_input%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Skip%20connection%20relaxes%20the%20problem%20like%20this.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(layer_sizes%2C%20sample_input)%3A%0A%20%20%20%20torch.manual_seed(123)%0A%20%20%20%20model_with_shortcut%20%3D%20ExampleDeepNeuralNetwork(%0A%20%20%20%20%20%20%20%20layer_sizes%2C%20use_shortcut%3DTrue%0A%20%20%20%20)%0A%20%20%20%20print_gradients(model_with_shortcut%2C%20sample_input)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%204.5%20Connecting%20attention%20and%20linear%20layers%20in%20a%20transformer%20block%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Implement%20%60TransformerBlock%60%20by%20including%20the%20above%20ideas%20of%20%60MultiHeadAttention%60%2C%20%60FeedForward%60%20and%20%60LayerNorm%60.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.class_definition%0Aclass%20TransformerBlock(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20cfg)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20self.att%20%3D%20MultiHeadAttention(%0A%20%20%20%20%20%20%20%20%20%20%20%20d_in%3Dcfg%5B%22emb_dim%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20d_out%3Dcfg%5B%22emb_dim%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20context_length%3Dcfg%5B%22context_length%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20num_heads%3Dcfg%5B%22n_heads%22%5D%2C%20%0A%20%20%20%20%20%20%20%20%20%20%20%20dropout%3Dcfg%5B%22drop_rate%22%5D%2C%0A%20%20%20%20%20%20%20%20%20%20%20%20qkv_bias%3Dcfg%5B%22qkv_bias%22%5D)%0A%20%20%20%20%20%20%20%20self.ff%20%3D%20FeedForward(cfg)%0A%20%20%20%20%20%20%20%20self.norm1%20%3D%20LayerNorm(cfg%5B%22emb_dim%22%5D)%0A%20%20%20%20%20%20%20%20self.norm2%20%3D%20LayerNorm(cfg%5B%22emb_dim%22%5D)%0A%20%20%20%20%20%20%20%20self.drop_shortcut%20%3D%20nn.Dropout(cfg%5B%22drop_rate%22%5D)%0A%0A%20%20%20%20def%20forward(self%2C%20x)%3A%0A%20%20%20%20%20%20%20%20%23%20Shortcut%20connection%20for%20attention%20block%0A%20%20%20%20%20%20%20%20shortcut%20%3D%20x%0A%20%20%20%20%20%20%20%20x%20%3D%20self.norm1(x)%0A%20%20%20%20%20%20%20%20x%20%3D%20self.att(x)%20%20%23%20Shape%20%5Bbatch_size%2C%20num_tokens%2C%20emb_size%5D%0A%20%20%20%20%20%20%20%20x%20%3D%20self.drop_shortcut(x)%0A%20%20%20%20%20%20%20%20x%20%3D%20x%20%2B%20shortcut%20%20%23%20Add%20the%20original%20input%20back%0A%0A%20%20%20%20%20%20%20%20%23%20Shortcut%20connection%20for%20feed%20forward%20block%0A%20%20%20%20%20%20%20%20shortcut%20%3D%20x%0A%20%20%20%20%20%20%20%20x%20%3D%20self.norm2(x)%0A%20%20%20%20%20%20%20%20x%20%3D%20self.ff(x)%0A%20%20%20%20%20%20%20%20x%20%3D%20self.drop_shortcut(x)%0A%20%20%20%20%20%20%20%20x%20%3D%20x%20%2B%20shortcut%20%20%23%20Add%20the%20original%20input%20back%0A%0A%20%20%20%20%20%20%20%20return%20x%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20It%20keeps%20the%20shape%20of%20vectors%20between%20inputs%20and%20outputs.%20It%20is%20suitable%20to%20combine%20deep%20layers.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(GPT_CONFIG_124M)%3A%0A%20%20%20%20torch.manual_seed(123)%0A%0A%20%20%20%20_x%20%3D%20torch.rand(2%2C%204%2C%20768)%20%20%23%20Shape%3A%20%5Bbatch_size%2C%20num_tokens%2C%20emb_dim%5D%0A%20%20%20%20block%20%3D%20TransformerBlock(GPT_CONFIG_124M)%0A%20%20%20%20_output%20%3D%20block(_x)%0A%0A%20%20%20%20print(%22Input%20shape%3A%22%2C%20_x.shape)%0A%20%20%20%20print(%22Output%20shape%3A%22%2C%20_output.shape)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%204.6%20Coding%20the%20GPT%20model%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Replace%20the%20placeholders%20in%20Dummy%20class%20to%20create%20GPT2%20model.%20It%20repeats%20%60TransformerBlock%60%20in%20%60n_layers%60%20times.%20The%20output%20logits%20represents%20unnormalized%20probabilities%20for%20next%20tokens.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.class_definition%0Aclass%20GPTModel(nn.Module)%3A%0A%20%20%20%20def%20__init__(self%2C%20cfg)%3A%0A%20%20%20%20%20%20%20%20super().__init__()%0A%20%20%20%20%20%20%20%20self.tok_emb%20%3D%20nn.Embedding(cfg%5B%22vocab_size%22%5D%2C%20cfg%5B%22emb_dim%22%5D)%0A%20%20%20%20%20%20%20%20self.pos_emb%20%3D%20nn.Embedding(cfg%5B%22context_length%22%5D%2C%20cfg%5B%22emb_dim%22%5D)%0A%0A%20%20%20%20%20%20%20%20self.drop_emb%20%3D%20nn.Dropout(cfg%5B%22drop_rate%22%5D)%0A%0A%20%20%20%20%20%20%20%20self.trf_blocks%20%3D%20nn.Sequential(%0A%20%20%20%20%20%20%20%20%20%20%20%20*%5BTransformerBlock(cfg)%20for%20_%20in%20range(cfg%5B%22n_layers%22%5D)%5D)%0A%0A%20%20%20%20%20%20%20%20self.final_norm%20%3D%20LayerNorm(cfg%5B%22emb_dim%22%5D)%0A%20%20%20%20%20%20%20%20%23%20the%20same%20size%20with%20input%20token%20embedding%0A%20%20%20%20%20%20%20%20self.out_head%20%3D%20nn.Linear(%0A%20%20%20%20%20%20%20%20%20%20%20%20cfg%5B%22emb_dim%22%5D%2C%20cfg%5B%22vocab_size%22%5D%2C%20bias%3DFalse%0A%20%20%20%20%20%20%20%20)%0A%0A%20%20%20%20def%20forward(self%2C%20in_idx)%3A%0A%20%20%20%20%20%20%20%20batch_size%2C%20seq_len%20%3D%20in_idx.shape%0A%0A%20%20%20%20%20%20%20%20%23%20tokenize%20and%20positional%20embedding%0A%20%20%20%20%20%20%20%20tok_embeds%20%3D%20self.tok_emb(in_idx)%0A%20%20%20%20%20%20%20%20pos_embeds%20%3D%20self.pos_emb(torch.arange(seq_len%2C%20device%3Din_idx.device))%0A%20%20%20%20%20%20%20%20x%20%3D%20tok_embeds%20%2B%20pos_embeds%20%20%23%20Shape%20%5Bbatch_size%2C%20num_tokens%2C%20emb_size%5D%0A%0A%20%20%20%20%20%20%20%20x%20%3D%20self.drop_emb(x)%0A%20%20%20%20%20%20%20%20x%20%3D%20self.trf_blocks(x)%0A%20%20%20%20%20%20%20%20x%20%3D%20self.final_norm(x)%0A%20%20%20%20%20%20%20%20logits%20%3D%20self.out_head(x)%0A%20%20%20%20%20%20%20%20return%20logits%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Try%20the%20forward%20path.%20The%20output%20shape%20is%20(B%2Cnum_tokens%2Cd_out).%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(GPT_CONFIG_124M%2C%20batch)%3A%0A%20%20%20%20torch.manual_seed(123)%0A%20%20%20%20model%20%3D%20GPTModel(GPT_CONFIG_124M)%0A%0A%20%20%20%20_out%20%3D%20model(batch)%0A%20%20%20%20print(%22Input%20batch%3A%5Cn%22%2C%20batch)%0A%20%20%20%20print(%22%5CnOutput%20shape%3A%22%2C%20_out.shape)%0A%20%20%20%20print(_out)%0A%20%20%20%20return%20(model%2C)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20We%20can%20get%20total%20number%20of%20parameters%20by%20%60numel()%60%20method.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(model)%3A%0A%20%20%20%20total_params%20%3D%20sum(p.numel()%20for%20p%20in%20model.parameters())%0A%20%20%20%20print(f%22Total%20number%20of%20parameters%3A%20%7Btotal_params%3A%2C%7D%22)%0A%20%20%20%20return%20(total_params%2C)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20We%20can%20share%20token%20embedding%20weights%20and%20output%20layer%20weights%20if%20you%20hope.%20These%20has%20the%20same%20shape.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(model)%3A%0A%20%20%20%20print(%22Token%20embedding%20layer%20shape%3A%22%2C%20model.tok_emb.weight.shape)%0A%20%20%20%20print(%22Output%20layer%20shape%3A%22%2C%20model.out_head.weight.shape)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20By%20considering%20such%20weight%20sharing%2C%20this%20model%20parameters%20reduces%20to%201.24b%20parameters.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(model%2C%20total_params)%3A%0A%20%20%20%20total_params_gpt2%20%3D%20%20total_params%20-%20sum(p.numel()%20for%20p%20in%20model.out_head.parameters())%0A%20%20%20%20print(f%22Number%20of%20trainable%20parameters%20considering%20weight%20tying%3A%20%7Btotal_params_gpt2%3A%2C%7D%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20We%20can%20estimate%20memory%20size%20to%20load%20this%20model%20by%20this.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(total_params)%3A%0A%20%20%20%20%23%20Calculate%20the%20total%20size%20in%20bytes%20(assuming%20float32%2C%204%20bytes%20per%20parameter)%0A%20%20%20%20total_size_bytes%20%3D%20total_params%20*%204%0A%0A%20%20%20%20%23%20Convert%20to%20megabytes%0A%20%20%20%20total_size_mb%20%3D%20total_size_bytes%20%2F%20(1024%20*%201024)%0A%0A%20%20%20%20print(f%22Total%20size%20of%20the%20model%3A%20%7Btotal_size_mb%3A.2f%7D%20MB%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20%23%23%204.7%20Generating%20text%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20This%20is%20a%20function%20to%20generate%20texts%20(decoding)%20from%20GPT2%20outputs.%20The%20step%20applies%20softmax%20can%20be%20omitted%20because%20it%20is%20monotonic.%0A%0A%20%20%20%20It%20is%20called%20as%20the%20greedy%20decoding%20to%20select%20tokens%20whose%20probability%20is%20the%20maximum%20on%20the%20candidates.%20There%20are%20several%20strategies%20to%20select%20next%20tokens%20according%20to%20tasks.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.function%0Adef%20generate_text_simple(model%2C%20idx%2C%20max_new_tokens%2C%20context_size)%3A%0A%20%20%20%20%23%20idx%20is%20(batch%2C%20n_tokens)%20array%20of%20indices%20in%20the%20current%20context%0A%20%20%20%20for%20_%20in%20range(max_new_tokens)%3A%0A%0A%20%20%20%20%20%20%20%20%23%20Crop%20current%20context%20if%20it%20exceeds%20the%20supported%20context%20size%0A%20%20%20%20%20%20%20%20%23%20E.g.%2C%20if%20LLM%20supports%20only%205%20tokens%2C%20and%20the%20context%20size%20is%2010%0A%20%20%20%20%20%20%20%20%23%20then%20only%20the%20last%205%20tokens%20are%20used%20as%20context%0A%20%20%20%20%20%20%20%20idx_cond%20%3D%20idx%5B%3A%2C%20-context_size%3A%5D%0A%0A%20%20%20%20%20%20%20%20%23%20Get%20the%20predictions%0A%20%20%20%20%20%20%20%20with%20torch.no_grad()%3A%0A%20%20%20%20%20%20%20%20%20%20%20%20logits%20%3D%20model(idx_cond)%0A%0A%20%20%20%20%20%20%20%20%23%20Focus%20only%20on%20the%20last%20time%20step%0A%20%20%20%20%20%20%20%20%23%20(batch%2C%20n_tokens%2C%20vocab_size)%20becomes%20(batch%2C%20vocab_size)%0A%20%20%20%20%20%20%20%20logits%20%3D%20logits%5B%3A%2C%20-1%2C%20%3A%5D%20%20%0A%0A%20%20%20%20%20%20%20%20%23%20Apply%20softmax%20to%20get%20probabilities%0A%20%20%20%20%20%20%20%20probas%20%3D%20torch.softmax(logits%2C%20dim%3D-1)%20%20%23%20(batch%2C%20vocab_size)%0A%0A%20%20%20%20%20%20%20%20%23%20Get%20the%20idx%20of%20the%20vocab%20entry%20with%20the%20highest%20probability%20value%0A%20%20%20%20%20%20%20%20idx_next%20%3D%20torch.argmax(probas%2C%20dim%3D-1%2C%20keepdim%3DTrue)%20%20%23%20(batch%2C%201)%0A%0A%20%20%20%20%20%20%20%20%23%20Append%20sampled%20index%20to%20the%20running%20sequence%0A%20%20%20%20%20%20%20%20idx%20%3D%20torch.cat((idx%2C%20idx_next)%2C%20dim%3D1)%20%20%23%20(batch%2C%20n_tokens%2B1)%0A%0A%20%20%20%20return%20idx%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Encode%20texts%20by%20using%20tokenizer%20to%20prepare%20the%20input.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(tokenizer)%3A%0A%20%20%20%20start_context%20%3D%20%22Hello%2C%20I%20am%22%0A%0A%20%20%20%20encoded%20%3D%20tokenizer.encode(start_context)%0A%20%20%20%20print(%22encoded%3A%22%2C%20encoded)%0A%0A%20%20%20%20encoded_tensor%20%3D%20torch.tensor(encoded).unsqueeze(0)%0A%20%20%20%20print(%22encoded_tensor.shape%3A%22%2C%20encoded_tensor.shape)%0A%20%20%20%20return%20(encoded_tensor%2C)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Input%20it%20to%20the%20function%20to%20generate%20text.%20The%20output%20is%20token%20IDs.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(GPT_CONFIG_124M%2C%20encoded_tensor%2C%20model)%3A%0A%20%20%20%20model.eval()%20%23%20disable%20dropout%0A%0A%20%20%20%20out_gen%20%3D%20generate_text_simple(%0A%20%20%20%20%20%20%20%20model%3Dmodel%2C%0A%20%20%20%20%20%20%20%20idx%3Dencoded_tensor%2C%20%0A%20%20%20%20%20%20%20%20max_new_tokens%3D6%2C%20%0A%20%20%20%20%20%20%20%20context_size%3DGPT_CONFIG_124M%5B%22context_length%22%5D%0A%20%20%20%20)%0A%0A%20%20%20%20print(%22Output%3A%22%2C%20out_gen)%0A%20%20%20%20print(%22Output%20length%3A%22%2C%20len(out_gen%5B0%5D))%0A%20%20%20%20return%20(out_gen%2C)%0A%0A%0A%40app.cell%0Adef%20_()%3A%0A%20%20%20%20mo.md(r%22%22%22%0A%20%20%20%20Decode%20the%20output%20token%20IDs%20by%20using%20tokenizer.%20The%20output%20is%20nonsense%20because%20we%20did%20not%20train%20it%20yet.%0A%20%20%20%20%22%22%22)%0A%20%20%20%20return%0A%0A%0A%40app.cell%0Adef%20_(out_gen%2C%20tokenizer)%3A%0A%20%20%20%20decoded_text%20%3D%20tokenizer.decode(out_gen.squeeze(0).tolist())%0A%20%20%20%20print(decoded_text)%0A%20%20%20%20return%0A%0A%0Aif%20__name__%20%3D%3D%20%22__main__%22%3A%0A%20%20%20%20app.run()%0A