diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb
index 742c962..27eda6b 100644
--- a/ch04/01_main-chapter-code/ch04.ipynb
+++ b/ch04/01_main-chapter-code/ch04.ipynb
@@ -29,7 +29,7 @@
    "id": "53fe99ab-0bcf-4778-a6b5-6db81fb826ef",
    "metadata": {},
    "source": [
-    "## 4.1 Coding the LLM backbone"
+    "## 4.1 Coding an LLM architecture"
    ]
   },
   {
@@ -479,7 +479,7 @@
     "- Note that we also add a smaller value (`eps`) before computing the square root of the variance; this is to avoid division-by-zero errors if the variance is 0\n",
     "\n",
     "**Biased variance**\n",
-    "- In the variance calculation above, setting `unbiased=False` means using the formula $\\frac{\\sum (x_i - \\bar{x})^2}{n}$ to compute the variance where n is the sample size (here, the number of features or columns); this formula does not include Bessel's correction (which uses `n-1` in the denominator), thus providing a biased estimate of the variance \n",
+    "- In the variance calculation above, setting `unbiased=False` means using the formula $\\frac{\\sum_i (x_i - \\bar{x})^2}{n}$ to compute the variance where n is the sample size (here, the number of features or columns); this formula does not include Bessel's correction (which uses `n-1` in the denominator), thus providing a biased estimate of the variance \n",
     "- For LLMs, where the embedding dimension `n` is very large, the difference between using n and `n-1`\n",
     "  is negligible\n",
     "- However, GPT-2 was trained with a biased variance in the normalization layers, which is why we also adopted this setting for compatibility reasons with the pretrained weights that we will load in later chapters\n",
@@ -558,7 +558,7 @@
    "id": "7d482ce7-e493-4bfc-a820-3ea99f564ebc",
    "metadata": {},
    "source": [
-    "- GELU ([Hendrycks and Gimpel 2016](https://arxiv.org/abs/1606.08415)) can be implemented in several ways; the exact version is defined as GELU(x)=x⋅Φ(x), where Φ(x) is the cumulative distribution function of the standard Gaussian distribution\n",
+    "- GELU ([Hendrycks and Gimpel 2016](https://arxiv.org/abs/1606.08415)) can be implemented in several ways; the exact version is defined as GELU(x)=x⋅Φ(x), where Φ(x) is the cumulative distribution function of the standard Gaussian distribution.\n",
     "- In practice, it's common to implement a computationally cheaper approximation: $\\text{GELU}(x) \\approx 0.5 \\cdot x \\cdot \\left(1 + \\tanh\\left[\\sqrt{\\frac{2}{\\pi}} \\cdot \\left(x + 0.044715 \\cdot x^3\\right)\\right]\\right)\n",
     "$ (the original GPT-2 model was also trained with this approximation)"
    ]
@@ -680,7 +680,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 30,
    "id": "928e7f7c-d0b1-499f-8d07-4cadb428a6f9",
    "metadata": {},
    "outputs": [
@@ -688,14 +688,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "torch.Size([2, 768])\n"
+      "torch.Size([2, 3, 768])\n"
      ]
     }
    ],
    "source": [
     "ffn = FeedForward(GPT_CONFIG_124M)\n",
     "\n",
-    "x = torch.rand(2, 768) # input with batch dimension 2\n",
+    "# input shape: [batch_size, num_tokens, emb_size]\n",
+    "x = torch.rand(2, 3, 768)\n",
     "out = ffn(x)\n",
     "print(out.shape)"
    ]
@@ -832,7 +833,7 @@
     "        # Shortcut connection for attention block\n",
     "        shortcut = x\n",
     "        x = self.norm1(x)\n",
-    "        x = self.att(x)\n",
+    "        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]\n",
     "        x = self.drop_resid(x)\n",
     "        x = x + shortcut  # Add the original input back\n",
     "\n",
@@ -957,7 +958,7 @@
     "        batch_size, seq_len = in_idx.shape\n",
     "        tok_embeds = self.tok_emb(in_idx)\n",
     "        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
-    "        x = tok_embeds + pos_embeds\n",
+    "        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]\n",
     "        x = self.trf_blocks(x)\n",
     "        x = self.final_norm(x)\n",
     "        logits = self.out_head(x)\n",
@@ -1002,7 +1003,7 @@
    ],
    "source": [
     "torch.manual_seed(123)\n",
-    "model = GPT(GPT_CONFIG_124M)\n",
+    "model = GPTModel(GPT_CONFIG_124M)\n",
     "\n",
     "out = model(batch)\n",
     "print(\"Output shape:\", out.shape)\n",
diff --git a/ch04/01_main-chapter-code/figures/ffn.webp b/ch04/01_main-chapter-code/figures/ffn.webp
index 38ea715..d686cb3 100644
Binary files a/ch04/01_main-chapter-code/figures/ffn.webp and b/ch04/01_main-chapter-code/figures/ffn.webp differ
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index c08b118..a508786 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -168,7 +168,7 @@ class TransformerBlock(nn.Module):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
-        x = self.att(x)
+        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
         x = self.drop_resid(x)
         x = x + shortcut  # Add the original input back
 
@@ -200,7 +200,7 @@ class GPTModel(nn.Module):
         batch_size, seq_len = in_idx.shape
         tok_embeds = self.tok_emb(in_idx)
         pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
-        x = tok_embeds + pos_embeds
+        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
         x = self.trf_blocks(x)
         x = self.final_norm(x)
         logits = self.out_head(x)
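
A few self-contained sanity checks follow, tied to the cells this patch touches; they are illustrative sketches for review, not part of the patch, and assume only that PyTorch is installed. First, the biased-variance bullet: with an embedding dimension of `n = 768`, dividing by `n` instead of `n - 1` shifts the variance estimate by only about 0.13%.

```python
import torch

torch.manual_seed(123)
x = torch.randn(2, 768)  # two rows with 768 features each, as in the chapter

var_biased = x.var(dim=-1, unbiased=False)   # divides by n
var_unbiased = x.var(dim=-1, unbiased=True)  # divides by n - 1 (Bessel's correction)

# the ratio is exactly n / (n - 1) = 768 / 767, i.e., a ~0.13% difference
print(var_unbiased / var_biased)
```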
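The exact-versus-approximate GELU bullet can be checked the same way: the tanh approximation follows the exact GELU(x) = x⋅Φ(x) closely over a typical activation range, with a maximum absolute deviation well below 1e-2.

```python
import math
import torch

def gelu_exact(x):
    # exact GELU: x * Phi(x), where Phi is the standard Gaussian CDF
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    # the cheaper tanh approximation that GPT-2 was also trained with
    return 0.5 * x * (1.0 + torch.tanh(
        math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)
    ))

x = torch.linspace(-5.0, 5.0, 1000)
print((gelu_exact(x) - gelu_tanh(x)).abs().max())  # tiny approximation error
```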
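The switch from `torch.rand(2, 768)` to `torch.rand(2, 3, 768)` in the `FeedForward` cell requires no change to the module itself, because `nn.Linear` transforms only the last dimension and passes any leading dimensions through. A hypothetical stand-in with the same 768 → 4·768 → 768 layout as the chapter's `FeedForward` demonstrates this:

```python
import torch
import torch.nn as nn

# hypothetical stand-in mirroring the chapter's FeedForward layer layout;
# nn.Linear acts on the last dimension only, so the leading batch and
# token dimensions pass through unchanged
ffn = nn.Sequential(
    nn.Linear(768, 4 * 768),
    nn.GELU(approximate="tanh"),  # the tanh approximation discussed above
    nn.Linear(4 * 768, 768),
)

x = torch.rand(2, 3, 768)  # [batch_size, num_tokens, emb_size]
print(ffn(x).shape)                   # torch.Size([2, 3, 768])
print(ffn(torch.rand(2, 768)).shape)  # the old 2-D input also works: [2, 768]
```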
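Finally, the shape comment added after `x = tok_embeds + pos_embeds` relies on broadcasting: `pos_embeds` carries one row per position and is added to every sequence in the batch. A minimal sketch of just that step:

```python
import torch

tok_embeds = torch.rand(2, 3, 768)  # [batch_size, num_tokens, emb_size]
pos_embeds = torch.rand(3, 768)     # [num_tokens, emb_size], one row per position

x = tok_embeds + pos_embeds  # pos_embeds broadcasts across the batch dimension
print(x.shape)               # torch.Size([2, 3, 768])
```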