diff --git a/appendix-D/01_main-chapter-code/previous_chapters.py b/appendix-D/01_main-chapter-code/previous_chapters.py
index 4292f64..46030ef 100644
--- a/appendix-D/01_main-chapter-code/previous_chapters.py
+++ b/appendix-D/01_main-chapter-code/previous_chapters.py
@@ -170,21 +170,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x
diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb
index 4fc4e3e..ac1d5e0 100644
--- a/ch04/01_main-chapter-code/ch04.ipynb
+++ b/ch04/01_main-chapter-code/ch04.ipynb
@@ -950,21 +950,21 @@
     "        self.ff = FeedForward(cfg)\n",
     "        self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
     "        self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
-    "        self.drop_resid = nn.Dropout(cfg[\"drop_rate\"])\n",
+    "        self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n",
     "\n",
     "    def forward(self, x):\n",
     "        # Shortcut connection for attention block\n",
     "        shortcut = x\n",
     "        x = self.norm1(x)\n",
     "        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]\n",
-    "        x = self.drop_resid(x)\n",
+    "        x = self.drop_shortcut(x)\n",
     "        x = x + shortcut  # Add the original input back\n",
     "\n",
     "        # Shortcut connection for feed forward block\n",
     "        shortcut = x\n",
     "        x = self.norm2(x)\n",
     "        x = self.ff(x)\n",
-    "        x = self.drop_resid(x)\n",
+    "        x = self.drop_shortcut(x)\n",
     "        x = x + shortcut  # Add the original input back\n",
     "\n",
     "        return x"
@@ -1489,7 +1489,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.6"
   }
  },
 "nbformat": 4,
diff --git a/ch04/01_main-chapter-code/exercise-solutions.ipynb b/ch04/01_main-chapter-code/exercise-solutions.ipynb
index 6d49b9a..d2dfa4a 100644
--- a/ch04/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch04/01_main-chapter-code/exercise-solutions.ipynb
@@ -34,7 +34,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from gpt import Transfocontext_lengthmerBlock\n",
+    "from gpt import TransformerBlock\n",
     "\n",
     "GPT_CONFIG_124M = {\n",
     "    \"vocab_size\": 50257,\n",
@@ -264,9 +264,9 @@
     "    \"emb_dim\": 768,\n",
     "    \"n_heads\": 12,\n",
     "    \"n_layers\": 12,\n",
-    "    \"drop_rate_emb\": 0.1,      # NEW: dropout for embedding layers\n",
-    "    \"drop_rate_attn\": 0.1,     # NEW: dropout for multi-head attention \n",
-    "    \"drop_rate_resid\": 0.1,    # NEW: dropout for residual connections \n",
+    "    \"drop_rate_emb\": 0.1,       # NEW: dropout for embedding layers\n",
+    "    \"drop_rate_attn\": 0.1,      # NEW: dropout for multi-head attention \n",
+    "    \"drop_rate_shortcut\": 0.1,  # NEW: dropout for shortcut connections \n",
     "    \"qkv_bias\": False\n",
     "}"
    ]
@@ -295,21 +295,21 @@
     "        self.ff = FeedForward(cfg)\n",
     "        self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
     "        self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
-    "        self.drop_resid = nn.Dropout(cfg[\"drop_rate_resid\"])\n",
+    "        self.drop_shortcut = nn.Dropout(cfg[\"drop_rate_shortcut\"])\n",
     "\n",
     "    def forward(self, x):\n",
     "        # Shortcut connection for attention block\n",
     "        shortcut = x\n",
     "        x = self.norm1(x)\n",
     "        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]\n",
-    "        x = self.drop_resid(x)\n",
+    "        x = self.drop_shortcut(x)\n",
     "        x = x + shortcut  # Add the original input back\n",
     "\n",
     "        # Shortcut connection for feed-forward block\n",
     "        shortcut = x\n",
     "        x = self.norm2(x)\n",
     "        x = self.ff(x)\n",
-    "        x = self.drop_resid(x)\n",
+    "        x = self.drop_shortcut(x)\n",
     "        x = x + shortcut  # Add the original input back\n",
     "\n",
     "        return x\n",
@@ -370,7 +370,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.4"
   }
  },
 "nbformat": 4,
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index ff27673..b2d985b 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -162,21 +162,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x
diff --git a/ch05/01_main-chapter-code/exercise-solutions.ipynb b/ch05/01_main-chapter-code/exercise-solutions.ipynb
index 6f5eb2e..a0cb70e 100644
--- a/ch05/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch05/01_main-chapter-code/exercise-solutions.ipynb
@@ -519,7 +519,7 @@
    "train_losses, val_losses, tokens_seen = train_model_simple(\n",
    "    model, train_loader, val_loader, optimizer, device,\n",
    "    num_epochs=num_epochs, eval_freq=5, eval_iter=5,\n",
-   "    start_context=\"Every effort moves you\",\n",
+   "    start_context=\"Every effort moves you\", tokenizer=tokenizer\n",
    ")"
   ]
  },
@@ -605,7 +605,7 @@
    "text": [
     "File already exists and is up-to-date: gpt2/124M/checkpoint\n",
     "File already exists and is up-to-date: gpt2/124M/encoder.json\n",
-    "File already exists and is up-to-date: gpt2/124M/settings.json\n",
+    "File already exists and is up-to-date: gpt2/124M/hparams.json\n",
     "File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001\n",
     "File already exists and is up-to-date: gpt2/124M/model.ckpt.index\n",
     "File already exists and is up-to-date: gpt2/124M/model.ckpt.meta\n",
@@ -760,7 +760,7 @@
    "text": [
     "File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
     "File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
-    "File already exists and is up-to-date: gpt2/1558M/settings.json\n",
+    "File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
     "File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
     "File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
     "File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",
@@ -859,7 +859,7 @@
    "text": [
     "File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
     "File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
-    "File already exists and is up-to-date: gpt2/1558M/settings.json\n",
+    "File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
     "File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
     "File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
     "File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",
diff --git a/ch05/01_main-chapter-code/previous_chapters.py b/ch05/01_main-chapter-code/previous_chapters.py
index 7aa600e..9b05743 100644
--- a/ch05/01_main-chapter-code/previous_chapters.py
+++ b/ch05/01_main-chapter-code/previous_chapters.py
@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x
diff --git a/ch05/02_alternative_weight_loading/previous_chapters.py b/ch05/02_alternative_weight_loading/previous_chapters.py
index af3ab2e..19cc2c2 100644
--- a/ch05/02_alternative_weight_loading/previous_chapters.py
+++ b/ch05/02_alternative_weight_loading/previous_chapters.py
@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x
diff --git a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
index a5ded8e..d5f3b6d 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
@@ -164,21 +164,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x
diff --git a/ch05/05_bonus_hparam_tuning/previous_chapters.py b/ch05/05_bonus_hparam_tuning/previous_chapters.py
index 2fd2d9c..1fa5502 100644
--- a/ch05/05_bonus_hparam_tuning/previous_chapters.py
+++ b/ch05/05_bonus_hparam_tuning/previous_chapters.py
@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x
diff --git a/ch06/02_bonus_additional-experiments/previous_chapters.py b/ch06/02_bonus_additional-experiments/previous_chapters.py
index e794f9b..8d6f827 100644
--- a/ch06/02_bonus_additional-experiments/previous_chapters.py
+++ b/ch06/02_bonus_additional-experiments/previous_chapters.py
@@ -169,21 +169,21 @@ class TransformerBlock(nn.Module):
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

     def forward(self, x):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
         x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         # Shortcut connection for feed-forward block
         shortcut = x
         x = self.norm2(x)
         x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

         return x