Rename drop_resid to drop_shortcut (#136)

2026-04-10 12:33:42 +00:00 · 2024-04-28 14:31:27 -05:00
parent 70cd174091
commit 97ed38116a
10 changed files with 37 additions and 37 deletions
--- a/ch05/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch05/01_main-chapter-code/exercise-solutions.ipynb
@@ -519,7 +519,7 @@
    "train_losses, val_losses, tokens_seen = train_model_simple(\n",
    "    model, train_loader, val_loader, optimizer, device,\n",
    "    num_epochs=num_epochs, eval_freq=5, eval_iter=5,\n",
-    "    start_context=\"Every effort moves you\",\n",
+    "    start_context=\"Every effort moves you\", tokenizer=tokenizer\n",
    ")"
   ]
  },
@@ -605,7 +605,7 @@
     "text": [
      "File already exists and is up-to-date: gpt2/124M/checkpoint\n",
      "File already exists and is up-to-date: gpt2/124M/encoder.json\n",
-      "File already exists and is up-to-date: gpt2/124M/settings.json\n",
+      "File already exists and is up-to-date: gpt2/124M/hparams.json\n",
      "File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001\n",
      "File already exists and is up-to-date: gpt2/124M/model.ckpt.index\n",
      "File already exists and is up-to-date: gpt2/124M/model.ckpt.meta\n",
@@ -760,7 +760,7 @@
     "text": [
      "File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
      "File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
-      "File already exists and is up-to-date: gpt2/1558M/settings.json\n",
+      "File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
      "File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
      "File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
      "File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",
@@ -859,7 +859,7 @@
     "text": [
      "File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
      "File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
-      "File already exists and is up-to-date: gpt2/1558M/settings.json\n",
+      "File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
      "File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
      "File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
      "File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",
--- a/ch05/01_main-chapter-code/previous_chapters.py
+++ b/ch05/01_main-chapter-code/previous_chapters.py
@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
--- a/ch05/02_alternative_weight_loading/previous_chapters.py
+++ b/ch05/02_alternative_weight_loading/previous_chapters.py
@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
--- a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
@@ -164,21 +164,21 @@ class TransformerBlock(nn.Module):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
--- a/ch05/05_bonus_hparam_tuning/previous_chapters.py
+++ b/ch05/05_bonus_hparam_tuning/previous_chapters.py
@@ -167,21 +167,21 @@ class TransformerBlock(nn.Module):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-        self.drop_resid = nn.Dropout(cfg["drop_rate"])
+        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-        x = self.drop_resid(x)
+        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x