Rename drop_resid to drop_shortcut (#136)

This commit is contained in:
Sebastian Raschka
2024-04-28 14:31:27 -05:00
committed by GitHub
parent 70cd174091
commit 97ed38116a
10 changed files with 37 additions and 37 deletions

View File

@@ -950,21 +950,21 @@
" self.ff = FeedForward(cfg)\n",
" self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
" self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
-" self.drop_resid = nn.Dropout(cfg[\"drop_rate\"])\n",
+" self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n",
"\n",
" def forward(self, x):\n",
" # Shortcut connection for attention block\n",
" shortcut = x\n",
" x = self.norm1(x)\n",
" x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n",
-" x = self.drop_resid(x)\n",
+" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" # Shortcut connection for feed forward block\n",
" shortcut = x\n",
" x = self.norm2(x)\n",
" x = self.ff(x)\n",
-" x = self.drop_resid(x)\n",
+" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" return x"
@@ -1489,7 +1489,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.6"
}
},
"nbformat": 4,

View File

@@ -34,7 +34,7 @@
"metadata": {},
"outputs": [],
"source": [
-"from gpt import Transfocontext_lengthmerBlock\n",
+"from gpt import TransformerBlock\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257,\n",
@@ -264,9 +264,9 @@
" \"emb_dim\": 768,\n",
" \"n_heads\": 12,\n",
" \"n_layers\": 12,\n",
-" \"drop_rate_emb\": 0.1, # NEW: dropout for embedding layers\n",
-" \"drop_rate_attn\": 0.1, # NEW: dropout for multi-head attention \n",
-" \"drop_rate_resid\": 0.1, # NEW: dropout for residual connections \n",
+" \"drop_rate_emb\": 0.1, # NEW: dropout for embedding layers\n",
+" \"drop_rate_attn\": 0.1, # NEW: dropout for multi-head attention \n",
+" \"drop_rate_shortcut\": 0.1, # NEW: dropout for shortcut connections \n",
" \"qkv_bias\": False\n",
"}"
]
@@ -295,21 +295,21 @@
" self.ff = FeedForward(cfg)\n",
" self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
" self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
-" self.drop_resid = nn.Dropout(cfg[\"drop_rate_resid\"])\n",
+" self.drop_shortcut = nn.Dropout(cfg[\"drop_rate_shortcut\"])\n",
"\n",
" def forward(self, x):\n",
" # Shortcut connection for attention block\n",
" shortcut = x\n",
" x = self.norm1(x)\n",
" x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n",
-" x = self.drop_resid(x)\n",
+" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" # Shortcut connection for feed-forward block\n",
" shortcut = x\n",
" x = self.norm2(x)\n",
" x = self.ff(x)\n",
-" x = self.drop_resid(x)\n",
+" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" return x\n",
@@ -370,7 +370,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-"version": "3.10.6"
+"version": "3.11.4"
}
},
"nbformat": 4,

View File

@@ -162,21 +162,21 @@ class TransformerBlock(nn.Module):
self.ff = FeedForward(cfg)
self.norm1 = LayerNorm(cfg["emb_dim"])
self.norm2 = LayerNorm(cfg["emb_dim"])
-self.drop_resid = nn.Dropout(cfg["drop_rate"])
+self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
def forward(self, x):
# Shortcut connection for attention block
shortcut = x
x = self.norm1(x)
x = self.att(x) # Shape [batch_size, num_tokens, emb_size]
-x = self.drop_resid(x)
+x = self.drop_shortcut(x)
x = x + shortcut # Add the original input back
# Shortcut connection for feed-forward block
shortcut = x
x = self.norm2(x)
x = self.ff(x)
-x = self.drop_resid(x)
+x = self.drop_shortcut(x)
x = x + shortcut # Add the original input back
return x