Remove redundant dropout in MLP module (#105)

This commit is contained in:
Sebastian Raschka
2024-04-03 20:19:08 -05:00
committed by GitHub
parent edcae09884
commit 5beff4e25a
11 changed files with 202 additions and 266 deletions

File diff suppressed because one or more lines are too long

View File

@@ -253,7 +253,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 1,
"id": "5fee2cf5-61c3-4167-81b5-44ea155bbaf2",
"metadata": {},
"outputs": [],
@@ -265,7 +265,6 @@
" \"n_heads\": 12,\n",
" \"n_layers\": 12,\n",
" \"drop_rate_emb\": 0.1, # NEW: dropout for embedding layers\n",
" \"drop_rate_ffn\": 0.1, # NEW: dropout for feed forward module\n",
" \"drop_rate_attn\": 0.1, # NEW: dropout for multi-head attention \n",
" \"drop_rate_resid\": 0.1, # NEW: dropout for residual connections \n",
" \"qkv_bias\": False\n",
@@ -274,26 +273,13 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 2,
"id": "5aa1b0c1-d78a-48fc-ad08-4802458b43f7",
"metadata": {},
"outputs": [],
"source": [
"import torch.nn as nn\n",
"from gpt import MultiHeadAttention, LayerNorm, GELU\n",
"\n",
"class FeedForward(nn.Module):\n",
" def __init__(self, cfg):\n",
" super().__init__()\n",
" self.layers = nn.Sequential(\n",
" nn.Linear(cfg[\"emb_dim\"], 4 * cfg[\"emb_dim\"]),\n",
" GELU(),\n",
" nn.Linear(4 * cfg[\"emb_dim\"], cfg[\"emb_dim\"]),\n",
" nn.Dropout(cfg[\"drop_rate_ffn\"]) # NEW: dropout for feed forward module\n",
" )\n",
"\n",
" def forward(self, x):\n",
" return self.layers(x)\n",
"from gpt import MultiHeadAttention, LayerNorm, GELU, FeedForward\n",
"\n",
"\n",
"class TransformerBlock(nn.Module):\n",
@@ -356,7 +342,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 3,
"id": "1d013d32-c275-4f42-be21-9010f1537227",
"metadata": {},
"outputs": [],
@@ -384,7 +370,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.4"
}
},
"nbformat": 4,

View File

@@ -144,7 +144,6 @@ class FeedForward(nn.Module):
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
GELU(),
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
nn.Dropout(cfg["drop_rate"])
)
def forward(self, x):