Rename variable to context_length to make it easier on readers (#106)

* rename to context length

* fix spacing
This commit is contained in:
Sebastian Raschka
2024-04-04 07:27:41 -05:00
committed by GitHub
parent a940373a14
commit 2de60d1bfb
25 changed files with 242 additions and 242 deletions

View File

@@ -61,13 +61,13 @@
"from previous_chapters import GPTModel\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
"}\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
@@ -127,8 +127,8 @@
"train_loader = create_dataloader_v1(\n",
" text_data[:split_idx],\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=True,\n",
" shuffle=True\n",
")\n",
@@ -136,8 +136,8 @@
"val_loader = create_dataloader_v1(\n",
" text_data[split_idx:],\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=False,\n",
" shuffle=False\n",
")"
@@ -755,7 +755,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.10.6"
}
},
"nbformat": 4,

View File

@@ -61,7 +61,7 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,
#####################################
class MultiHeadAttention(nn.Module):
def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
super().__init__()
assert d_out % num_heads == 0, "d_out must be divisible by n_heads"
@@ -74,7 +74,7 @@ class MultiHeadAttention(nn.Module):
self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs
self.dropout = nn.Dropout(dropout)
self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))
self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
def forward(self, x):
b, num_tokens, d_in = x.shape
@@ -164,7 +164,7 @@ class TransformerBlock(nn.Module):
self.att = MultiHeadAttention(
d_in=cfg["emb_dim"],
d_out=cfg["emb_dim"],
block_size=cfg["ctx_len"],
context_length=cfg["ctx_len"],
num_heads=cfg["n_heads"],
dropout=cfg["drop_rate"],
qkv_bias=cfg["qkv_bias"])