Rename variable to context_length to make it easier on readers (#106)

* rename to context length

* fix spacing
This commit is contained in:
Sebastian Raschka
2024-04-04 07:27:41 -05:00
committed by GitHub
parent 684562733a
commit ccd7cebbb3
25 changed files with 242 additions and 242 deletions

View File

@@ -140,13 +140,13 @@
"from previous_chapters import GPTModel\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
"}\n",
"\n",
"torch.manual_seed(123)\n",
@@ -161,10 +161,10 @@
"source": [
"- We use dropout of 0.1 above, but it's relatively common to train LLMs without dropout nowadays\n",
"- Modern LLMs also don't use bias vectors in the `nn.Linear` layers for the query, key, and value matrices (unlike earlier GPT models), which is achieved by setting `\"qkv_bias\": False`\n",
"- We reduce the context length (`ctx_len`) of only 256 tokens to reduce the computational resource requirements for training the model, whereas the original 124 million parameter GPT-2 model used 1024 characters\n",
"- We reduce the context length (`context_length`) to only 256 tokens to lower the computational resource requirements for training the model, whereas the original 124 million parameter GPT-2 model used a context length of 1024 tokens\n",
" - This is so that more readers will be able to follow and execute the code examples on their laptop computer\n",
" - However, please feel free to increase the `ctx_len` to 1024 tokens (this would not require any code changes)\n",
" - We will also load a model with a 1024 `ctx_len` later from pretrained weights"
" - However, please feel free to increase the `context_length` to 1024 tokens (this would not require any code changes)\n",
" - We will also load a model with a 1024 `context_length` later from pretrained weights"
]
},
{
@@ -219,7 +219,7 @@
" model=model,\n",
" idx=text_to_token_ids(start_context, tokenizer),\n",
" max_new_tokens=10,\n",
" context_size=GPT_CONFIG_124M[\"ctx_len\"]\n",
" context_size=GPT_CONFIG_124M[\"context_length\"]\n",
")\n",
"\n",
"print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
@@ -928,8 +928,8 @@
"train_loader = create_dataloader_v1(\n",
" train_data,\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=True,\n",
" shuffle=True\n",
")\n",
@@ -937,8 +937,8 @@
"val_loader = create_dataloader_v1(\n",
" val_data,\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=False,\n",
" shuffle=False\n",
")"
@@ -953,14 +953,14 @@
"source": [
"# Sanity check\n",
"\n",
"if total_tokens * (train_ratio) < GPT_CONFIG_124M[\"ctx_len\"]:\n",
"if total_tokens * (train_ratio) < GPT_CONFIG_124M[\"context_length\"]:\n",
" print(\"Not enough tokens for the training loader. \"\n",
" \"Try to lower the `GPT_CONFIG_124M['ctx_len']` or \"\n",
" \"Try to lower the `GPT_CONFIG_124M['context_length']` or \"\n",
" \"increase the `training_ratio`\")\n",
"\n",
"if total_tokens * (1-train_ratio) < GPT_CONFIG_124M[\"ctx_len\"]:\n",
"if total_tokens * (1-train_ratio) < GPT_CONFIG_124M[\"context_length\"]:\n",
" print(\"Not enough tokens for the validation loader. \"\n",
" \"Try to lower the `GPT_CONFIG_124M['ctx_len']` or \"\n",
" \"Try to lower the `GPT_CONFIG_124M['context_length']` or \"\n",
" \"decrease the `training_ratio`\")"
]
},
@@ -1441,7 +1441,7 @@
" model=model,\n",
" idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
" max_new_tokens=25,\n",
" context_size=GPT_CONFIG_124M[\"ctx_len\"]\n",
" context_size=GPT_CONFIG_124M[\"context_length\"]\n",
")\n",
"\n",
"print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
@@ -1906,7 +1906,7 @@
" model=model,\n",
" idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
" max_new_tokens=15,\n",
" context_size=GPT_CONFIG_124M[\"ctx_len\"],\n",
" context_size=GPT_CONFIG_124M[\"context_length\"],\n",
" top_k=25,\n",
" temperature=1.4\n",
")\n",
@@ -2203,7 +2203,7 @@
"model_name = \"gpt2-small (124M)\" # Example model name\n",
"NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
"NEW_CONFIG.update(model_configs[model_name])\n",
"NEW_CONFIG.update({\"ctx_len\": 1024, \"qkv_bias\": True})\n",
"NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})\n",
"\n",
"gpt = GPTModel(NEW_CONFIG)\n",
"gpt.eval();"
@@ -2338,7 +2338,7 @@
" model=gpt,\n",
" idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
" max_new_tokens=25,\n",
" context_size=NEW_CONFIG[\"ctx_len\"],\n",
" context_size=NEW_CONFIG[\"context_length\"],\n",
" top_k=50,\n",
" temperature=1.5\n",
")\n",
@@ -2403,7 +2403,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.10.6"
}
},
"nbformat": 4,

View File

@@ -234,7 +234,7 @@
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
@@ -286,7 +286,7 @@
" model=model,\n",
" idx=text_to_token_ids(start_context, tokenizer),\n",
" max_new_tokens=25,\n",
" context_size=GPT_CONFIG_124M[\"ctx_len\"]\n",
" context_size=GPT_CONFIG_124M[\"context_length\"]\n",
")\n",
"\n",
"print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
@@ -314,7 +314,7 @@
" model=model,\n",
" idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
" max_new_tokens=25,\n",
" context_size=GPT_CONFIG_124M[\"ctx_len\"],\n",
" context_size=GPT_CONFIG_124M[\"context_length\"],\n",
" top_k=None,\n",
" temperature=0.0\n",
")\n",
@@ -344,7 +344,7 @@
" model=model,\n",
" idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
" max_new_tokens=25,\n",
" context_size=GPT_CONFIG_124M[\"ctx_len\"],\n",
" context_size=GPT_CONFIG_124M[\"context_length\"],\n",
" top_k=None,\n",
" temperature=0.0\n",
")\n",
@@ -383,13 +383,13 @@
"\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
"}\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
@@ -451,8 +451,8 @@
"train_loader = create_dataloader_v1(\n",
" train_data,\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=True,\n",
" shuffle=True\n",
")\n",
@@ -460,8 +460,8 @@
"val_loader = create_dataloader_v1(\n",
" val_data,\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=False,\n",
" shuffle=False\n",
")"
@@ -557,13 +557,13 @@
"\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
"}\n",
"\n",
"\n",
@@ -617,7 +617,7 @@
"model_name = \"gpt2-small (124M)\" # Example model name\n",
"NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
"NEW_CONFIG.update(model_configs[model_name])\n",
"NEW_CONFIG.update({\"ctx_len\": 1024, \"qkv_bias\": True})\n",
"NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})\n",
"\n",
"gpt = GPTModel(NEW_CONFIG)\n",
"gpt.eval();"
@@ -675,8 +675,8 @@
"train_loader = create_dataloader_v1(\n",
" train_data,\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=True,\n",
" shuffle=True\n",
")\n",
@@ -684,8 +684,8 @@
"val_loader = create_dataloader_v1(\n",
" val_data,\n",
" batch_size=2,\n",
" max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
" stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
" max_length=GPT_CONFIG_124M[\"context_length\"],\n",
" stride=GPT_CONFIG_124M[\"context_length\"],\n",
" drop_last=False,\n",
" shuffle=False\n",
")"
@@ -753,7 +753,7 @@
"model_name = \"gpt2-xl (1558M)\"\n",
"NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
"NEW_CONFIG.update(model_configs[model_name])\n",
"NEW_CONFIG.update({\"ctx_len\": 1024, \"qkv_bias\": True})\n",
"NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})\n",
"\n",
"gpt = GPTModel(NEW_CONFIG)\n",
"gpt.eval();\n",
@@ -811,13 +811,13 @@
"\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
"}\n",
"\n",
"\n",
@@ -859,7 +859,7 @@
"model_name = \"gpt2-xl (1558M)\"\n",
"NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
"NEW_CONFIG.update(model_configs[model_name])\n",
"NEW_CONFIG.update({\"ctx_len\": 1024, \"qkv_bias\": True})\n",
"NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})\n",
"\n",
"gpt = GPTModel(NEW_CONFIG)\n",
"gpt.eval()\n",
@@ -901,7 +901,7 @@
" model=gpt,\n",
" idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
" max_new_tokens=25,\n",
" context_size=NEW_CONFIG[\"ctx_len\"],\n",
" context_size=NEW_CONFIG[\"context_length\"],\n",
" top_k=50,\n",
" temperature=1.5\n",
")\n",
@@ -926,7 +926,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.10.6"
}
},
"nbformat": 4,

View File

@@ -234,7 +234,7 @@ def main(gpt_config, input_prompt, model_size):
model=gpt,
idx=text_to_token_ids(input_prompt, tokenizer),
max_new_tokens=30,
context_size=gpt_config["ctx_len"],
context_size=gpt_config["context_length"],
top_k=1,
temperature=1.0
)
@@ -250,10 +250,10 @@ if __name__ == "__main__":
INPUT_PROMPT = "Every effort moves"
BASE_CONFIG = {
"vocab_size": 50257, # Vocabulary size
"ctx_len": 1024, # Context length
"drop_rate": 0.0, # Dropout rate
"qkv_bias": True # Query-key-value bias
"vocab_size": 50257, # Vocabulary size
"context_length": 1024, # Context length
"drop_rate": 0.0, # Dropout rate
"qkv_bias": True # Query-key-value bias
}
model_configs = {

View File

@@ -166,8 +166,8 @@ def main(gpt_config, hparams):
train_loader = create_dataloader_v1(
text_data[:split_idx],
batch_size=hparams["batch_size"],
max_length=gpt_config["ctx_len"],
stride=gpt_config["ctx_len"],
max_length=gpt_config["context_length"],
stride=gpt_config["context_length"],
drop_last=True,
shuffle=True
)
@@ -175,8 +175,8 @@ def main(gpt_config, hparams):
val_loader = create_dataloader_v1(
text_data[split_idx:],
batch_size=hparams["batch_size"],
max_length=gpt_config["ctx_len"],
stride=gpt_config["ctx_len"],
max_length=gpt_config["context_length"],
stride=gpt_config["context_length"],
drop_last=False,
shuffle=False
)
@@ -197,13 +197,13 @@ def main(gpt_config, hparams):
if __name__ == "__main__":
GPT_CONFIG_124M = {
"vocab_size": 50257, # Vocabulary size
"ctx_len": 256, # Shortened context length (orig: 1024)
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-key-value bias
"vocab_size": 50257, # Vocabulary size
"context_length": 256, # Shortened context length (orig: 1024)
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-key-value bias
}
OTHER_HPARAMS = {

View File

@@ -54,7 +54,7 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
super().__init__()
assert d_out % num_heads == 0, "d_out must be divisible by n_heads"
@@ -67,7 +67,7 @@ class MultiHeadAttention(nn.Module):
self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs
self.dropout = nn.Dropout(dropout)
self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))
self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
def forward(self, x):
b, num_tokens, d_in = x.shape
@@ -156,7 +156,7 @@ class TransformerBlock(nn.Module):
self.att = MultiHeadAttention(
d_in=cfg["emb_dim"],
d_out=cfg["emb_dim"],
block_size=cfg["ctx_len"],
context_length=cfg["context_length"],
num_heads=cfg["n_heads"],
dropout=cfg["drop_rate"],
qkv_bias=cfg["qkv_bias"])
@@ -187,7 +187,7 @@ class GPTModel(nn.Module):
def __init__(self, cfg):
super().__init__()
self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
self.drop_emb = nn.Dropout(cfg["drop_rate"])
self.trf_blocks = nn.Sequential(
@@ -237,13 +237,13 @@ def generate_text_simple(model, idx, max_new_tokens, context_size):
if __name__ == "__main__":
GPT_CONFIG_124M = {
"vocab_size": 50257, # Vocabulary size
"ctx_len": 1024, # Context length
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-Key-Value bias
"vocab_size": 50257, # Vocabulary size
"context_length": 1024, # Context length
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-Key-Value bias
}
torch.manual_seed(123)
@@ -265,7 +265,7 @@ if __name__ == "__main__":
model=model,
idx=encoded_tensor,
max_new_tokens=10,
context_size=GPT_CONFIG_124M["ctx_len"]
context_size=GPT_CONFIG_124M["context_length"]
)
decoded_text = tokenizer.decode(out.squeeze(0).tolist())

View File

@@ -13,10 +13,10 @@ from gpt_train import main
def gpt_config():
return {
"vocab_size": 50257,
"ctx_len": 12, # small for testing efficiency
"emb_dim": 32, # small for testing efficiency
"n_heads": 4, # small for testing efficiency
"n_layers": 2, # small for testing efficiency
"context_length": 12, # small for testing efficiency
"emb_dim": 32, # small for testing efficiency
"n_heads": 4, # small for testing efficiency
"n_layers": 2, # small for testing efficiency
"drop_rate": 0.1,
"qkv_bias": False
}