Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2026-04-10 12:33:42 +00:00)
Rename variable to context_length to make it easier on readers (#106)

* rename to context length
* fix spacing
parent 684562733a
commit ccd7cebbb3
@@ -117,13 +117,13 @@
 "outputs": [],
 "source": [
 "GPT_CONFIG_124M = {\n",
-" \"vocab_size\": 50257, # Vocabulary size\n",
-" \"ctx_len\": 1024, # Context length\n",
-" \"emb_dim\": 768, # Embedding dimension\n",
-" \"n_heads\": 12, # Number of attention heads\n",
-" \"n_layers\": 12, # Number of layers\n",
-" \"drop_rate\": 0.1, # Dropout rate\n",
-" \"qkv_bias\": False # Query-Key-Value bias\n",
+" \"vocab_size\": 50257, # Vocabulary size\n",
+" \"context_length\": 1024, # Context length\n",
+" \"emb_dim\": 768, # Embedding dimension\n",
+" \"n_heads\": 12, # Number of attention heads\n",
+" \"n_layers\": 12, # Number of layers\n",
+" \"drop_rate\": 0.1, # Dropout rate\n",
+" \"qkv_bias\": False # Query-Key-Value bias\n",
 "}"
 ]
 },
@@ -134,7 +134,7 @@
 "source": [
 "- We use short variable names to avoid long lines of code later\n",
 "- `\"vocab_size\"` indicates a vocabulary size of 50,257 words, supported by the BPE tokenizer discussed in Chapter 2\n",
-"- `\"ctx_len\"` represents the model's maximum input token count, as enabled by positional embeddings covered in Chapter 2\n",
+"- `\"context_length\"` represents the model's maximum input token count, as enabled by positional embeddings covered in Chapter 2\n",
 "- `\"emb_dim\"` is the embedding size for token inputs, converting each input token into a 768-dimensional vector\n",
 "- `\"n_heads\"` is the number of attention heads in the multi-head attention mechanism implemented in Chapter 3\n",
 "- `\"n_layers\"` is the number of transformer blocks within the model, which we'll implement in upcoming sections\n",
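As a quick illustration of what the renamed "context_length" key controls, here is a minimal sketch (not part of the diff, built only from pieces visible in this commit): the positional-embedding table is sized by context_length, so inputs longer than that many tokens cannot be embedded.

import torch
import torch.nn as nn

cfg = {"vocab_size": 50257, "context_length": 1024, "emb_dim": 768}

tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])        # one vector per vocabulary entry
pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])    # one vector per position, 0..1023

token_ids = torch.randint(0, cfg["vocab_size"], (1, 8))          # batch of 1, 8 tokens (8 <= 1024)
x = tok_emb(token_ids) + pos_emb(torch.arange(token_ids.shape[1]))  # longer inputs would index past the table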
@@ -943,7 +943,7 @@
 " self.att = MultiHeadAttention(\n",
 " d_in=cfg[\"emb_dim\"],\n",
 " d_out=cfg[\"emb_dim\"],\n",
-" block_size=cfg[\"ctx_len\"],\n",
+" context_length=cfg[\"context_length\"],\n",
 " num_heads=cfg[\"n_heads\"], \n",
 " dropout=cfg[\"drop_rate\"],\n",
 " qkv_bias=cfg[\"qkv_bias\"])\n",
@@ -1489,7 +1489,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.6"
 }
 },
 "nbformat": 4,
@@ -34,11 +34,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
 "from gpt import TransformerBlock\n",
 "\n",
 "GPT_CONFIG_124M = {\n",
 " \"vocab_size\": 50257,\n",
-" \"ctx_len\": 1024,\n",
+" \"context_length\": 1024,\n",
 " \"emb_dim\": 768,\n",
 " \"n_heads\": 12,\n",
 " \"n_layers\": 12,\n",
@@ -139,7 +139,7 @@
 "source": [
 "GPT_CONFIG_124M = {\n",
 " \"vocab_size\": 50257,\n",
-" \"ctx_len\": 1024,\n",
+" \"context_length\": 1024,\n",
 " \"emb_dim\": 768,\n",
 " \"n_heads\": 12,\n",
 " \"n_layers\": 12,\n",
@@ -260,7 +260,7 @@
 "source": [
 "GPT_CONFIG_124M = {\n",
 " \"vocab_size\": 50257,\n",
-" \"ctx_len\": 1024,\n",
+" \"context_length\": 1024,\n",
 " \"emb_dim\": 768,\n",
 " \"n_heads\": 12,\n",
 " \"n_layers\": 12,\n",
@@ -288,7 +288,7 @@
 " self.att = MultiHeadAttention(\n",
 " d_in=cfg[\"emb_dim\"],\n",
 " d_out=cfg[\"emb_dim\"],\n",
-" block_size=cfg[\"ctx_len\"],\n",
+" context_length=cfg[\"context_length\"],\n",
 " num_heads=cfg[\"n_heads\"], \n",
 " dropout=cfg[\"drop_rate_attn\"], # NEW: dropout for multi-head attention\n",
 " qkv_bias=cfg[\"qkv_bias\"])\n",
@@ -319,7 +319,7 @@
 " def __init__(self, cfg):\n",
 " super().__init__()\n",
 " self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
-" self.pos_emb = nn.Embedding(cfg[\"ctx_len\"], cfg[\"emb_dim\"])\n",
+" self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
 " self.drop_emb = nn.Dropout(cfg[\"drop_rate_emb\"]) # NEW: dropout for embedding layers\n",
 "\n",
 " self.trf_blocks = nn.Sequential(\n",
@@ -370,7 +370,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.6"
 }
 },
 "nbformat": 4,
@@ -54,7 +54,7 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,
 # Chapter 3
 #####################################
 class MultiHeadAttention(nn.Module):
-    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
         assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

@@ -67,7 +67,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

     def forward(self, x):
         b, num_tokens, d_in = x.shape
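For orientation, a minimal sketch of a constructor call after this rename (the keyword is context_length instead of block_size). It assumes MultiHeadAttention can be imported from the repository's gpt module, as TransformerBlock is imported above; the shapes and hyperparameters are illustrative only.

import torch
from gpt import MultiHeadAttention  # assumption: exported by gpt.py alongside TransformerBlock

mha = MultiHeadAttention(
    d_in=768,
    d_out=768,
    context_length=1024,  # keyword renamed from block_size in this commit
    dropout=0.1,
    num_heads=12,
    qkv_bias=False)

x = torch.randn(2, 6, 768)  # (batch, num_tokens, d_in); num_tokens must not exceed context_length
out = mha(x)                # -> (2, 6, 768)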
@@ -156,7 +156,7 @@ class TransformerBlock(nn.Module):
         self.att = MultiHeadAttention(
             d_in=cfg["emb_dim"],
             d_out=cfg["emb_dim"],
-            block_size=cfg["ctx_len"],
+            context_length=cfg["context_length"],
             num_heads=cfg["n_heads"],
             dropout=cfg["drop_rate"],
             qkv_bias=cfg["qkv_bias"])
@@ -187,7 +187,7 @@ class GPTModel(nn.Module):
     def __init__(self, cfg):
         super().__init__()
         self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
-        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
+        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
         self.drop_emb = nn.Dropout(cfg["drop_rate"])

         self.trf_blocks = nn.Sequential(
@@ -236,13 +236,13 @@ def generate_text_simple(model, idx, max_new_tokens, context_size):

 def main():
     GPT_CONFIG_124M = {
-        "vocab_size": 50257, # Vocabulary size
-        "ctx_len": 1024, # Context length
-        "emb_dim": 768, # Embedding dimension
-        "n_heads": 12, # Number of attention heads
-        "n_layers": 12, # Number of layers
-        "drop_rate": 0.1, # Dropout rate
-        "qkv_bias": False # Query-Key-Value bias
+        "vocab_size": 50257, # Vocabulary size
+        "context_length": 1024, # Context length
+        "emb_dim": 768, # Embedding dimension
+        "n_heads": 12, # Number of attention heads
+        "n_layers": 12, # Number of layers
+        "drop_rate": 0.1, # Dropout rate
+        "qkv_bias": False # Query-Key-Value bias
     }

     torch.manual_seed(123)
@@ -264,7 +264,7 @@ def main():
         model=model,
         idx=encoded_tensor,
         max_new_tokens=10,
-        context_size=GPT_CONFIG_124M["ctx_len"]
+        context_size=GPT_CONFIG_124M["context_length"]
     )
     decoded_text = tokenizer.decode(out.squeeze(0).tolist())

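A hedged end-to-end sketch of the updated generation call, assuming GPTModel and generate_text_simple are importable from the repository's gpt module and that tiktoken provides the GPT-2 BPE tokenizer used in Chapter 2; the prompt string is illustrative.

import torch
import tiktoken
from gpt import GPTModel, generate_text_simple  # assumption: both are defined in gpt.py

GPT_CONFIG_124M = {
    "vocab_size": 50257, "context_length": 1024, "emb_dim": 768,
    "n_heads": 12, "n_layers": 12, "drop_rate": 0.1, "qkv_bias": False}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

tokenizer = tiktoken.get_encoding("gpt2")
encoded_tensor = torch.tensor(tokenizer.encode("Hello, I am")).unsqueeze(0)  # add batch dimension

out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"])  # key renamed from "ctx_len"
decoded_text = tokenizer.decode(out.squeeze(0).tolist())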
@@ -48,7 +48,7 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,


 class MultiHeadAttention(nn.Module):
-    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
         assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

@@ -61,7 +61,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))
+        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

     def forward(self, x):
         b, num_tokens, d_in = x.shape