# Chapter 4: Implementing a GPT Model from Scratch to Generate Text
# (Notes are in progress ...)
#
# This chapter implements the architecture of a GPT-like LLM; the next
# chapter trains it.

# 4.1 Coding the decoder

# Hyperparameters shared by every model class in this chapter. Each class
# receives this mapping as ``cfg`` and looks entries up by key.
GPT_CONFIG = dict(
    vocab_size=50257,  # rows of the token-embedding table / width of the logits
    ctx_len=1024,      # rows of the positional-embedding table (context length)
    emb_dim=768,       # embedding dimension
    n_heads=12,        # number of attention heads
    n_layers=12,       # number of stacked transformer blocks
    drop_rate=0.1,     # dropout probability
    qkv_bias=True,     # whether the query/key/value projections use a bias
)

import torch.nn as nn
class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block: accepts ``cfg`` and is the identity."""

    def __init__(self, cfg):
        super().__init__()
        # Intentionally empty: ``cfg`` is accepted only so the constructor
        # can be called the same way as a real block.

    def forward(self, x):
        # No computation; the input object itself is handed back.
        return x


class DummyLayerNorm(nn.Module):
    """Stand-in for layer normalization: same constructor shape, no effect."""

    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # Both arguments are ignored; they only mimic the LayerNorm interface.

    def forward(self, x):
        # No normalization; the input object itself is handed back.
        return x


class DummyGPTModel(nn.Module):
    """Skeleton of the GPT model wired up with placeholder components.

    Token ids are embedded, position embeddings are added, dropout is
    applied, and the result flows through ``cfg["n_layers"]`` placeholder
    blocks and a placeholder norm before a bias-free linear head maps it to
    vocabulary logits.

    Args:
        cfg: Mapping providing ``vocab_size``, ``ctx_len``, ``emb_dim``,
            ``n_layers`` and ``drop_rate``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["ctx_len"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Placeholders standing in for the real transformer blocks.
        placeholder_blocks = [
            DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])
        ]
        self.trf_blocks = nn.Sequential(*placeholder_blocks)

        # Placeholder standing in for the real layer normalization.
        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            in_idx: 2-D tensor of token ids, ``(batch, seq_len)``.
        """
        _, seq_len = in_idx.shape
        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.final_norm(self.trf_blocks(hidden))
        return self.out_head(hidden)
import tiktoken
import torch

tokenizer = tiktoken.get_encoding("gpt2")

# Two example sentences; torch.stack below requires both to encode to the
# same number of tokens.
txt1 = "Every effort moves you forward."
txt2 = "Every day holds a lesson."

# Encode each sentence and stack the id tensors along a new batch dimension.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
batch

# Run the placeholder model once on the example batch with a fixed seed.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG)

out = model(batch)
print("Output shape:", out.shape)
print(out)

# 4.2 Normalizing activations with LayerNorm
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Size of the last dimension of the inputs; ``scale`` and
            ``shift`` are vectors of this length.
        eps: Constant added to the variance before taking the square root so
            the division is well defined for constant inputs. Defaults to
            ``1e-5``, the value that was previously hard-coded.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by N rather than N - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


# 4.3 Adding GELU activation functions

class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """

    # sqrt(2 / pi) as a plain Python float, evaluated once at class creation.
    # This avoids building a new tensor on every forward call and lets the
    # constant follow the dtype/device of ``x`` when they are multiplied.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * x ** 3)
        return 0.5 * x * (1 + torch.tanh(inner))


class FeedForward(nn.Module):
    """Two-layer MLP: expand to ``4 * emb_dim``, GELU, project back, dropout.

    Args:
        cfg: Mapping providing ``emb_dim`` and ``drop_rate``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.net = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, emb_dim),
            nn.Dropout(cfg["drop_rate"]),
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)


# 4.4 Understanding shortcut connections
class ExampleWithShortcut(nn.Module):
    """Small three-layer network illustrating a shortcut (residual) connection.

    The input is added back to the output of the second layer, so the input
    must have 10 features to match ``fc2``'s output width.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 10)
        self.fc2 = nn.Linear(10, 10)
        self.fc3 = nn.Linear(10, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map ``(..., 10)`` inputs to ``(..., 1)`` outputs."""
        identity = x
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x)) + identity  # Shortcut connection
        x = self.fc3(x)
        return x


torch.manual_seed(123)
ex_short = ExampleWithShortcut()
inputs = torch.randn(5, 10)
ex_short(inputs)


# 4.5 Connecting attention and linear layers

from previous_chapters import MultiHeadAttention


class TransformerBlock(nn.Module):
    """Attention and feed-forward sub-layers, each with norm-first residuals.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``.

    Args:
        cfg: Mapping providing ``emb_dim``, ``ctx_len``, ``n_heads``,
            ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            block_size=cfg["ctx_len"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_resid = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both residual sub-layers; the output has the shape of ``x``."""
        x = x + self.drop_resid(self.att(self.norm1(x)))
        x = x + self.drop_resid(self.ff(self.norm2(x)))
        return x


class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, transformer blocks, final norm, LM head.

    Same layout as ``DummyGPTModel`` with the placeholders replaced by
    ``TransformerBlock`` and ``LayerNorm``, including the dropout applied to
    the summed token and position embeddings.

    Args:
        cfg: Mapping providing ``vocab_size``, ``ctx_len``, ``emb_dim``,
            ``n_layers`` and ``drop_rate``, plus the keys read by
            ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
        # Embedding dropout, as in DummyGPTModel. It has no parameters, so
        # parameter counts and state-dict keys are unaffected.
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            in_idx: 2-D tensor of token ids, ``(batch, seq_len)``.
        """
        _, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        # Position ids 0..seq_len-1, created on the same device as the input.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


torch.manual_seed(123)
model = GPTModel(GPT_CONFIG)

out = model(batch)
print("Output shape:", out.shape)
print(out)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

# tok_emb and out_head hold the same number of weights, so subtracting one of
# them gives the count the model would have if the two were shared. The model
# above does not actually tie them.
total_params_gpt2 = total_params - sum(p.numel() for p in model.tok_emb.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

# Calculate the total size in bytes (assuming float32, 4 bytes per parameter)
total_size_bytes = total_params * 4

# Convert to megabytes (1 MB taken as 1024 * 1024 bytes)
total_size_mb = total_size_bytes / (1024 * 1024)

print(f"Total size of the model: {total_size_mb:.2f} MB")

# 4.6 Implementing the forward pass
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Each sample is a window of ``max_length``
    token ids paired with the same window shifted right by one token.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of ids.
        max_length: Number of tokens per sample.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt)

        # Stop max_length tokens before the end so every target window,
        # which reaches one token further than its input, stays in range.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(txt, batch_size=4, max_length=256, stride=128,
                      shuffle=True, drop_last=False, num_workers=0):
    """Build a ``DataLoader`` over ``GPTDatasetV1`` using the GPT-2 tokenizer.

    Args:
        txt: Text to tokenize and chunk.
        batch_size: Samples per batch.
        max_length: Tokens per sample.
        stride: Step between consecutive sample windows.
        shuffle: Whether the loader shuffles the samples.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
            Defaults to ``False``, the previous behavior.
        num_workers: Worker processes used for loading. Defaults to ``0``,
            the previous behavior.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(input_ids, target_ids)``.
    """
    # Imported here because this function is the module's only tiktoken
    # user; the attention module below stays importable without it.
    import tiktoken

    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible
            by ``num_heads``.
        block_size: Side length of the causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias.
    """

    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Per-head share of the output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the positions to hide.
        self.register_buffer("mask", torch.triu(torch.ones(block_size, block_size), diagonal=1))

    def forward(self, x):
        """Return context vectors of shape ``(b, num_tokens, d_out)``.

        Args:
            x: 3-D input tensor, ``(b, num_tokens, d_in)``.
        """
        b, num_tokens, _ = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Split the last dim into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Dot product of queries and keys for each head
        attn_scores = queries @ keys.transpose(2, 3)
        # Mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        # Add two leading dims so the mask broadcasts over batch and heads
        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
        # -inf scores become zero weights after the softmax
        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec