diff --git a/.github/workflows/basic-tests-linux-uv.yml b/.github/workflows/basic-tests-linux-uv.yml index 982e8f9..35864a8 100644 --- a/.github/workflows/basic-tests-linux-uv.yml +++ b/.github/workflows/basic-tests-linux-uv.yml @@ -66,3 +66,9 @@ jobs: run: | source .venv/bin/activate pytest ch02/05_bpe-from-scratch/tests/tests.py + + - name: Test Selected Bonus Materials + shell: bash + run: | + source .venv/bin/activate + pytest pkg/llms_from_scratch/tests/ diff --git a/appendix-D/01_main-chapter-code/appendix-D.ipynb b/appendix-D/01_main-chapter-code/appendix-D.ipynb index d10b811..64c6d67 100644 --- a/appendix-D/01_main-chapter-code/appendix-D.ipynb +++ b/appendix-D/01_main-chapter-code/appendix-D.ipynb @@ -68,6 +68,11 @@ "\n", "\n", "from previous_chapters import GPTModel\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch04 import GPTModel\n", "\n", "GPT_CONFIG_124M = {\n", " \"vocab_size\": 50257, # Vocabulary size\n", @@ -139,6 +144,9 @@ "outputs": [], "source": [ "from previous_chapters import create_dataloader_v1\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch02 import create_dataloader_v1\n", + "\n", "\n", "# Train/validation ratio\n", "train_ratio = 0.90\n", @@ -454,6 +462,9 @@ "outputs": [], "source": [ "from previous_chapters import calc_loss_batch\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import calc_loss_batch\n", + "\n", "\n", "torch.manual_seed(123)\n", "model = GPTModel(GPT_CONFIG_124M)\n", @@ -551,6 +562,9 @@ "outputs": [], "source": [ "from previous_chapters import evaluate_model, generate_and_print_sample\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import evaluate_model, generate_and_print_sample\n", + "\n", "\n", "ORIG_BOOK_VERSION = False\n", "\n", @@ -790,6 +804,9 @@ ], "source": [ "from previous_chapters import plot_losses\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import plot_losses\n", + "\n", "\n", "epochs_tensor = torch.linspace(1, n_epochs, len(train_losses))\n", "plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)\n", @@ -823,7 +840,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/appendix-E/01_main-chapter-code/appendix-E.ipynb b/appendix-E/01_main-chapter-code/appendix-E.ipynb index 24080c9..dec9921 100644 --- a/appendix-E/01_main-chapter-code/appendix-E.ipynb +++ b/appendix-E/01_main-chapter-code/appendix-E.ipynb @@ -198,6 +198,16 @@ " create_balanced_dataset,\n", " random_split\n", ")\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch06 import (\n", + "# download_and_unzip_spam_data,\n", + "# create_balanced_dataset,\n", + "# random_split\n", + "# )\n", + "\n", "\n", "\n", "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n", @@ -409,6 +419,10 @@ "source": [ "from gpt_download import download_and_load_gpt2\n", "from previous_chapters import GPTModel, load_weights_into_gpt\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch04 import GPTModel\n", + "# from llms_from_scratch.ch05 import 
load_weights_into_gpt\n", + "\n", "\n", "\n", "CHOOSE_MODEL = \"gpt2-small (124M)\"\n", @@ -577,6 +591,9 @@ ], "source": [ "from previous_chapters import calc_accuracy_loader\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch06 import calc_accuracy_loader\n", + "\n", "\n", "\n", "torch.manual_seed(123)\n", @@ -1387,6 +1404,8 @@ "source": [ "import time\n", "from previous_chapters import train_classifier_simple\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch06 import train_classifier_simple\n", "\n", "\n", "start_time = time.time()\n", @@ -1442,6 +1461,8 @@ ], "source": [ "from previous_chapters import plot_values\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch06 import plot_values\n", "\n", "epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))\n", "examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses))\n", diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb index 2f25aca..4f6c9f3 100644 --- a/ch04/01_main-chapter-code/ch04.ipynb +++ b/ch04/01_main-chapter-code/ch04.ipynb @@ -970,6 +970,12 @@ "metadata": {}, "outputs": [], "source": [ + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch03 import MultiHeadAttention\n", + "\n", "from previous_chapters import MultiHeadAttention\n", "\n", "\n", diff --git a/ch04/02_performance-analysis/README.md b/ch04/02_performance-analysis/README.md index d3eee11..a741ac8 100644 --- a/ch04/02_performance-analysis/README.md +++ b/ch04/02_performance-analysis/README.md @@ -1,5 +1,4 @@ # Chapter 4: Implementing a GPT Model from Scratch To Generate Text - [flops-analysis.ipynb](flops-analysis.ipynb) analyses the floating point operations per second (FLOPS) of the GPT model(s) implemented in the main chapter. -- [previous_chapters.py](previous_chapters.py) is a Python module containing the `GPTModel` code we implemented in chapter 4 and other code implemented in previous chapters, which we import in the analysis notebook. - `requirements-extra.txt` includes additional Python libraries that need to be installed (via `pip install -r requirements-extra.txt`. 
\ No newline at end of file diff --git a/ch04/02_performance-analysis/flops-analysis.ipynb b/ch04/02_performance-analysis/flops-analysis.ipynb index 2401526..71e3a54 100644 --- a/ch04/02_performance-analysis/flops-analysis.ipynb +++ b/ch04/02_performance-analysis/flops-analysis.ipynb @@ -127,7 +127,9 @@ "import torch\n", "from thop import profile\n", "\n", - "from previous_chapters import GPTModel\n", + "# For installation instructions, see:\n", + "# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "from llms_from_scratch.ch04 import GPTModel\n", "\n", "\n", "BASE_CONFIG = {\n", @@ -550,7 +552,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/ch05/01_main-chapter-code/ch05.ipynb b/ch05/01_main-chapter-code/ch05.ipynb index d846214..ed87065 100644 --- a/ch05/01_main-chapter-code/ch05.ipynb +++ b/ch05/01_main-chapter-code/ch05.ipynb @@ -147,6 +147,11 @@ "source": [ "import torch\n", "from previous_chapters import GPTModel\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch04 import GPTModel\n", "\n", "GPT_CONFIG_124M = {\n", " \"vocab_size\": 50257, # Vocabulary size\n", @@ -212,6 +217,9 @@ "import tiktoken\n", "from previous_chapters import generate_text_simple\n", "\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch04 import generate_text_simple\n", + "\n", "def text_to_token_ids(text, tokenizer):\n", " encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})\n", " encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n", @@ -924,6 +932,8 @@ "outputs": [], "source": [ "from previous_chapters import create_dataloader_v1\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch02 import create_dataloader_v1\n", "\n", "# Train/validation ratio\n", "train_ratio = 0.90\n", @@ -2548,7 +2558,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.8" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/ch05/02_alternative_weight_loading/previous_chapters.py b/ch05/02_alternative_weight_loading/previous_chapters.py deleted file mode 100644 index 6c5b101..0000000 --- a/ch05/02_alternative_weight_loading/previous_chapters.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch -# -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-4. -# This file can be run as a standalone script. 
- -import tiktoken -import torch -import torch.nn as nn -from torch.utils.data import Dataset, DataLoader - -##################################### -# Chapter 2 -##################################### - - -class GPTDatasetV1(Dataset): - def __init__(self, txt, tokenizer, max_length, stride): - self.input_ids = [] - self.target_ids = [] - - # Tokenize the entire text - token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"}) - - # Use a sliding window to chunk the book into overlapping sequences of max_length - for i in range(0, len(token_ids) - max_length, stride): - input_chunk = token_ids[i:i + max_length] - target_chunk = token_ids[i + 1: i + max_length + 1] - self.input_ids.append(torch.tensor(input_chunk)) - self.target_ids.append(torch.tensor(target_chunk)) - - def __len__(self): - return len(self.input_ids) - - def __getitem__(self, idx): - return self.input_ids[idx], self.target_ids[idx] - - -def create_dataloader_v1(txt, batch_size=4, max_length=256, - stride=128, shuffle=True, drop_last=True, num_workers=0): - # Initialize the tokenizer - tokenizer = tiktoken.get_encoding("gpt2") - - # Create dataset - dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) - - # Create dataloader - dataloader = DataLoader( - dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) - - return dataloader - - -##################################### -# Chapter 3 -##################################### -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by n_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, :num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # 
Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.reshape(b, num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### -class LayerNorm(nn.Module): - def __init__(self, emb_dim): - super().__init__() - self.eps = 1e-5 - self.scale = nn.Parameter(torch.ones(emb_dim)) - self.shift = nn.Parameter(torch.zeros(emb_dim)) - - def forward(self, x): - mean = x.mean(dim=-1, keepdim=True) - var = x.var(dim=-1, keepdim=True, unbiased=False) - norm_x = (x - mean) / torch.sqrt(var + self.eps) - return self.scale * norm_x + self.shift - - -class GELU(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) - )) - - -class FeedForward(nn.Module): - def __init__(self, cfg): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), - GELU(), - nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), - ) - - def forward(self, x): - return self.layers(x) - - -class TransformerBlock(nn.Module): - def __init__(self, cfg): - super().__init__() - self.att = MultiHeadAttention( - d_in=cfg["emb_dim"], - d_out=cfg["emb_dim"], - context_length=cfg["context_length"], - num_heads=cfg["n_heads"], - dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) - self.ff = FeedForward(cfg) - self.norm1 = LayerNorm(cfg["emb_dim"]) - self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) - - def forward(self, x): - # Shortcut connection for attention block - shortcut = x - x = self.norm1(x) - x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - # Shortcut connection for feed-forward block - shortcut = x - x = self.norm2(x) - x = self.ff(x) - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - return x - - -class GPTModel(nn.Module): - def __init__(self, cfg): - super().__init__() - self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) - self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) - self.drop_emb = nn.Dropout(cfg["drop_rate"]) - - self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) - - self.final_norm = LayerNorm(cfg["emb_dim"]) - self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) - - def forward(self, in_idx): - batch_size, seq_len = in_idx.shape - tok_embeds = self.tok_emb(in_idx) - pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) - x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] - x = self.drop_emb(x) - x = self.trf_blocks(x) - x = self.final_norm(x) - logits = self.out_head(x) - return logits - - -def generate_text_simple(model, idx, max_new_tokens, context_size): - # idx is (B, T) array of indices in the current context - for _ in range(max_new_tokens): - - # Crop current context if it exceeds the supported context size - # E.g., if LLM supports only 5 tokens, and the context size is 10 - # then only the last 5 tokens are used as context - idx_cond = idx[:, -context_size:] - - # Get the predictions - with torch.no_grad(): - logits = model(idx_cond) - - # Focus only on the last time step - # (batch, n_token, vocab_size) becomes (batch, vocab_size) - logits = logits[:, -1, :] - - # Get 
the idx of the vocab entry with the highest logits value - idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1) - - # Append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1) - - return idx - - -##################################### -# Chapter 5 -##################################### - - -def text_to_token_ids(text, tokenizer): - encoded = tokenizer.encode(text) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension - return encoded_tensor - - -def token_ids_to_text(token_ids, tokenizer): - flat = token_ids.squeeze(0) # remove batch dimension - return tokenizer.decode(flat.tolist()) - - -def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None): - - # For-loop is the same as before: Get logits, and only focus on last time step - for _ in range(max_new_tokens): - idx_cond = idx[:, -context_size:] - with torch.no_grad(): - logits = model(idx_cond) - logits = logits[:, -1, :] - - # New: Filter logits with top_k sampling - if top_k is not None: - # Keep only top_k values - top_logits, _ = torch.topk(logits, top_k) - min_val = top_logits[:, -1] - logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits) - - # New: Apply temperature scaling - if temperature > 0.0: - logits = logits / temperature - - # Apply softmax to get probabilities - probs = torch.softmax(logits, dim=-1) # (batch_size, context_len) - - # Sample from the distribution - idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1) - - # Otherwise same as before: get idx of the vocab entry with the highest logits value - else: - idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch_size, 1) - - if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified - break - - # Same as before: append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1) - - return idx diff --git a/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb b/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb index 30ddb08..6a634e3 100644 --- a/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb +++ b/ch05/02_alternative_weight_loading/weight-loading-hf-safetensors.ipynb @@ -95,7 +95,9 @@ "metadata": {}, "outputs": [], "source": [ - "from previous_chapters import GPTModel" + "from llms_from_scratch.ch04 import GPTModel\n", + "# For llms_from_scratch installation instructions, see:\n", + "# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg" ] }, { @@ -270,7 +272,8 @@ ], "source": [ "import tiktoken\n", - "from previous_chapters import generate, text_to_token_ids, token_ids_to_text\n", + "from llms_from_scratch.ch05 import generate, text_to_token_ids, token_ids_to_text\n", + "\n", "\n", "torch.manual_seed(123)\n", "\n", diff --git a/ch05/02_alternative_weight_loading/weight-loading-hf-transformers.ipynb b/ch05/02_alternative_weight_loading/weight-loading-hf-transformers.ipynb index c632b25..c03f62e 100644 --- a/ch05/02_alternative_weight_loading/weight-loading-hf-transformers.ipynb +++ b/ch05/02_alternative_weight_loading/weight-loading-hf-transformers.ipynb @@ -230,7 +230,9 @@ "outputs": [], "source": [ "import torch\n", - "from previous_chapters import GPTModel\n", + "from llms_from_scratch.ch04 import GPTModel\n", + "# For llms_from_scratch installation instructions, see:\n", + "# 
https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", "\n", "\n", "gpt = GPTModel(BASE_CONFIG)\n", @@ -258,7 +260,8 @@ ], "source": [ "import tiktoken\n", - "from previous_chapters import generate, text_to_token_ids, token_ids_to_text\n", + "from llms_from_scratch.ch05 import generate, text_to_token_ids, token_ids_to_text\n", + "\n", "\n", "torch.manual_seed(123)\n", "\n", diff --git a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py index 0bbf2b1..15dcf8f 100644 --- a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py +++ b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py @@ -17,14 +17,12 @@ from pathlib import Path import time import tiktoken import torch -from previous_chapters import ( - create_dataloader_v1, - GPTModel, - generate_and_print_sample, - calc_loss_batch, - evaluate_model, - plot_losses -) + +# For llms_from_scratch installation instructions, see: +# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg +from llms_from_scratch.ch02 import create_dataloader_v1 +from llms_from_scratch.ch04 import GPTModel, generate_and_print_sample +from llms_from_scratch.ch05 import calc_loss_batch, evaluate_model, plot_losses def read_text_file(file_path): diff --git a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py deleted file mode 100644 index 0e0d8c0..0000000 --- a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch - -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-4. -# This file can be run as a standalone script. 
- -import tiktoken -import torch -import torch.nn as nn -from torch.utils.data import Dataset, DataLoader -import matplotlib.pyplot as plt -from matplotlib.ticker import MaxNLocator - -##################################### -# Chapter 2 -##################################### - - -class GPTDatasetV1(Dataset): - def __init__(self, txt, tokenizer, max_length, stride): - self.input_ids = [] - self.target_ids = [] - - token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'}) - - for i in range(0, len(token_ids) - max_length, stride): - input_chunk = token_ids[i:i + max_length] - target_chunk = token_ids[i + 1: i + max_length + 1] - self.input_ids.append(torch.tensor(input_chunk)) - self.target_ids.append(torch.tensor(target_chunk)) - - def __len__(self): - return len(self.input_ids) - - def __getitem__(self, idx): - return self.input_ids[idx], self.target_ids[idx] - - -def create_dataloader_v1(txt, batch_size=4, max_length=256, - stride=128, shuffle=True, drop_last=True, num_workers=0): - tokenizer = tiktoken.get_encoding("gpt2") - dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) - dataloader = DataLoader( - dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) - - return dataloader - - -##################################### -# Chapter 3 -##################################### - -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by n_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, :num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.reshape(b, 
num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### - -class LayerNorm(nn.Module): - def __init__(self, emb_dim): - super().__init__() - self.eps = 1e-5 - self.scale = nn.Parameter(torch.ones(emb_dim)) - self.shift = nn.Parameter(torch.zeros(emb_dim)) - - def forward(self, x): - mean = x.mean(dim=-1, keepdim=True) - var = x.var(dim=-1, keepdim=True, unbiased=False) - norm_x = (x - mean) / torch.sqrt(var + self.eps) - return self.scale * norm_x + self.shift - - -class GELU(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) - )) - - -class FeedForward(nn.Module): - def __init__(self, cfg): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), - GELU(), - nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), - ) - - def forward(self, x): - return self.layers(x) - - -class TransformerBlock(nn.Module): - def __init__(self, cfg): - super().__init__() - self.att = MultiHeadAttention( - d_in=cfg["emb_dim"], - d_out=cfg["emb_dim"], - context_length=cfg["context_length"], - num_heads=cfg["n_heads"], - dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) - self.ff = FeedForward(cfg) - self.norm1 = LayerNorm(cfg["emb_dim"]) - self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) - - def forward(self, x): - # Shortcut connection for attention block - shortcut = x - x = self.norm1(x) - x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - # Shortcut connection for feed-forward block - shortcut = x - x = self.norm2(x) - x = self.ff(x) - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - return x - - -class GPTModel(nn.Module): - def __init__(self, cfg): - super().__init__() - self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) - self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) - self.drop_emb = nn.Dropout(cfg["drop_rate"]) - - self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) - - self.final_norm = LayerNorm(cfg["emb_dim"]) - self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) - - def forward(self, in_idx): - batch_size, seq_len = in_idx.shape - tok_embeds = self.tok_emb(in_idx) - pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) - x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] - x = self.drop_emb(x) - x = self.trf_blocks(x) - x = self.final_norm(x) - logits = self.out_head(x) - return logits - - -def generate_text_simple(model, idx, max_new_tokens, context_size): - # idx is (B, T) array of indices in the current context - for _ in range(max_new_tokens): - - # Crop current context if it exceeds the supported context size - # E.g., if LLM supports only 5 tokens, and the context size is 10 - # then only the last 5 tokens are used as context - idx_cond = idx[:, -context_size:] - - # Get the predictions - with torch.no_grad(): - logits = model(idx_cond) - - # Focus only on the last time step - # (batch, n_token, vocab_size) becomes (batch, vocab_size) - logits = logits[:, -1, :] - - # Get the idx of the vocab entry with the highest logits value - idx_next = torch.argmax(logits, dim=-1, 
keepdim=True) # (batch, 1) - - # Append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1) - - return idx - - -##################################### -# Chapter 5 -#################################### - - -def calc_loss_batch(input_batch, target_batch, model, device): - input_batch, target_batch = input_batch.to(device), target_batch.to(device) - logits = model(input_batch) - loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten()) - return loss - - -def calc_loss_loader(data_loader, model, device, num_batches=None): - total_loss = 0. - if len(data_loader) == 0: - return float("nan") - elif num_batches is None: - num_batches = len(data_loader) - else: - num_batches = min(num_batches, len(data_loader)) - for i, (input_batch, target_batch) in enumerate(data_loader): - if i < num_batches: - loss = calc_loss_batch(input_batch, target_batch, model, device) - total_loss += loss.item() - else: - break - return total_loss / num_batches - - -def evaluate_model(model, train_loader, val_loader, device, eval_iter): - model.eval() - with torch.no_grad(): - train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter) - val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter) - model.train() - return train_loss, val_loss - - -def generate_and_print_sample(model, tokenizer, device, start_context): - model.eval() - context_size = model.pos_emb.weight.shape[0] - encoded = text_to_token_ids(start_context, tokenizer).to(device) - with torch.no_grad(): - token_ids = generate_text_simple( - model=model, idx=encoded, - max_new_tokens=50, context_size=context_size) - decoded_text = token_ids_to_text(token_ids, tokenizer) - print(decoded_text.replace("\n", " ")) # Compact print format - model.train() - - -def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir): - fig, ax1 = plt.subplots() - - # Plot training and validation loss against epochs - ax1.plot(epochs_seen, train_losses, label="Training loss") - ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss") - ax1.set_xlabel("Epochs") - ax1.set_ylabel("Loss") - ax1.legend(loc="upper right") - ax1.xaxis.set_major_locator(MaxNLocator(integer=True)) - - # Create a second x-axis for tokens seen - ax2 = ax1.twiny() # Create a second x-axis that shares the same y-axis - ax2.plot(tokens_seen, train_losses, alpha=0) # Invisible plot for aligning ticks - ax2.set_xlabel("Tokens seen") - - fig.tight_layout() # Adjust layout to make room - plt.savefig(output_dir / "losses.pdf") - - -def text_to_token_ids(text, tokenizer): - encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'}) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) # Add batch dimension - return encoded_tensor - - -def token_ids_to_text(token_ids, tokenizer): - flat = token_ids.squeeze(0) # Remove batch dimension - return tokenizer.decode(flat.tolist()) diff --git a/ch05/05_bonus_hparam_tuning/hparam_search.py b/ch05/05_bonus_hparam_tuning/hparam_search.py index 3e0604e..4f2a2ce 100644 --- a/ch05/05_bonus_hparam_tuning/hparam_search.py +++ b/ch05/05_bonus_hparam_tuning/hparam_search.py @@ -8,7 +8,11 @@ import math import os import tiktoken import torch -from previous_chapters import GPTModel, create_dataloader_v1 + +# For llms_from_scratch installation instructions, see: +# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg +from llms_from_scratch.ch02 import create_dataloader_v1 +from llms_from_scratch.ch04 import GPTModel # 
Define a grid of hyperparameters to search over diff --git a/ch05/05_bonus_hparam_tuning/previous_chapters.py b/ch05/05_bonus_hparam_tuning/previous_chapters.py deleted file mode 100644 index b5c92fa..0000000 --- a/ch05/05_bonus_hparam_tuning/previous_chapters.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch - -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-4. -# This file can be run as a standalone script. - -import tiktoken -import torch -import torch.nn as nn -from torch.utils.data import Dataset, DataLoader - -##################################### -# Chapter 2 -##################################### - - -class GPTDatasetV1(Dataset): - def __init__(self, txt, tokenizer, max_length, stride): - self.input_ids = [] - self.target_ids = [] - - # Tokenize the entire text - token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"}) - - # Use a sliding window to chunk the book into overlapping sequences of max_length - for i in range(0, len(token_ids) - max_length, stride): - input_chunk = token_ids[i:i + max_length] - target_chunk = token_ids[i + 1: i + max_length + 1] - self.input_ids.append(torch.tensor(input_chunk)) - self.target_ids.append(torch.tensor(target_chunk)) - - def __len__(self): - return len(self.input_ids) - - def __getitem__(self, idx): - return self.input_ids[idx], self.target_ids[idx] - - -def create_dataloader_v1(txt, batch_size=4, max_length=256, - stride=128, shuffle=True, drop_last=True, num_workers=0): - # Initialize the tokenizer - tokenizer = tiktoken.get_encoding("gpt2") - - # Create dataset - dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) - - # Create dataloader - dataloader = DataLoader( - dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) - - return dataloader - - -##################################### -# Chapter 3 -##################################### -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by num_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = 
keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, :num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### -class LayerNorm(nn.Module): - def __init__(self, emb_dim): - super().__init__() - self.eps = 1e-5 - self.scale = nn.Parameter(torch.ones(emb_dim)) - self.shift = nn.Parameter(torch.zeros(emb_dim)) - - def forward(self, x): - mean = x.mean(dim=-1, keepdim=True) - var = x.var(dim=-1, keepdim=True, unbiased=False) - norm_x = (x - mean) / torch.sqrt(var + self.eps) - return self.scale * norm_x + self.shift - - -class GELU(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) - )) - - -class FeedForward(nn.Module): - def __init__(self, cfg): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), - GELU(), - nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), - ) - - def forward(self, x): - return self.layers(x) - - -class TransformerBlock(nn.Module): - def __init__(self, cfg): - super().__init__() - self.att = MultiHeadAttention( - d_in=cfg["emb_dim"], - d_out=cfg["emb_dim"], - context_length=cfg["context_length"], - num_heads=cfg["n_heads"], - dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) - self.ff = FeedForward(cfg) - self.norm1 = LayerNorm(cfg["emb_dim"]) - self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) - - def forward(self, x): - # Shortcut connection for attention block - shortcut = x - x = self.norm1(x) - x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - # Shortcut connection for feed-forward block - shortcut = x - x = self.norm2(x) - x = self.ff(x) - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - return x - - -class GPTModel(nn.Module): - def __init__(self, cfg): - super().__init__() - self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) - self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) - self.drop_emb = nn.Dropout(cfg["drop_rate"]) - - self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) - - self.final_norm = LayerNorm(cfg["emb_dim"]) - self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) - - def forward(self, in_idx): - batch_size, seq_len = in_idx.shape - tok_embeds = self.tok_emb(in_idx) - pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) - x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] - 
x = self.drop_emb(x) - x = self.trf_blocks(x) - x = self.final_norm(x) - logits = self.out_head(x) - return logits - - -def generate_text_simple(model, idx, max_new_tokens, context_size): - # idx is (B, T) array of indices in the current context - for _ in range(max_new_tokens): - - # Crop current context if it exceeds the supported context size - # E.g., if LLM supports only 5 tokens, and the context size is 10 - # then only the last 5 tokens are used as context - idx_cond = idx[:, -context_size:] - - # Get the predictions - with torch.no_grad(): - logits = model(idx_cond) - - # Focus only on the last time step - # (batch, n_token, vocab_size) becomes (batch, vocab_size) - logits = logits[:, -1, :] - - # Get the idx of the vocab entry with the highest logits value - idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1) - - # Append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1) - - return idx - - -if __name__ == "__main__": - - GPT_CONFIG_124M = { - "vocab_size": 50257, # Vocabulary size - "context_length": 1024, # Context length - "emb_dim": 768, # Embedding dimension - "n_heads": 12, # Number of attention heads - "n_layers": 12, # Number of layers - "drop_rate": 0.1, # Dropout rate - "qkv_bias": False # Query-Key-Value bias - } - - torch.manual_seed(123) - model = GPTModel(GPT_CONFIG_124M) - model.eval() # disable dropout - - start_context = "Hello, I am" - - tokenizer = tiktoken.get_encoding("gpt2") - encoded = tokenizer.encode(start_context) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) - - print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}") - print("\nInput text:", start_context) - print("Encoded input text:", encoded) - print("encoded_tensor.shape:", encoded_tensor.shape) - - out = generate_text_simple( - model=model, - idx=encoded_tensor, - max_new_tokens=10, - context_size=GPT_CONFIG_124M["context_length"] - ) - decoded_text = tokenizer.decode(out.squeeze(0).tolist()) - - print(f"\n\n{50*'='}\n{22*' '}OUT\n{50*'='}") - print("\nOutput:", out) - print("Output length:", len(out[0])) - print("Output text:", decoded_text) diff --git a/ch05/06_user_interface/app_orig.py b/ch05/06_user_interface/app_orig.py index ef3e027..c36ab2b 100644 --- a/ch05/06_user_interface/app_orig.py +++ b/ch05/06_user_interface/app_orig.py @@ -7,10 +7,12 @@ import tiktoken import torch import chainlit -from previous_chapters import ( +# For llms_from_scratch installation instructions, see: +# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch05 import ( download_and_load_gpt2, generate, - GPTModel, load_weights_into_gpt, text_to_token_ids, token_ids_to_text, diff --git a/ch05/06_user_interface/app_own.py b/ch05/06_user_interface/app_own.py index ef3a819..a341b53 100644 --- a/ch05/06_user_interface/app_own.py +++ b/ch05/06_user_interface/app_own.py @@ -10,13 +10,16 @@ import tiktoken import torch import chainlit -from previous_chapters import ( +# For llms_from_scratch installation instructions, see: +# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch05 import ( generate, - GPTModel, text_to_token_ids, token_ids_to_text, ) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/ch05/06_user_interface/previous_chapters.py b/ch05/06_user_interface/previous_chapters.py deleted file mode 100644 index b1e96c1..0000000 --- 
a/ch05/06_user_interface/previous_chapters.py +++ /dev/null @@ -1,384 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch -# -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-5. - -import json -import os -import urllib - -import numpy as np -import tensorflow as tf -import torch -import torch.nn as nn -from tqdm import tqdm - - -##################################### -# Chapter 3 -##################################### -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by n_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, :num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.reshape(b, num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### -class LayerNorm(nn.Module): - def __init__(self, emb_dim): - super().__init__() - self.eps = 1e-5 - self.scale = nn.Parameter(torch.ones(emb_dim)) - self.shift = nn.Parameter(torch.zeros(emb_dim)) - - def forward(self, x): - mean = x.mean(dim=-1, keepdim=True) - var = x.var(dim=-1, keepdim=True, unbiased=False) - norm_x = (x - mean) / torch.sqrt(var + self.eps) - return self.scale * norm_x + self.shift - - -class GELU(nn.Module): - def __init__(self): - 
super().__init__() - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) - )) - - -class FeedForward(nn.Module): - def __init__(self, cfg): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), - GELU(), - nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), - ) - - def forward(self, x): - return self.layers(x) - - -class TransformerBlock(nn.Module): - def __init__(self, cfg): - super().__init__() - self.att = MultiHeadAttention( - d_in=cfg["emb_dim"], - d_out=cfg["emb_dim"], - context_length=cfg["context_length"], - num_heads=cfg["n_heads"], - dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) - self.ff = FeedForward(cfg) - self.norm1 = LayerNorm(cfg["emb_dim"]) - self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) - - def forward(self, x): - # Shortcut connection for attention block - shortcut = x - x = self.norm1(x) - x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - # Shortcut connection for feed-forward block - shortcut = x - x = self.norm2(x) - x = self.ff(x) - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - return x - - -class GPTModel(nn.Module): - def __init__(self, cfg): - super().__init__() - self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) - self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) - self.drop_emb = nn.Dropout(cfg["drop_rate"]) - - self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) - - self.final_norm = LayerNorm(cfg["emb_dim"]) - self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) - - def forward(self, in_idx): - batch_size, seq_len = in_idx.shape - tok_embeds = self.tok_emb(in_idx) - pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) - x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] - x = self.drop_emb(x) - x = self.trf_blocks(x) - x = self.final_norm(x) - logits = self.out_head(x) - return logits - - -##################################### -# Chapter 5 -##################################### -def text_to_token_ids(text, tokenizer): - encoded = tokenizer.encode(text) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension - return encoded_tensor - - -def token_ids_to_text(token_ids, tokenizer): - flat = token_ids.squeeze(0) # remove batch dimension - return tokenizer.decode(flat.tolist()) - - -def download_and_load_gpt2(model_size, models_dir): - # Validate model size - allowed_sizes = ("124M", "355M", "774M", "1558M") - if model_size not in allowed_sizes: - raise ValueError(f"Model size not in {allowed_sizes}") - - # Define paths - model_dir = os.path.join(models_dir, model_size) - base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models" - filenames = [ - "checkpoint", "encoder.json", "hparams.json", - "model.ckpt.data-00000-of-00001", "model.ckpt.index", - "model.ckpt.meta", "vocab.bpe" - ] - - # Download files - os.makedirs(model_dir, exist_ok=True) - for filename in filenames: - file_url = os.path.join(base_url, model_size, filename) - file_path = os.path.join(model_dir, filename) - download_file(file_url, file_path) - - # Load settings and params - tf_ckpt_path = tf.train.latest_checkpoint(model_dir) - settings = json.load(open(os.path.join(model_dir, "hparams.json"))) - params = 
load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings) - - return settings, params - - -def download_file(url, destination): - # Send a GET request to download the file - with urllib.request.urlopen(url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) - - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return - - # Define the block size for reading the file - block_size = 1024 # 1 Kilobyte - - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(url) # Extract filename from URL - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - # Open the destination file in binary write mode - with open(destination, "wb") as file: - # Read the file in chunks and write to destination - while True: - chunk = response.read(block_size) - if not chunk: - break - file.write(chunk) - progress_bar.update(len(chunk)) # Update progress bar - - -def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): - # Initialize parameters dictionary with empty blocks for each layer - params = {"blocks": [{} for _ in range(settings["n_layer"])]} - - # Iterate over each variable in the checkpoint - for name, _ in tf.train.list_variables(ckpt_path): - # Load the variable and remove singleton dimensions - variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name)) - - # Process the variable name to extract relevant parts - variable_name_parts = name.split("/")[1:] # Skip the 'model/' prefix - - # Identify the target dictionary for the variable - target_dict = params - if variable_name_parts[0].startswith("h"): - layer_number = int(variable_name_parts[0][1:]) - target_dict = params["blocks"][layer_number] - - # Recursively access or create nested dictionaries - for key in variable_name_parts[1:-1]: - target_dict = target_dict.setdefault(key, {}) - - # Assign the variable array to the last key - last_key = variable_name_parts[-1] - target_dict[last_key] = variable_array - - return params - - -def assign(left, right): - if left.shape != right.shape: - raise ValueError(f"Shape mismatch. 
Left: {left.shape}, Right: {right.shape}") - return torch.nn.Parameter(torch.tensor(right)) - - -def load_weights_into_gpt(gpt, params): - gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe']) - gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte']) - - for b in range(len(params["blocks"])): - q_w, k_w, v_w = np.split( - (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1) - gpt.trf_blocks[b].att.W_query.weight = assign( - gpt.trf_blocks[b].att.W_query.weight, q_w.T) - gpt.trf_blocks[b].att.W_key.weight = assign( - gpt.trf_blocks[b].att.W_key.weight, k_w.T) - gpt.trf_blocks[b].att.W_value.weight = assign( - gpt.trf_blocks[b].att.W_value.weight, v_w.T) - - q_b, k_b, v_b = np.split( - (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1) - gpt.trf_blocks[b].att.W_query.bias = assign( - gpt.trf_blocks[b].att.W_query.bias, q_b) - gpt.trf_blocks[b].att.W_key.bias = assign( - gpt.trf_blocks[b].att.W_key.bias, k_b) - gpt.trf_blocks[b].att.W_value.bias = assign( - gpt.trf_blocks[b].att.W_value.bias, v_b) - - gpt.trf_blocks[b].att.out_proj.weight = assign( - gpt.trf_blocks[b].att.out_proj.weight, - params["blocks"][b]["attn"]["c_proj"]["w"].T) - gpt.trf_blocks[b].att.out_proj.bias = assign( - gpt.trf_blocks[b].att.out_proj.bias, - params["blocks"][b]["attn"]["c_proj"]["b"]) - - gpt.trf_blocks[b].ff.layers[0].weight = assign( - gpt.trf_blocks[b].ff.layers[0].weight, - params["blocks"][b]["mlp"]["c_fc"]["w"].T) - gpt.trf_blocks[b].ff.layers[0].bias = assign( - gpt.trf_blocks[b].ff.layers[0].bias, - params["blocks"][b]["mlp"]["c_fc"]["b"]) - gpt.trf_blocks[b].ff.layers[2].weight = assign( - gpt.trf_blocks[b].ff.layers[2].weight, - params["blocks"][b]["mlp"]["c_proj"]["w"].T) - gpt.trf_blocks[b].ff.layers[2].bias = assign( - gpt.trf_blocks[b].ff.layers[2].bias, - params["blocks"][b]["mlp"]["c_proj"]["b"]) - - gpt.trf_blocks[b].norm1.scale = assign( - gpt.trf_blocks[b].norm1.scale, - params["blocks"][b]["ln_1"]["g"]) - gpt.trf_blocks[b].norm1.shift = assign( - gpt.trf_blocks[b].norm1.shift, - params["blocks"][b]["ln_1"]["b"]) - gpt.trf_blocks[b].norm2.scale = assign( - gpt.trf_blocks[b].norm2.scale, - params["blocks"][b]["ln_2"]["g"]) - gpt.trf_blocks[b].norm2.shift = assign( - gpt.trf_blocks[b].norm2.shift, - params["blocks"][b]["ln_2"]["b"]) - - gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"]) - gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"]) - gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"]) - - -def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None): - - # For-loop is the same as before: Get logits, and only focus on last time step - for _ in range(max_new_tokens): - idx_cond = idx[:, -context_size:] - with torch.no_grad(): - logits = model(idx_cond) - logits = logits[:, -1, :] - - # New: Filter logits with top_k sampling - if top_k is not None: - # Keep only top_k values - top_logits, _ = torch.topk(logits, top_k) - min_val = top_logits[:, -1] - logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits) - - # New: Apply temperature scaling - if temperature > 0.0: - logits = logits / temperature - - # Apply softmax to get probabilities - probs = torch.softmax(logits, dim=-1) # (batch_size, context_len) - - # Sample from the distribution - idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1) - - # Otherwise same as before: get idx of the vocab entry with the highest logits value - else: - idx_next = torch.argmax(logits, dim=-1, 
keepdim=True) # (batch_size, 1) - - if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified - break - - # Same as before: append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1) - - return idx diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb index 00bcabb..520302e 100644 --- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb +++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb @@ -1264,6 +1264,12 @@ ], "source": [ "from previous_chapters import generate, text_to_token_ids, token_ids_to_text\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch05 import generate, text_to_token_ids, token_ids_to_text\n", + "\n", "\n", "\n", "torch.manual_seed(123)\n", @@ -1691,7 +1697,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.16" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb index 2f22722..991a5de 100644 --- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb +++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb @@ -1324,6 +1324,11 @@ ], "source": [ "from previous_chapters import generate, text_to_token_ids, token_ids_to_text\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch05 import generate, text_to_token_ids, token_ids_to_text\n", "\n", "\n", "torch.manual_seed(123)\n", diff --git a/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb b/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb index 8ab9d4f..9903393 100644 --- a/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb +++ b/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb @@ -161,6 +161,12 @@ "outputs": [], "source": [ "from previous_chapters import GPTModel\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch04 import GPTModel\n", + "\n", "\n", "\n", "BASE_CONFIG = {\n", @@ -921,7 +927,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/ch05/09_extending-tokenizers/extend-tiktoken.ipynb b/ch05/09_extending-tokenizers/extend-tiktoken.ipynb index 83d40a4..d53a67e 100644 --- a/ch05/09_extending-tokenizers/extend-tiktoken.ipynb +++ b/ch05/09_extending-tokenizers/extend-tiktoken.ipynb @@ -301,8 +301,9 @@ } ], "source": [ - "# Relative import from the gpt_download.py contained in this folder\n", - "from gpt_download import download_and_load_gpt2\n", + "from llms_from_scratch.ch05 import download_and_load_gpt2\n", + "# For llms_from_scratch installation instructions, see:\n", + "# 
https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", "\n", "settings, params = download_and_load_gpt2(model_size=\"124M\", models_dir=\"gpt2\")" ] @@ -314,8 +315,9 @@ "metadata": {}, "outputs": [], "source": [ - "# Relative import from the gpt_download.py contained in this folder\n", - "from previous_chapters import GPTModel\n", + "from llms_from_scratch.ch04 import GPTModel\n", + "# For llms_from_scratch installation instructions, see:\n", + "# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", "\n", "GPT_CONFIG_124M = {\n", " \"vocab_size\": 50257, # Vocabulary size\n", @@ -763,7 +765,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/ch05/09_extending-tokenizers/gpt_download.py b/ch05/09_extending-tokenizers/gpt_download.py deleted file mode 100644 index 6e27a4f..0000000 --- a/ch05/09_extending-tokenizers/gpt_download.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch - - -import os -import urllib.request - -# import requests -import json -import numpy as np -import tensorflow as tf -from tqdm import tqdm - - -def download_and_load_gpt2(model_size, models_dir): - # Validate model size - allowed_sizes = ("124M", "355M", "774M", "1558M") - if model_size not in allowed_sizes: - raise ValueError(f"Model size not in {allowed_sizes}") - - # Define paths - model_dir = os.path.join(models_dir, model_size) - base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models" - backup_base_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2" - filenames = [ - "checkpoint", "encoder.json", "hparams.json", - "model.ckpt.data-00000-of-00001", "model.ckpt.index", - "model.ckpt.meta", "vocab.bpe" - ] - - # Download files - os.makedirs(model_dir, exist_ok=True) - for filename in filenames: - file_url = os.path.join(base_url, model_size, filename) - backup_url = os.path.join(backup_base_url, model_size, filename) - file_path = os.path.join(model_dir, filename) - download_file(file_url, file_path, backup_url) - - # Load settings and params - tf_ckpt_path = tf.train.latest_checkpoint(model_dir) - settings = json.load(open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8")) - params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings) - - return settings, params - - -def download_file(url, destination, backup_url=None): - def _attempt_download(download_url): - with urllib.request.urlopen(download_url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) - - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return True # Indicate success without re-downloading - - block_size = 1024 # 1 Kilobyte - - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(download_url) - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - with open(destination, "wb") as file: - while True: - chunk = response.read(block_size) - if not chunk: - break - 
file.write(chunk) - progress_bar.update(len(chunk)) - return True - - try: - if _attempt_download(url): - return - except (urllib.error.HTTPError, urllib.error.URLError): - if backup_url is not None: - print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}") - try: - if _attempt_download(backup_url): - return - except urllib.error.HTTPError: - pass - - # If we reach here, both attempts have failed - error_message = ( - f"Failed to download from both primary URL ({url})" - f"{' and backup URL (' + backup_url + ')' if backup_url else ''}." - "\nCheck your internet connection or the file availability.\n" - "For help, visit: https://github.com/rasbt/LLMs-from-scratch/discussions/273" - ) - print(error_message) - except Exception as e: - print(f"An unexpected error occurred: {e}") - - -# Alternative way using `requests` -""" -def download_file(url, destination): - # Send a GET request to download the file in streaming mode - response = requests.get(url, stream=True) - - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("content-length", 0)) - - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return - - # Define the block size for reading the file - block_size = 1024 # 1 Kilobyte - - # Initialize the progress bar with total file size - progress_bar_description = url.split("/")[-1] # Extract filename from URL - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - # Open the destination file in binary write mode - with open(destination, "wb") as file: - # Iterate over the file data in chunks - for chunk in response.iter_content(block_size): - progress_bar.update(len(chunk)) # Update progress bar - file.write(chunk) # Write the chunk to the file -""" - - -def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): - # Initialize parameters dictionary with empty blocks for each layer - params = {"blocks": [{} for _ in range(settings["n_layer"])]} - - # Iterate over each variable in the checkpoint - for name, _ in tf.train.list_variables(ckpt_path): - # Load the variable and remove singleton dimensions - variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name)) - - # Process the variable name to extract relevant parts - variable_name_parts = name.split("/")[1:] # Skip the 'model/' prefix - - # Identify the target dictionary for the variable - target_dict = params - if variable_name_parts[0].startswith("h"): - layer_number = int(variable_name_parts[0][1:]) - target_dict = params["blocks"][layer_number] - - # Recursively access or create nested dictionaries - for key in variable_name_parts[1:-1]: - target_dict = target_dict.setdefault(key, {}) - - # Assign the variable array to the last key - last_key = variable_name_parts[-1] - target_dict[last_key] = variable_array - - return params diff --git a/ch05/09_extending-tokenizers/previous_chapters.py b/ch05/09_extending-tokenizers/previous_chapters.py deleted file mode 100644 index 369e370..0000000 --- a/ch05/09_extending-tokenizers/previous_chapters.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
-# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch -# -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-4. -# This file can be run as a standalone script. - -import tiktoken -import torch -import torch.nn as nn -from torch.utils.data import Dataset, DataLoader - -##################################### -# Chapter 2 -##################################### - - -class GPTDatasetV1(Dataset): - def __init__(self, txt, tokenizer, max_length, stride): - self.input_ids = [] - self.target_ids = [] - - # Tokenize the entire text - token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"}) - - # Use a sliding window to chunk the book into overlapping sequences of max_length - for i in range(0, len(token_ids) - max_length, stride): - input_chunk = token_ids[i:i + max_length] - target_chunk = token_ids[i + 1: i + max_length + 1] - self.input_ids.append(torch.tensor(input_chunk)) - self.target_ids.append(torch.tensor(target_chunk)) - - def __len__(self): - return len(self.input_ids) - - def __getitem__(self, idx): - return self.input_ids[idx], self.target_ids[idx] - - -def create_dataloader_v1(txt, batch_size=4, max_length=256, - stride=128, shuffle=True, drop_last=True, num_workers=0): - # Initialize the tokenizer - tokenizer = tiktoken.get_encoding("gpt2") - - # Create dataset - dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) - - # Create dataloader - dataloader = DataLoader( - dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) - - return dataloader - - -##################################### -# Chapter 3 -##################################### -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by n_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, 
:num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.reshape(b, num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### -class LayerNorm(nn.Module): - def __init__(self, emb_dim): - super().__init__() - self.eps = 1e-5 - self.scale = nn.Parameter(torch.ones(emb_dim)) - self.shift = nn.Parameter(torch.zeros(emb_dim)) - - def forward(self, x): - mean = x.mean(dim=-1, keepdim=True) - var = x.var(dim=-1, keepdim=True, unbiased=False) - norm_x = (x - mean) / torch.sqrt(var + self.eps) - return self.scale * norm_x + self.shift - - -class GELU(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) - )) - - -class FeedForward(nn.Module): - def __init__(self, cfg): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), - GELU(), - nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), - ) - - def forward(self, x): - return self.layers(x) - - -class TransformerBlock(nn.Module): - def __init__(self, cfg): - super().__init__() - self.att = MultiHeadAttention( - d_in=cfg["emb_dim"], - d_out=cfg["emb_dim"], - context_length=cfg["context_length"], - num_heads=cfg["n_heads"], - dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) - self.ff = FeedForward(cfg) - self.norm1 = LayerNorm(cfg["emb_dim"]) - self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) - - def forward(self, x): - # Shortcut connection for attention block - shortcut = x - x = self.norm1(x) - x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - # Shortcut connection for feed-forward block - shortcut = x - x = self.norm2(x) - x = self.ff(x) - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - return x - - -class GPTModel(nn.Module): - def __init__(self, cfg): - super().__init__() - self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) - self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) - self.drop_emb = nn.Dropout(cfg["drop_rate"]) - - self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) - - self.final_norm = LayerNorm(cfg["emb_dim"]) - self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) - - def forward(self, in_idx): - batch_size, seq_len = in_idx.shape - tok_embeds = self.tok_emb(in_idx) - pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) - x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] - x = self.drop_emb(x) - x = self.trf_blocks(x) - x = self.final_norm(x) - logits = self.out_head(x) - return logits - - -def generate_text_simple(model, idx, max_new_tokens, context_size): - # idx is (B, T) array of indices in the current context - for _ in range(max_new_tokens): - - # Crop current context if it exceeds the supported context size - # E.g., if LLM supports only 5 
tokens, and the context size is 10 - # then only the last 5 tokens are used as context - idx_cond = idx[:, -context_size:] - - # Get the predictions - with torch.no_grad(): - logits = model(idx_cond) - - # Focus only on the last time step - # (batch, n_token, vocab_size) becomes (batch, vocab_size) - logits = logits[:, -1, :] - - # Get the idx of the vocab entry with the highest logits value - idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch, 1) - - # Append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1) - - return idx - - -if __name__ == "__main__": - - GPT_CONFIG_124M = { - "vocab_size": 50257, # Vocabulary size - "context_length": 1024, # Context length - "emb_dim": 768, # Embedding dimension - "n_heads": 12, # Number of attention heads - "n_layers": 12, # Number of layers - "drop_rate": 0.1, # Dropout rate - "qkv_bias": False # Query-Key-Value bias - } - - torch.manual_seed(123) - model = GPTModel(GPT_CONFIG_124M) - model.eval() # disable dropout - - start_context = "Hello, I am" - - tokenizer = tiktoken.get_encoding("gpt2") - encoded = tokenizer.encode(start_context) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) - - print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}") - print("\nInput text:", start_context) - print("Encoded input text:", encoded) - print("encoded_tensor.shape:", encoded_tensor.shape) - - out = generate_text_simple( - model=model, - idx=encoded_tensor, - max_new_tokens=10, - context_size=GPT_CONFIG_124M["context_length"] - ) - decoded_text = tokenizer.decode(out.squeeze(0).tolist()) - - print(f"\n\n{50*'='}\n{22*' '}OUT\n{50*'='}") - print("\nOutput:", out) - print("Output length:", len(out[0])) - print("Output text:", decoded_text) diff --git a/ch06/01_main-chapter-code/ch06.ipynb b/ch06/01_main-chapter-code/ch06.ipynb index 669e75d..d0dabbd 100644 --- a/ch06/01_main-chapter-code/ch06.ipynb +++ b/ch06/01_main-chapter-code/ch06.ipynb @@ -1035,6 +1035,12 @@ "source": [ "from gpt_download import download_and_load_gpt2\n", "from previous_chapters import GPTModel, load_weights_into_gpt\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch04 import GPTModel\n", + "# from llms_from_scratch.ch05 import download_and_load_gpt2, load_weights_into_gpt\n", "\n", "model_size = CHOOSE_MODEL.split(\" \")[-1].lstrip(\"(\").rstrip(\")\")\n", "settings, params = download_and_load_gpt2(model_size=model_size, models_dir=\"gpt2\")\n", @@ -1075,6 +1081,13 @@ " token_ids_to_text\n", ")\n", "\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import (\n", + "# generate_text_simple,\n", + "# text_to_token_ids,\n", + "# token_ids_to_text\n", + "# )\n", + "\n", "\n", "text_1 = \"Every effort moves you\"\n", "\n", diff --git a/ch06/02_bonus_additional-experiments/README.md b/ch06/02_bonus_additional-experiments/README.md index 1ac17f2..20455a4 100644 --- a/ch06/02_bonus_additional-experiments/README.md +++ b/ch06/02_bonus_additional-experiments/README.md @@ -55,7 +55,7 @@ You can use the following code to reproduce the experiments: - Row 16: `python additional_experiments.py --trainable_token_pos "flexible"` - Row 17: `python additional_experiments.py --disable_causal_mask` - Row 18: `python additional_experiments.py --ignore_index 50256` -- Row 19: `python additional_experiments.py --average embeddings` 
+- Row 19: `python additional_experiments.py --average_embeddings` I've kept the LLM and dataset small on purpose, so you can run the training on a regular laptop like a MacBook Air M3 in about 15 minutes (for the default setting) in case you don't have access to a GPU. diff --git a/ch06/02_bonus_additional-experiments/additional_experiments.py b/ch06/02_bonus_additional-experiments/additional_experiments.py index a85b932..974f21e 100644 --- a/ch06/02_bonus_additional-experiments/additional_experiments.py +++ b/ch06/02_bonus_additional-experiments/additional_experiments.py @@ -21,6 +21,14 @@ from gpt_download import download_and_load_gpt2 from previous_chapters import GPTModel, load_weights_into_gpt +# If the `previous_chapters.py` file is not available locally, +# you can import it from the `llms-from-scratch` PyPI package. +# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg +# E.g., +# from llms_from_scratch.ch04 import GPTModel +# from llms_from_scratch.ch05 import download_and_load_gpt2, load_weights_into_gpt + + class LoRALayer(torch.nn.Module): def __init__(self, in_dim, out_dim, rank, alpha): super().__init__() diff --git a/ch06/04_user_interface/app.py b/ch06/04_user_interface/app.py index 84ad330..86b5f52 100644 --- a/ch06/04_user_interface/app.py +++ b/ch06/04_user_interface/app.py @@ -10,10 +10,11 @@ import tiktoken import torch import chainlit -from previous_chapters import ( - classify_review, - GPTModel -) +# For llms_from_scratch installation instructions, see: +# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch06 import classify_review + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/ch06/04_user_interface/previous_chapters.py b/ch06/04_user_interface/previous_chapters.py deleted file mode 100644 index 4dc4d5b..0000000 --- a/ch06/04_user_interface/previous_chapters.py +++ /dev/null @@ -1,371 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch -# -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-5. 
- -import json -import os -import urllib - -import numpy as np -import tensorflow as tf -import torch -import torch.nn as nn -from tqdm import tqdm - - -##################################### -# Chapter 3 -##################################### -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by n_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, :num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.reshape(b, num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### -class LayerNorm(nn.Module): - def __init__(self, emb_dim): - super().__init__() - self.eps = 1e-5 - self.scale = nn.Parameter(torch.ones(emb_dim)) - self.shift = nn.Parameter(torch.zeros(emb_dim)) - - def forward(self, x): - mean = x.mean(dim=-1, keepdim=True) - var = x.var(dim=-1, keepdim=True, unbiased=False) - norm_x = (x - mean) / torch.sqrt(var + self.eps) - return self.scale * norm_x + self.shift - - -class GELU(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) - )) - - -class FeedForward(nn.Module): - def __init__(self, cfg): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), - GELU(), - nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), - ) - - def forward(self, x): - return self.layers(x) - - -class 
TransformerBlock(nn.Module): - def __init__(self, cfg): - super().__init__() - self.att = MultiHeadAttention( - d_in=cfg["emb_dim"], - d_out=cfg["emb_dim"], - context_length=cfg["context_length"], - num_heads=cfg["n_heads"], - dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) - self.ff = FeedForward(cfg) - self.norm1 = LayerNorm(cfg["emb_dim"]) - self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) - - def forward(self, x): - # Shortcut connection for attention block - shortcut = x - x = self.norm1(x) - x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - # Shortcut connection for feed-forward block - shortcut = x - x = self.norm2(x) - x = self.ff(x) - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - return x - - -class GPTModel(nn.Module): - def __init__(self, cfg): - super().__init__() - self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) - self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) - self.drop_emb = nn.Dropout(cfg["drop_rate"]) - - self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) - - self.final_norm = LayerNorm(cfg["emb_dim"]) - self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) - - def forward(self, in_idx): - batch_size, seq_len = in_idx.shape - tok_embeds = self.tok_emb(in_idx) - pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) - x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] - x = self.drop_emb(x) - x = self.trf_blocks(x) - x = self.final_norm(x) - logits = self.out_head(x) - return logits - - -##################################### -# Chapter 5 -##################################### -def text_to_token_ids(text, tokenizer): - encoded = tokenizer.encode(text) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension - return encoded_tensor - - -def token_ids_to_text(token_ids, tokenizer): - flat = token_ids.squeeze(0) # remove batch dimension - return tokenizer.decode(flat.tolist()) - - -def download_and_load_gpt2(model_size, models_dir): - # Validate model size - allowed_sizes = ("124M", "355M", "774M", "1558M") - if model_size not in allowed_sizes: - raise ValueError(f"Model size not in {allowed_sizes}") - - # Define paths - model_dir = os.path.join(models_dir, model_size) - base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models" - filenames = [ - "checkpoint", "encoder.json", "hparams.json", - "model.ckpt.data-00000-of-00001", "model.ckpt.index", - "model.ckpt.meta", "vocab.bpe" - ] - - # Download files - os.makedirs(model_dir, exist_ok=True) - for filename in filenames: - file_url = os.path.join(base_url, model_size, filename) - file_path = os.path.join(model_dir, filename) - download_file(file_url, file_path) - - # Load settings and params - tf_ckpt_path = tf.train.latest_checkpoint(model_dir) - settings = json.load(open(os.path.join(model_dir, "hparams.json"))) - params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings) - - return settings, params - - -def download_file(url, destination): - # Send a GET request to download the file - with urllib.request.urlopen(url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) - - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - 
if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return - - # Define the block size for reading the file - block_size = 1024 # 1 Kilobyte - - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(url) # Extract filename from URL - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - # Open the destination file in binary write mode - with open(destination, "wb") as file: - # Read the file in chunks and write to destination - while True: - chunk = response.read(block_size) - if not chunk: - break - file.write(chunk) - progress_bar.update(len(chunk)) # Update progress bar - - -def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): - # Initialize parameters dictionary with empty blocks for each layer - params = {"blocks": [{} for _ in range(settings["n_layer"])]} - - # Iterate over each variable in the checkpoint - for name, _ in tf.train.list_variables(ckpt_path): - # Load the variable and remove singleton dimensions - variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name)) - - # Process the variable name to extract relevant parts - variable_name_parts = name.split("/")[1:] # Skip the 'model/' prefix - - # Identify the target dictionary for the variable - target_dict = params - if variable_name_parts[0].startswith("h"): - layer_number = int(variable_name_parts[0][1:]) - target_dict = params["blocks"][layer_number] - - # Recursively access or create nested dictionaries - for key in variable_name_parts[1:-1]: - target_dict = target_dict.setdefault(key, {}) - - # Assign the variable array to the last key - last_key = variable_name_parts[-1] - target_dict[last_key] = variable_array - - return params - - -def assign(left, right): - if left.shape != right.shape: - raise ValueError(f"Shape mismatch. 
Left: {left.shape}, Right: {right.shape}") - return torch.nn.Parameter(torch.tensor(right)) - - -def load_weights_into_gpt(gpt, params): - gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe']) - gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte']) - - for b in range(len(params["blocks"])): - q_w, k_w, v_w = np.split( - (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1) - gpt.trf_blocks[b].att.W_query.weight = assign( - gpt.trf_blocks[b].att.W_query.weight, q_w.T) - gpt.trf_blocks[b].att.W_key.weight = assign( - gpt.trf_blocks[b].att.W_key.weight, k_w.T) - gpt.trf_blocks[b].att.W_value.weight = assign( - gpt.trf_blocks[b].att.W_value.weight, v_w.T) - - q_b, k_b, v_b = np.split( - (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1) - gpt.trf_blocks[b].att.W_query.bias = assign( - gpt.trf_blocks[b].att.W_query.bias, q_b) - gpt.trf_blocks[b].att.W_key.bias = assign( - gpt.trf_blocks[b].att.W_key.bias, k_b) - gpt.trf_blocks[b].att.W_value.bias = assign( - gpt.trf_blocks[b].att.W_value.bias, v_b) - - gpt.trf_blocks[b].att.out_proj.weight = assign( - gpt.trf_blocks[b].att.out_proj.weight, - params["blocks"][b]["attn"]["c_proj"]["w"].T) - gpt.trf_blocks[b].att.out_proj.bias = assign( - gpt.trf_blocks[b].att.out_proj.bias, - params["blocks"][b]["attn"]["c_proj"]["b"]) - - gpt.trf_blocks[b].ff.layers[0].weight = assign( - gpt.trf_blocks[b].ff.layers[0].weight, - params["blocks"][b]["mlp"]["c_fc"]["w"].T) - gpt.trf_blocks[b].ff.layers[0].bias = assign( - gpt.trf_blocks[b].ff.layers[0].bias, - params["blocks"][b]["mlp"]["c_fc"]["b"]) - gpt.trf_blocks[b].ff.layers[2].weight = assign( - gpt.trf_blocks[b].ff.layers[2].weight, - params["blocks"][b]["mlp"]["c_proj"]["w"].T) - gpt.trf_blocks[b].ff.layers[2].bias = assign( - gpt.trf_blocks[b].ff.layers[2].bias, - params["blocks"][b]["mlp"]["c_proj"]["b"]) - - gpt.trf_blocks[b].norm1.scale = assign( - gpt.trf_blocks[b].norm1.scale, - params["blocks"][b]["ln_1"]["g"]) - gpt.trf_blocks[b].norm1.shift = assign( - gpt.trf_blocks[b].norm1.shift, - params["blocks"][b]["ln_1"]["b"]) - gpt.trf_blocks[b].norm2.scale = assign( - gpt.trf_blocks[b].norm2.scale, - params["blocks"][b]["ln_2"]["g"]) - gpt.trf_blocks[b].norm2.shift = assign( - gpt.trf_blocks[b].norm2.shift, - params["blocks"][b]["ln_2"]["b"]) - - gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"]) - gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"]) - gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"]) - - -##################################### -# Chapter 6 -##################################### -def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256): - model.eval() - - # Prepare inputs to the model - input_ids = tokenizer.encode(text) - supported_context_length = model.pos_emb.weight.shape[0] - - # Truncate sequences if they too long - input_ids = input_ids[:min(max_length, supported_context_length)] - - # Pad sequences to the longest sequence - input_ids += [pad_token_id] * (max_length - len(input_ids)) - input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0) # add batch dimension - - # Model inference - with torch.no_grad(): - logits = model(input_tensor.to(device))[:, -1, :] # Logits of the last output token - predicted_label = torch.argmax(logits, dim=-1).item() - - # Return the classified result - return "spam" if predicted_label == 1 else "not spam" diff --git a/ch07/01_main-chapter-code/ch07.ipynb b/ch07/01_main-chapter-code/ch07.ipynb index 105b79e..e7e083a 100644 --- 
a/ch07/01_main-chapter-code/ch07.ipynb +++ b/ch07/01_main-chapter-code/ch07.ipynb @@ -1541,6 +1541,12 @@ "source": [ "from gpt_download import download_and_load_gpt2\n", "from previous_chapters import GPTModel, load_weights_into_gpt\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch04 import GPTModel\n", + "# from llms_from_scratch.ch05 import download_and_load_gpt2, load_weights_into_gpt\n", "\n", "\n", "BASE_CONFIG = {\n", @@ -1626,6 +1632,13 @@ " text_to_token_ids,\n", " token_ids_to_text\n", ")\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import (\n", + "# generate,\n", + "# text_to_token_ids,\n", + "# token_ids_to_text\n", + "# )\n", + "\n", "\n", "token_ids = generate(\n", " model=model,\n", @@ -1727,7 +1740,12 @@ "from previous_chapters import (\n", " calc_loss_loader,\n", " train_model_simple\n", - ")" + ")\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import (\n", + "# calc_loss_loader,\n", + "# train_model_simple,\n", + "# )\n" ] }, { @@ -1939,6 +1957,8 @@ ], "source": [ "from previous_chapters import plot_losses\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import plot_losses\n", "\n", "epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))\n", "plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)" diff --git a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb index b244f03..af3ccff 100644 --- a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb +++ b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb @@ -1613,6 +1613,11 @@ "outputs": [], "source": [ "from previous_chapters import GPTModel\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch04 import GPTModel\n", "\n", "\n", "BASE_CONFIG = {\n", @@ -1715,6 +1720,12 @@ " text_to_token_ids,\n", " token_ids_to_text\n", ")\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import (\n", + "# generate,\n", + "# text_to_token_ids,\n", + "# token_ids_to_text\n", + "# )\n", "\n", "torch.manual_seed(123)\n", "\n", @@ -2382,6 +2393,16 @@ " - we also track the rewards and reward margins, which are commonly used in RLHF and DPO contexts to track the training progress\n" ] }, + { + "cell_type": "markdown", + "id": "820d4904-f819-4d62-bfb4-85cf28863683", + "metadata": { + "id": "820d4904-f819-4d62-bfb4-85cf28863683" + }, + "source": [ + "- Before we start the training, let's print the initial losses and rewards:" + ] + }, { "cell_type": "code", "execution_count": 47, @@ -2392,6 +2413,8 @@ "outputs": [], "source": [ "from previous_chapters import generate_and_print_sample\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import generate_and_print_sample\n", "\n", "\n", "def train_model_dpo_simple(\n", @@ -2471,16 +2494,6 @@ " return tracking" ] }, - { - "cell_type": "markdown", - "id": "820d4904-f819-4d62-bfb4-85cf28863683", - "metadata": { - "id": "820d4904-f819-4d62-bfb4-85cf28863683" - }, - "source": [ - "- Before we start the training, let's print the initial losses and rewards:" - ] - }, { "cell_type": "code", "execution_count": 48, @@ 
-2776,6 +2789,8 @@ ], "source": [ "from previous_chapters import plot_losses\n", + "# Alternatively:\n", + "# from llms_from_scratch.ch05 import plot_losses\n", "\n", "\n", "epochs_tensor = torch.linspace(0, num_epochs, len(tracking[\"train_losses\"]))\n", @@ -3112,7 +3127,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/ch07/06_user_interface/app.py b/ch07/06_user_interface/app.py index 52b39e5..4c035fe 100644 --- a/ch07/06_user_interface/app.py +++ b/ch07/06_user_interface/app.py @@ -10,9 +10,12 @@ import tiktoken import torch import chainlit -from previous_chapters import ( + +# For llms_from_scratch installation instructions, see: +# https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch05 import ( generate, - GPTModel, text_to_token_ids, token_ids_to_text, ) diff --git a/ch07/06_user_interface/previous_chapters.py b/ch07/06_user_interface/previous_chapters.py deleted file mode 100644 index b1e96c1..0000000 --- a/ch07/06_user_interface/previous_chapters.py +++ /dev/null @@ -1,384 +0,0 @@ -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch -# -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-5. - -import json -import os -import urllib - -import numpy as np -import tensorflow as tf -import torch -import torch.nn as nn -from tqdm import tqdm - - -##################################### -# Chapter 3 -##################################### -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by n_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, 
:num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.reshape(b, num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### -class LayerNorm(nn.Module): - def __init__(self, emb_dim): - super().__init__() - self.eps = 1e-5 - self.scale = nn.Parameter(torch.ones(emb_dim)) - self.shift = nn.Parameter(torch.zeros(emb_dim)) - - def forward(self, x): - mean = x.mean(dim=-1, keepdim=True) - var = x.var(dim=-1, keepdim=True, unbiased=False) - norm_x = (x - mean) / torch.sqrt(var + self.eps) - return self.scale * norm_x + self.shift - - -class GELU(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) - )) - - -class FeedForward(nn.Module): - def __init__(self, cfg): - super().__init__() - self.layers = nn.Sequential( - nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), - GELU(), - nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), - ) - - def forward(self, x): - return self.layers(x) - - -class TransformerBlock(nn.Module): - def __init__(self, cfg): - super().__init__() - self.att = MultiHeadAttention( - d_in=cfg["emb_dim"], - d_out=cfg["emb_dim"], - context_length=cfg["context_length"], - num_heads=cfg["n_heads"], - dropout=cfg["drop_rate"], - qkv_bias=cfg["qkv_bias"]) - self.ff = FeedForward(cfg) - self.norm1 = LayerNorm(cfg["emb_dim"]) - self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) - - def forward(self, x): - # Shortcut connection for attention block - shortcut = x - x = self.norm1(x) - x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - # Shortcut connection for feed-forward block - shortcut = x - x = self.norm2(x) - x = self.ff(x) - x = self.drop_shortcut(x) - x = x + shortcut # Add the original input back - - return x - - -class GPTModel(nn.Module): - def __init__(self, cfg): - super().__init__() - self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) - self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) - self.drop_emb = nn.Dropout(cfg["drop_rate"]) - - self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) - - self.final_norm = LayerNorm(cfg["emb_dim"]) - self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False) - - def forward(self, in_idx): - batch_size, seq_len = in_idx.shape - tok_embeds = self.tok_emb(in_idx) - pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) - x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] - x = self.drop_emb(x) - x = self.trf_blocks(x) - x = self.final_norm(x) - logits = self.out_head(x) - return logits - - -##################################### -# Chapter 5 -##################################### -def text_to_token_ids(text, tokenizer): - encoded = tokenizer.encode(text) - encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension - return 
encoded_tensor - - -def token_ids_to_text(token_ids, tokenizer): - flat = token_ids.squeeze(0) # remove batch dimension - return tokenizer.decode(flat.tolist()) - - -def download_and_load_gpt2(model_size, models_dir): - # Validate model size - allowed_sizes = ("124M", "355M", "774M", "1558M") - if model_size not in allowed_sizes: - raise ValueError(f"Model size not in {allowed_sizes}") - - # Define paths - model_dir = os.path.join(models_dir, model_size) - base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models" - filenames = [ - "checkpoint", "encoder.json", "hparams.json", - "model.ckpt.data-00000-of-00001", "model.ckpt.index", - "model.ckpt.meta", "vocab.bpe" - ] - - # Download files - os.makedirs(model_dir, exist_ok=True) - for filename in filenames: - file_url = os.path.join(base_url, model_size, filename) - file_path = os.path.join(model_dir, filename) - download_file(file_url, file_path) - - # Load settings and params - tf_ckpt_path = tf.train.latest_checkpoint(model_dir) - settings = json.load(open(os.path.join(model_dir, "hparams.json"))) - params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings) - - return settings, params - - -def download_file(url, destination): - # Send a GET request to download the file - with urllib.request.urlopen(url) as response: - # Get the total file size from headers, defaulting to 0 if not present - file_size = int(response.headers.get("Content-Length", 0)) - - # Check if file exists and has the same size - if os.path.exists(destination): - file_size_local = os.path.getsize(destination) - if file_size == file_size_local: - print(f"File already exists and is up-to-date: {destination}") - return - - # Define the block size for reading the file - block_size = 1024 # 1 Kilobyte - - # Initialize the progress bar with total file size - progress_bar_description = os.path.basename(url) # Extract filename from URL - with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar: - # Open the destination file in binary write mode - with open(destination, "wb") as file: - # Read the file in chunks and write to destination - while True: - chunk = response.read(block_size) - if not chunk: - break - file.write(chunk) - progress_bar.update(len(chunk)) # Update progress bar - - -def load_gpt2_params_from_tf_ckpt(ckpt_path, settings): - # Initialize parameters dictionary with empty blocks for each layer - params = {"blocks": [{} for _ in range(settings["n_layer"])]} - - # Iterate over each variable in the checkpoint - for name, _ in tf.train.list_variables(ckpt_path): - # Load the variable and remove singleton dimensions - variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name)) - - # Process the variable name to extract relevant parts - variable_name_parts = name.split("/")[1:] # Skip the 'model/' prefix - - # Identify the target dictionary for the variable - target_dict = params - if variable_name_parts[0].startswith("h"): - layer_number = int(variable_name_parts[0][1:]) - target_dict = params["blocks"][layer_number] - - # Recursively access or create nested dictionaries - for key in variable_name_parts[1:-1]: - target_dict = target_dict.setdefault(key, {}) - - # Assign the variable array to the last key - last_key = variable_name_parts[-1] - target_dict[last_key] = variable_array - - return params - - -def assign(left, right): - if left.shape != right.shape: - raise ValueError(f"Shape mismatch. 
Left: {left.shape}, Right: {right.shape}") - return torch.nn.Parameter(torch.tensor(right)) - - -def load_weights_into_gpt(gpt, params): - gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe']) - gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte']) - - for b in range(len(params["blocks"])): - q_w, k_w, v_w = np.split( - (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1) - gpt.trf_blocks[b].att.W_query.weight = assign( - gpt.trf_blocks[b].att.W_query.weight, q_w.T) - gpt.trf_blocks[b].att.W_key.weight = assign( - gpt.trf_blocks[b].att.W_key.weight, k_w.T) - gpt.trf_blocks[b].att.W_value.weight = assign( - gpt.trf_blocks[b].att.W_value.weight, v_w.T) - - q_b, k_b, v_b = np.split( - (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1) - gpt.trf_blocks[b].att.W_query.bias = assign( - gpt.trf_blocks[b].att.W_query.bias, q_b) - gpt.trf_blocks[b].att.W_key.bias = assign( - gpt.trf_blocks[b].att.W_key.bias, k_b) - gpt.trf_blocks[b].att.W_value.bias = assign( - gpt.trf_blocks[b].att.W_value.bias, v_b) - - gpt.trf_blocks[b].att.out_proj.weight = assign( - gpt.trf_blocks[b].att.out_proj.weight, - params["blocks"][b]["attn"]["c_proj"]["w"].T) - gpt.trf_blocks[b].att.out_proj.bias = assign( - gpt.trf_blocks[b].att.out_proj.bias, - params["blocks"][b]["attn"]["c_proj"]["b"]) - - gpt.trf_blocks[b].ff.layers[0].weight = assign( - gpt.trf_blocks[b].ff.layers[0].weight, - params["blocks"][b]["mlp"]["c_fc"]["w"].T) - gpt.trf_blocks[b].ff.layers[0].bias = assign( - gpt.trf_blocks[b].ff.layers[0].bias, - params["blocks"][b]["mlp"]["c_fc"]["b"]) - gpt.trf_blocks[b].ff.layers[2].weight = assign( - gpt.trf_blocks[b].ff.layers[2].weight, - params["blocks"][b]["mlp"]["c_proj"]["w"].T) - gpt.trf_blocks[b].ff.layers[2].bias = assign( - gpt.trf_blocks[b].ff.layers[2].bias, - params["blocks"][b]["mlp"]["c_proj"]["b"]) - - gpt.trf_blocks[b].norm1.scale = assign( - gpt.trf_blocks[b].norm1.scale, - params["blocks"][b]["ln_1"]["g"]) - gpt.trf_blocks[b].norm1.shift = assign( - gpt.trf_blocks[b].norm1.shift, - params["blocks"][b]["ln_1"]["b"]) - gpt.trf_blocks[b].norm2.scale = assign( - gpt.trf_blocks[b].norm2.scale, - params["blocks"][b]["ln_2"]["g"]) - gpt.trf_blocks[b].norm2.shift = assign( - gpt.trf_blocks[b].norm2.shift, - params["blocks"][b]["ln_2"]["b"]) - - gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"]) - gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"]) - gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"]) - - -def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None): - - # For-loop is the same as before: Get logits, and only focus on last time step - for _ in range(max_new_tokens): - idx_cond = idx[:, -context_size:] - with torch.no_grad(): - logits = model(idx_cond) - logits = logits[:, -1, :] - - # New: Filter logits with top_k sampling - if top_k is not None: - # Keep only top_k values - top_logits, _ = torch.topk(logits, top_k) - min_val = top_logits[:, -1] - logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits) - - # New: Apply temperature scaling - if temperature > 0.0: - logits = logits / temperature - - # Apply softmax to get probabilities - probs = torch.softmax(logits, dim=-1) # (batch_size, context_len) - - # Sample from the distribution - idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1) - - # Otherwise same as before: get idx of the vocab entry with the highest logits value - else: - idx_next = torch.argmax(logits, dim=-1, 
keepdim=True) # (batch_size, 1) - - if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified - break - - # Same as before: append sampled index to the running sequence - idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1) - - return idx diff --git a/pkg/llms_from_scratch/__init__.py b/pkg/llms_from_scratch/__init__.py new file mode 100644 index 0000000..03c0237 --- /dev/null +++ b/pkg/llms_from_scratch/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch \ No newline at end of file diff --git a/pkg/llms_from_scratch/appendix_a.py b/pkg/llms_from_scratch/appendix_a.py new file mode 100644 index 0000000..76b3b65 --- /dev/null +++ b/pkg/llms_from_scratch/appendix_a.py @@ -0,0 +1,44 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import torch +from torch.utils.data import Dataset + + +class NeuralNetwork(torch.nn.Module): + def __init__(self, num_inputs, num_outputs): + super().__init__() + + self.layers = torch.nn.Sequential( + + # 1st hidden layer + torch.nn.Linear(num_inputs, 30), + torch.nn.ReLU(), + + # 2nd hidden layer + torch.nn.Linear(30, 20), + torch.nn.ReLU(), + + # output layer + torch.nn.Linear(20, num_outputs), + ) + + def forward(self, x): + logits = self.layers(x) + return logits + + +class ToyDataset(Dataset): + def __init__(self, X, y): + self.features = X + self.labels = y + + def __getitem__(self, index): + one_x = self.features[index] + one_y = self.labels[index] + return one_x, one_y + + def __len__(self): + return self.labels.shape[0] diff --git a/pkg/llms_from_scratch/appendix_d.py b/pkg/llms_from_scratch/appendix_d.py new file mode 100644 index 0000000..aa3cda4 --- /dev/null +++ b/pkg/llms_from_scratch/appendix_d.py @@ -0,0 +1,94 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from .ch05 import calc_loss_batch, evaluate_model, generate_and_print_sample + +import math +import torch + + +def find_highest_gradient(model): + max_grad = None + for param in model.parameters(): + if param.grad is not None: + grad_values = param.grad.data.flatten() + max_grad_param = grad_values.max() + if max_grad is None or max_grad_param > max_grad: + max_grad = max_grad_param + return max_grad + + +def train_model(model, train_loader, val_loader, optimizer, device, + n_epochs, eval_freq, eval_iter, start_context, tokenizer, + warmup_steps, initial_lr=3e-05, min_lr=1e-6, orig_book_version=False): + + train_losses, val_losses, track_tokens_seen, track_lrs = [], [], [], [] + tokens_seen, global_step = 0, -1 + + # Retrieve the maximum learning rate from the optimizer + peak_lr = optimizer.param_groups[0]["lr"] + + # Calculate the total number of iterations in the training process + total_training_steps = len(train_loader) * n_epochs + + # Calculate the learning rate increment during the warmup phase + lr_increment = (peak_lr - initial_lr) / warmup_steps + + for epoch in range(n_epochs): + model.train() + for input_batch, target_batch in train_loader: + optimizer.zero_grad() + global_step += 1 + + # Adjust the learning rate based on the current phase (warmup or cosine annealing) + if global_step < warmup_steps: + # Linear warmup + lr = initial_lr + global_step * lr_increment + else: + # Cosine annealing after warmup + progress = ((global_step - warmup_steps) / + (total_training_steps - warmup_steps)) + lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress)) + + # Apply the calculated learning rate to the optimizer + for param_group in optimizer.param_groups: + param_group["lr"] = lr + track_lrs.append(lr) # Store the current learning rate + + # Calculate and backpropagate the loss + loss = calc_loss_batch(input_batch, target_batch, model, device) + loss.backward() + + # Apply gradient clipping after the warmup phase to avoid exploding gradients + if orig_book_version: + if global_step > warmup_steps: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) + else: + if global_step >= warmup_steps: # the book originally used global_step > warmup_steps, which lead to a skipped clipping step after warmup + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) + + optimizer.step() + tokens_seen += input_batch.numel() + + # Periodically evaluate the model on the training and validation sets + if global_step % eval_freq == 0: + train_loss, val_loss = evaluate_model( + model, train_loader, val_loader, + device, eval_iter + ) + train_losses.append(train_loss) + val_losses.append(val_loss) + track_tokens_seen.append(tokens_seen) + # Print the current losses + print(f"Ep {epoch+1} (Iter {global_step:06d}): " + f"Train loss {train_loss:.3f}, " + f"Val loss {val_loss:.3f}") + + # Generate and print a sample from the model to monitor progress + generate_and_print_sample( + model, tokenizer, device, start_context + ) + + return train_losses, val_losses, track_tokens_seen, track_lrs diff --git a/pkg/llms_from_scratch/appendix_e.py b/pkg/llms_from_scratch/appendix_e.py new file mode 100644 index 0000000..9d004e3 --- /dev/null +++ b/pkg/llms_from_scratch/appendix_e.py @@ -0,0 +1,42 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import torch +import math + + +class LoRALayer(torch.nn.Module): + def __init__(self, in_dim, out_dim, rank, alpha): + super().__init__() + self.A = torch.nn.Parameter(torch.empty(in_dim, rank)) + torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5)) # similar to standard weight initialization + self.B = torch.nn.Parameter(torch.zeros(rank, out_dim)) + self.alpha = alpha + + def forward(self, x): + x = self.alpha * (x @ self.A @ self.B) + return x + + +class LinearWithLoRA(torch.nn.Module): + def __init__(self, linear, rank, alpha): + super().__init__() + self.linear = linear + self.lora = LoRALayer( + linear.in_features, linear.out_features, rank, alpha + ) + + def forward(self, x): + return self.linear(x) + self.lora(x) + + +def replace_linear_with_lora(model, rank, alpha): + for name, module in model.named_children(): + if isinstance(module, torch.nn.Linear): + # Replace the Linear layer with LinearWithLoRA + setattr(model, name, LinearWithLoRA(module, rank, alpha)) + else: + # Recursively apply the same function to child modules + replace_linear_with_lora(module, rank, alpha) diff --git a/pkg/llms_from_scratch/ch02.py b/pkg/llms_from_scratch/ch02.py new file mode 100644 index 0000000..47e74df --- /dev/null +++ b/pkg/llms_from_scratch/ch02.py @@ -0,0 +1,46 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import torch +from torch.utils.data import Dataset, DataLoader +import tiktoken + + +class GPTDatasetV1(Dataset): + def __init__(self, txt, tokenizer, max_length, stride): + self.tokenizer = tokenizer + self.input_ids = [] + self.target_ids = [] + + # Tokenize the entire text + token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"}) + + # Use a sliding window to chunk the book into overlapping sequences of max_length + for i in range(0, len(token_ids) - max_length, stride): + input_chunk = token_ids[i:i + max_length] + target_chunk = token_ids[i + 1: i + max_length + 1] + self.input_ids.append(torch.tensor(input_chunk)) + self.target_ids.append(torch.tensor(target_chunk)) + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, idx): + return self.input_ids[idx], self.target_ids[idx] + + +def create_dataloader_v1(txt, batch_size=4, max_length=256, + stride=128, shuffle=True, drop_last=True, num_workers=0): + # Initialize the tokenizer + tokenizer = tiktoken.get_encoding("gpt2") + + # Create dataset + dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) + + # Create dataloader + dataloader = DataLoader( + dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers) + + return dataloader diff --git a/pkg/llms_from_scratch/ch03.py b/pkg/llms_from_scratch/ch03.py new file mode 100644 index 0000000..7f64439 --- /dev/null +++ b/pkg/llms_from_scratch/ch03.py @@ -0,0 +1,151 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import torch +import torch.nn as nn + + +class SelfAttention_v1(nn.Module): + + def __init__(self, d_in, d_out): + super().__init__() + self.W_query = nn.Parameter(torch.rand(d_in, d_out)) + self.W_key = nn.Parameter(torch.rand(d_in, d_out)) + self.W_value = nn.Parameter(torch.rand(d_in, d_out)) + + def forward(self, x): + keys = x @ self.W_key + queries = x @ self.W_query + values = x @ self.W_value + + attn_scores = queries @ keys.T # omega + attn_weights = torch.softmax( + attn_scores / keys.shape[-1]**0.5, dim=-1 + ) + + context_vec = attn_weights @ values + return context_vec + + +class SelfAttention_v2(nn.Module): + + def __init__(self, d_in, d_out, qkv_bias=False): + super().__init__() + self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) + + def forward(self, x): + keys = self.W_key(x) + queries = self.W_query(x) + values = self.W_value(x) + + attn_scores = queries @ keys.T + attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) + + context_vec = attn_weights @ values + return context_vec + + +class CausalAttention(nn.Module): + + def __init__(self, d_in, d_out, context_length, + dropout, qkv_bias=False): + super().__init__() + self.d_out = d_out + self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) + self.dropout = nn.Dropout(dropout) # New + self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) # New + + def forward(self, x): + b, num_tokens, d_in = x.shape # New batch dimension b + # For inputs where `num_tokens` exceeds `context_length`, this will result in errors + # in the mask creation further below. + # In practice, this is not a problem since the LLM (chapters 4-7) ensures that inputs + # do not exceed `context_length` before reaching this forward method. 
+ keys = self.W_key(x) + queries = self.W_query(x) + values = self.W_value(x) + + attn_scores = queries @ keys.transpose(1, 2) # Changed transpose + attn_scores.masked_fill_( # New, _ ops are in-place + self.mask.bool()[:num_tokens, :num_tokens], -torch.inf) # `:num_tokens` to account for cases where the number of tokens in the batch is smaller than the supported context_size + attn_weights = torch.softmax( + attn_scores / keys.shape[-1]**0.5, dim=-1 + ) + attn_weights = self.dropout(attn_weights) # New + + context_vec = attn_weights @ values + return context_vec + + +class MultiHeadAttentionWrapper(nn.Module): + def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): + super().__init__() + self.heads = nn.ModuleList( + [CausalAttention(d_in, d_out, context_length, dropout, qkv_bias) + for _ in range(num_heads)] + ) + + def forward(self, x): + return torch.cat([head(x) for head in self.heads], dim=-1) + + +class MultiHeadAttention(nn.Module): + def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): + super().__init__() + assert d_out % num_heads == 0, "d_out must be divisible by n_heads" + + self.d_out = d_out + self.num_heads = num_heads + self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim + + self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) + self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) + self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs + self.dropout = nn.Dropout(dropout) + self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) + + def forward(self, x): + b, num_tokens, d_in = x.shape + + keys = self.W_key(x) # Shape: (b, num_tokens, d_out) + queries = self.W_query(x) + values = self.W_value(x) + + # We implicitly split the matrix by adding a `num_heads` dimension + # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) + keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) + values = values.view(b, num_tokens, self.num_heads, self.head_dim) + queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) + + # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) + keys = keys.transpose(1, 2) + queries = queries.transpose(1, 2) + values = values.transpose(1, 2) + + # Compute scaled dot-product attention (aka self-attention) with a causal mask + attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head + + # Original mask truncated to the number of tokens and converted to boolean + mask_bool = self.mask.bool()[:num_tokens, :num_tokens] + + # Use the mask to fill attention scores + attn_scores.masked_fill_(mask_bool, -torch.inf) + + attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) + attn_weights = self.dropout(attn_weights) + + # Shape: (b, num_tokens, num_heads, head_dim) + context_vec = (attn_weights @ values).transpose(1, 2) + + # Combine heads, where self.d_out = self.num_heads * self.head_dim + context_vec = context_vec.reshape(b, num_tokens, self.d_out) + context_vec = self.out_proj(context_vec) # optional projection + + return context_vec diff --git a/ch04/02_performance-analysis/previous_chapters.py b/pkg/llms_from_scratch/ch04.py similarity index 58% rename from ch04/02_performance-analysis/previous_chapters.py rename to pkg/llms_from_scratch/ch04.py index bf05069..1a353a1 100644 --- a/ch04/02_performance-analysis/previous_chapters.py 
+++ b/pkg/llms_from_scratch/ch04.py @@ -2,76 +2,12 @@ # Source for "Build a Large Language Model From Scratch" # - https://www.manning.com/books/build-a-large-language-model-from-scratch # Code: https://github.com/rasbt/LLMs-from-scratch -# -# This file collects all the relevant code that we covered thus far -# throughout Chapters 2-4. -# This file can be run as a standalone script. + +from .ch03 import MultiHeadAttention import torch import torch.nn as nn -##################################### -# Chapter 3 -##################################### -class MultiHeadAttention(nn.Module): - def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): - super().__init__() - assert d_out % num_heads == 0, "d_out must be divisible by n_heads" - - self.d_out = d_out - self.num_heads = num_heads - self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim - - self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) - self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) - self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs - self.dropout = nn.Dropout(dropout) - self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) - - def forward(self, x): - b, num_tokens, d_in = x.shape - - keys = self.W_key(x) # Shape: (b, num_tokens, d_out) - queries = self.W_query(x) - values = self.W_value(x) - - # We implicitly split the matrix by adding a `num_heads` dimension - # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) - keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) - values = values.view(b, num_tokens, self.num_heads, self.head_dim) - queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) - - # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim) - keys = keys.transpose(1, 2) - queries = queries.transpose(1, 2) - values = values.transpose(1, 2) - - # Compute scaled dot-product attention (aka self-attention) with a causal mask - attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head - - # Original mask truncated to the number of tokens and converted to boolean - mask_bool = self.mask.bool()[:num_tokens, :num_tokens] - - # Use the mask to fill attention scores - attn_scores.masked_fill_(mask_bool, -torch.inf) - - attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) - attn_weights = self.dropout(attn_weights) - - # Shape: (b, num_tokens, num_heads, head_dim) - context_vec = (attn_weights @ values).transpose(1, 2) - - # Combine heads, where self.d_out = self.num_heads * self.head_dim - context_vec = context_vec.reshape(b, num_tokens, self.d_out) - context_vec = self.out_proj(context_vec) # optional projection - - return context_vec - - -##################################### -# Chapter 4 -##################################### class LayerNorm(nn.Module): def __init__(self, emb_dim): super().__init__() @@ -123,21 +59,21 @@ class TransformerBlock(nn.Module): self.ff = FeedForward(cfg) self.norm1 = LayerNorm(cfg["emb_dim"]) self.norm2 = LayerNorm(cfg["emb_dim"]) - self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) + self.drop_resid = nn.Dropout(cfg["drop_rate"]) def forward(self, x): # Shortcut connection for attention block shortcut = x x = self.norm1(x) x = self.att(x) # Shape [batch_size, num_tokens, emb_size] - x = self.drop_shortcut(x) + x = self.drop_resid(x) x = x + shortcut # Add the original input back # Shortcut 
connection for feed-forward block shortcut = x x = self.norm2(x) x = self.ff(x) - x = self.drop_shortcut(x) + x = self.drop_resid(x) x = x + shortcut # Add the original input back return x diff --git a/pkg/llms_from_scratch/ch05.py b/pkg/llms_from_scratch/ch05.py new file mode 100644 index 0000000..39d1a6b --- /dev/null +++ b/pkg/llms_from_scratch/ch05.py @@ -0,0 +1,233 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from .ch04 import generate_text_simple +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.ticker import MaxNLocator +import torch + + +def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None): + + # For-loop is the same as before: Get logits, and only focus on last time step + for _ in range(max_new_tokens): + idx_cond = idx[:, -context_size:] + with torch.no_grad(): + logits = model(idx_cond) + logits = logits[:, -1, :] + + # New: Filter logits with top_k sampling + if top_k is not None: + # Keep only top_k values + top_logits, _ = torch.topk(logits, top_k) + min_val = top_logits[:, -1] + logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits) + + # New: Apply temperature scaling + if temperature > 0.0: + logits = logits / temperature + + # Apply softmax to get probabilities + probs = torch.softmax(logits, dim=-1) # (batch_size, context_len) + + # Sample from the distribution + idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1) + + # Otherwise same as before: get idx of the vocab entry with the highest logits value + else: + idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch_size, 1) + + if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified + break + + # Same as before: append sampled index to the running sequence + idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1) + + return idx + + +def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, + eval_freq, eval_iter, start_context, tokenizer): + # Initialize lists to track losses and tokens seen + train_losses, val_losses, track_tokens_seen = [], [], [] + tokens_seen, global_step = 0, -1 + + # Main training loop + for epoch in range(num_epochs): + model.train() # Set model to training mode + + for input_batch, target_batch in train_loader: + optimizer.zero_grad() # Reset loss gradients from previous batch iteration + loss = calc_loss_batch(input_batch, target_batch, model, device) + loss.backward() # Calculate loss gradients + optimizer.step() # Update model weights using loss gradients + tokens_seen += input_batch.numel() + global_step += 1 + + # Optional evaluation step + if global_step % eval_freq == 0: + train_loss, val_loss = evaluate_model( + model, train_loader, val_loader, device, eval_iter) + train_losses.append(train_loss) + val_losses.append(val_loss) + track_tokens_seen.append(tokens_seen) + print(f"Ep {epoch+1} (Step {global_step:06d}): " + f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}") + + # Print a sample text after each epoch + generate_and_print_sample( + model, tokenizer, device, start_context + ) + + return train_losses, val_losses, track_tokens_seen + + +def evaluate_model(model, train_loader, val_loader, device, eval_iter): + model.eval() + with 
torch.no_grad(): + train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter) + val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter) + model.train() + return train_loss, val_loss + + +def generate_and_print_sample(model, tokenizer, device, start_context): + model.eval() + context_size = model.pos_emb.weight.shape[0] + encoded = text_to_token_ids(start_context, tokenizer).to(device) + with torch.no_grad(): + token_ids = generate_text_simple( + model=model, idx=encoded, + max_new_tokens=50, context_size=context_size + ) + decoded_text = token_ids_to_text(token_ids, tokenizer) + print(decoded_text.replace("\n", " ")) # Compact print format + model.train() + + +def assign(left, right): + if left.shape != right.shape: + raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}") + return torch.nn.Parameter(torch.tensor(right)) + + +def load_weights_into_gpt(gpt, params): + gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe']) + gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte']) + + for b in range(len(params["blocks"])): + q_w, k_w, v_w = np.split( + (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1) + gpt.trf_blocks[b].att.W_query.weight = assign( + gpt.trf_blocks[b].att.W_query.weight, q_w.T) + gpt.trf_blocks[b].att.W_key.weight = assign( + gpt.trf_blocks[b].att.W_key.weight, k_w.T) + gpt.trf_blocks[b].att.W_value.weight = assign( + gpt.trf_blocks[b].att.W_value.weight, v_w.T) + + q_b, k_b, v_b = np.split( + (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1) + gpt.trf_blocks[b].att.W_query.bias = assign( + gpt.trf_blocks[b].att.W_query.bias, q_b) + gpt.trf_blocks[b].att.W_key.bias = assign( + gpt.trf_blocks[b].att.W_key.bias, k_b) + gpt.trf_blocks[b].att.W_value.bias = assign( + gpt.trf_blocks[b].att.W_value.bias, v_b) + + gpt.trf_blocks[b].att.out_proj.weight = assign( + gpt.trf_blocks[b].att.out_proj.weight, + params["blocks"][b]["attn"]["c_proj"]["w"].T) + gpt.trf_blocks[b].att.out_proj.bias = assign( + gpt.trf_blocks[b].att.out_proj.bias, + params["blocks"][b]["attn"]["c_proj"]["b"]) + + gpt.trf_blocks[b].ff.layers[0].weight = assign( + gpt.trf_blocks[b].ff.layers[0].weight, + params["blocks"][b]["mlp"]["c_fc"]["w"].T) + gpt.trf_blocks[b].ff.layers[0].bias = assign( + gpt.trf_blocks[b].ff.layers[0].bias, + params["blocks"][b]["mlp"]["c_fc"]["b"]) + gpt.trf_blocks[b].ff.layers[2].weight = assign( + gpt.trf_blocks[b].ff.layers[2].weight, + params["blocks"][b]["mlp"]["c_proj"]["w"].T) + gpt.trf_blocks[b].ff.layers[2].bias = assign( + gpt.trf_blocks[b].ff.layers[2].bias, + params["blocks"][b]["mlp"]["c_proj"]["b"]) + + gpt.trf_blocks[b].norm1.scale = assign( + gpt.trf_blocks[b].norm1.scale, + params["blocks"][b]["ln_1"]["g"]) + gpt.trf_blocks[b].norm1.shift = assign( + gpt.trf_blocks[b].norm1.shift, + params["blocks"][b]["ln_1"]["b"]) + gpt.trf_blocks[b].norm2.scale = assign( + gpt.trf_blocks[b].norm2.scale, + params["blocks"][b]["ln_2"]["g"]) + gpt.trf_blocks[b].norm2.shift = assign( + gpt.trf_blocks[b].norm2.shift, + params["blocks"][b]["ln_2"]["b"]) + + gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"]) + gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"]) + gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"]) + + +def text_to_token_ids(text, tokenizer): + encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"}) + encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension + return encoded_tensor + + +def 
token_ids_to_text(token_ids, tokenizer): + flat = token_ids.squeeze(0) # remove batch dimension + return tokenizer.decode(flat.tolist()) + + +def calc_loss_batch(input_batch, target_batch, model, device): + input_batch, target_batch = input_batch.to(device), target_batch.to(device) + logits = model(input_batch) + loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten()) + return loss + + +def calc_loss_loader(data_loader, model, device, num_batches=None): + total_loss = 0. + if len(data_loader) == 0: + return float("nan") + elif num_batches is None: + num_batches = len(data_loader) + else: + # Reduce the number of batches to match the total number of batches in the data loader + # if num_batches exceeds the number of batches in the data loader + num_batches = min(num_batches, len(data_loader)) + for i, (input_batch, target_batch) in enumerate(data_loader): + if i < num_batches: + loss = calc_loss_batch(input_batch, target_batch, model, device) + total_loss += loss.item() + else: + break + return total_loss / num_batches + + +def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): + fig, ax1 = plt.subplots(figsize=(5, 3)) + + # Plot training and validation loss against epochs + ax1.plot(epochs_seen, train_losses, label="Training loss") + ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss") + ax1.set_xlabel("Epochs") + ax1.set_ylabel("Loss") + ax1.legend(loc="upper right") + ax1.xaxis.set_major_locator(MaxNLocator(integer=True)) # only show integer labels on x-axis + + # Create a second x-axis for tokens seen + ax2 = ax1.twiny() # Create a second x-axis that shares the same y-axis + ax2.plot(tokens_seen, train_losses, alpha=0) # Invisible plot for aligning ticks + ax2.set_xlabel("Tokens seen") + + fig.tight_layout() # Adjust layout to make room + plt.savefig("loss-plot.pdf") + plt.show() diff --git a/pkg/llms_from_scratch/ch06.py b/pkg/llms_from_scratch/ch06.py new file mode 100644 index 0000000..281017d --- /dev/null +++ b/pkg/llms_from_scratch/ch06.py @@ -0,0 +1,254 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + + +import urllib.request +import zipfile +import os +from pathlib import Path + +import matplotlib.pyplot as plt +from torch.utils.data import Dataset +import torch +import pandas as pd + + +def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path): + if data_file_path.exists(): + print(f"{data_file_path} already exists. 
Skipping download and extraction.") + return + + # Downloading the file + with urllib.request.urlopen(url) as response: + with open(zip_path, "wb") as out_file: + out_file.write(response.read()) + + # Unzipping the file + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(extracted_path) + + # Add .tsv file extension + original_file_path = Path(extracted_path) / "SMSSpamCollection" + os.rename(original_file_path, data_file_path) + print(f"File downloaded and saved as {data_file_path}") + + +def create_balanced_dataset(df): + + # Count the instances of "spam" + num_spam = df[df["Label"] == "spam"].shape[0] + + # Randomly sample "ham" instances to match the number of "spam" instances + ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123) + + # Combine ham "subset" with "spam" + balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]]) + + return balanced_df + + +def random_split(df, train_frac, validation_frac): + # Shuffle the entire DataFrame + df = df.sample(frac=1, random_state=123).reset_index(drop=True) + + # Calculate split indices + train_end = int(len(df) * train_frac) + validation_end = train_end + int(len(df) * validation_frac) + + # Split the DataFrame + train_df = df[:train_end] + validation_df = df[train_end:validation_end] + test_df = df[validation_end:] + + return train_df, validation_df, test_df + + +class SpamDataset(Dataset): + def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256): + self.data = pd.read_csv(csv_file) + + # Pre-tokenize texts + self.encoded_texts = [ + tokenizer.encode(text) for text in self.data["Text"] + ] + + if max_length is None: + self.max_length = self._longest_encoded_length() + else: + self.max_length = max_length + # Truncate sequences if they are longer than max_length + self.encoded_texts = [ + encoded_text[:self.max_length] + for encoded_text in self.encoded_texts + ] + + # Pad sequences to the longest sequence + self.encoded_texts = [ + encoded_text + [pad_token_id] * (self.max_length - len(encoded_text)) + for encoded_text in self.encoded_texts + ] + + def __getitem__(self, index): + encoded = self.encoded_texts[index] + label = self.data.iloc[index]["Label"] + return ( + torch.tensor(encoded, dtype=torch.long), + torch.tensor(label, dtype=torch.long) + ) + + def __len__(self): + return len(self.data) + + def _longest_encoded_length(self): + max_length = 0 + for encoded_text in self.encoded_texts: + encoded_length = len(encoded_text) + if encoded_length > max_length: + max_length = encoded_length + return max_length + # Note: A more pythonic version to implement this method + # is the following, which is also used in the next chapter: + # return max(len(encoded_text) for encoded_text in self.encoded_texts) + + +def calc_accuracy_loader(data_loader, model, device, num_batches=None): + model.eval() + correct_predictions, num_examples = 0, 0 + + if num_batches is None: + num_batches = len(data_loader) + else: + num_batches = min(num_batches, len(data_loader)) + for i, (input_batch, target_batch) in enumerate(data_loader): + if i < num_batches: + input_batch, target_batch = input_batch.to(device), target_batch.to(device) + + with torch.no_grad(): + logits = model(input_batch)[:, -1, :] # Logits of last output token + predicted_labels = torch.argmax(logits, dim=-1) + + num_examples += predicted_labels.shape[0] + correct_predictions += (predicted_labels == target_batch).sum().item() + else: + break + return correct_predictions / num_examples + + +def calc_loss_batch(input_batch, 
target_batch, model, device): + input_batch, target_batch = input_batch.to(device), target_batch.to(device) + logits = model(input_batch)[:, -1, :] # Logits of last output token + loss = torch.nn.functional.cross_entropy(logits, target_batch) + return loss + + +def calc_loss_loader(data_loader, model, device, num_batches=None): + total_loss = 0. + if len(data_loader) == 0: + return float("nan") + elif num_batches is None: + num_batches = len(data_loader) + else: + # Reduce the number of batches to match the total number of batches in the data loader + # if num_batches exceeds the number of batches in the data loader + num_batches = min(num_batches, len(data_loader)) + for i, (input_batch, target_batch) in enumerate(data_loader): + if i < num_batches: + loss = calc_loss_batch(input_batch, target_batch, model, device) + total_loss += loss.item() + else: + break + return total_loss / num_batches + + +def evaluate_model(model, train_loader, val_loader, device, eval_iter): + model.eval() + with torch.no_grad(): + train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter) + val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter) + model.train() + return train_loss, val_loss + + +def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs, + eval_freq, eval_iter): + # Initialize lists to track losses and examples seen + train_losses, val_losses, train_accs, val_accs = [], [], [], [] + examples_seen, global_step = 0, -1 + + # Main training loop + for epoch in range(num_epochs): + model.train() # Set model to training mode + + for input_batch, target_batch in train_loader: + optimizer.zero_grad() # Reset loss gradients from previous batch iteration + loss = calc_loss_batch(input_batch, target_batch, model, device) + loss.backward() # Calculate loss gradients + optimizer.step() # Update model weights using loss gradients + examples_seen += input_batch.shape[0] # New: track examples instead of tokens + global_step += 1 + + # Optional evaluation step + if global_step % eval_freq == 0: + train_loss, val_loss = evaluate_model( + model, train_loader, val_loader, device, eval_iter) + train_losses.append(train_loss) + val_losses.append(val_loss) + print(f"Ep {epoch+1} (Step {global_step:06d}): " + f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}") + + # Calculate accuracy after each epoch + train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter) + val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter) + print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="") + print(f"Validation accuracy: {val_accuracy*100:.2f}%") + train_accs.append(train_accuracy) + val_accs.append(val_accuracy) + + return train_losses, val_losses, train_accs, val_accs, examples_seen + + +def plot_values(epochs_seen, examples_seen, train_values, val_values, label="loss"): + fig, ax1 = plt.subplots(figsize=(5, 3)) + + # Plot training and validation loss against epochs + ax1.plot(epochs_seen, train_values, label=f"Training {label}") + ax1.plot(epochs_seen, val_values, linestyle="-.", label=f"Validation {label}") + ax1.set_xlabel("Epochs") + ax1.set_ylabel(label.capitalize()) + ax1.legend() + + # Create a second x-axis for examples seen + ax2 = ax1.twiny() # Create a second x-axis that shares the same y-axis + ax2.plot(examples_seen, train_values, alpha=0) # Invisible plot for aligning ticks + ax2.set_xlabel("Examples seen") + + fig.tight_layout() # Adjust layout to make room + 
plt.savefig(f"{label}-plot.pdf") + plt.show() + + +def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256): + model.eval() + + # Prepare inputs to the model + input_ids = tokenizer.encode(text) + supported_context_length = model.pos_emb.weight.shape[0] + # Note: In the book, this was originally written as pos_emb.weight.shape[1] by mistake + # It didn't break the code but would have caused unnecessary truncation (to 768 instead of 1024) + + # Truncate sequences if they are too long + input_ids = input_ids[:min(max_length, supported_context_length)] + + # Pad sequences to the longest sequence + input_ids += [pad_token_id] * (max_length - len(input_ids)) + input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0) # add batch dimension + + # Model inference + with torch.no_grad(): + logits = model(input_tensor)[:, -1, :] # Logits of the last output token + predicted_label = torch.argmax(logits, dim=-1).item() + + # Return the classified result + return "spam" if predicted_label == 1 else "not spam" diff --git a/pkg/llms_from_scratch/ch07.py b/pkg/llms_from_scratch/ch07.py new file mode 100644 index 0000000..5a0946a --- /dev/null +++ b/pkg/llms_from_scratch/ch07.py @@ -0,0 +1,247 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +import json +import os +import psutil +import urllib.request + +import torch +from tqdm import tqdm +from torch.utils.data import Dataset + + +def download_and_load_file(file_path, url): + + if not os.path.exists(file_path): + with urllib.request.urlopen(url) as response: + text_data = response.read().decode("utf-8") + with open(file_path, "w", encoding="utf-8") as file: + file.write(text_data) + + # The book originally contained this unnecessary "else" clause: + # else: + # with open(file_path, "r", encoding="utf-8") as file: + # text_data = file.read() + + with open(file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + return data + + +def format_input(entry): + instruction_text = ( + f"Below is an instruction that describes a task. " + f"Write a response that appropriately completes the request."
+ f"\n\n### Instruction:\n{entry['instruction']}" + ) + + input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else "" + + return instruction_text + input_text + + +class InstructionDataset(Dataset): + def __init__(self, data, tokenizer): + self.data = data + + # Pre-tokenize texts + self.encoded_texts = [] + for entry in data: + instruction_plus_input = format_input(entry) + response_text = f"\n\n### Response:\n{entry['output']}" + full_text = instruction_plus_input + response_text + self.encoded_texts.append( + tokenizer.encode(full_text) + ) + + def __getitem__(self, index): + return self.encoded_texts[index] + + def __len__(self): + return len(self.data) + + +def custom_collate_draft_1( + batch, + pad_token_id=50256, + device="cpu" +): + # Find the longest sequence in the batch + # and increase the max length by +1, which will add one extra + # padding token below + batch_max_length = max(len(item)+1 for item in batch) + + # Pad and prepare inputs + inputs_lst = [] + + for item in batch: + new_item = item.copy() + # Add an <|endoftext|> token + new_item += [pad_token_id] + # Pad sequences to batch_max_length + padded = ( + new_item + [pad_token_id] * + (batch_max_length - len(new_item)) + ) + # Via padded[:-1], we remove the extra padded token + # that has been added via the +1 setting in batch_max_length + # (the extra padding token will be relevant in later codes) + inputs = torch.tensor(padded[:-1]) + inputs_lst.append(inputs) + + # Convert list of inputs to tensor and transfer to target device + inputs_tensor = torch.stack(inputs_lst).to(device) + return inputs_tensor + + +def custom_collate_draft_2( + batch, + pad_token_id=50256, + device="cpu" +): + # Find the longest sequence in the batch + batch_max_length = max(len(item)+1 for item in batch) + + # Pad and prepare inputs + inputs_lst, targets_lst = [], [] + + for item in batch: + new_item = item.copy() + # Add an <|endoftext|> token + new_item += [pad_token_id] + # Pad sequences to max_length + padded = ( + new_item + [pad_token_id] * + (batch_max_length - len(new_item)) + ) + inputs = torch.tensor(padded[:-1]) # Truncate the last token for inputs + targets = torch.tensor(padded[1:]) # Shift +1 to the right for targets + inputs_lst.append(inputs) + targets_lst.append(targets) + + # Convert list of inputs to tensor and transfer to target device + inputs_tensor = torch.stack(inputs_lst).to(device) + targets_tensor = torch.stack(targets_lst).to(device) + return inputs_tensor, targets_tensor + + +def custom_collate_fn( + batch, + pad_token_id=50256, + ignore_index=-100, + allowed_max_length=None, + device="cpu" +): + # Find the longest sequence in the batch + batch_max_length = max(len(item)+1 for item in batch) + + # Pad and prepare inputs and targets + inputs_lst, targets_lst = [], [] + + for item in batch: + new_item = item.copy() + # Add an <|endoftext|> token + new_item += [pad_token_id] + # Pad sequences to max_length + padded = ( + new_item + [pad_token_id] * + (batch_max_length - len(new_item)) + ) + inputs = torch.tensor(padded[:-1]) # Truncate the last token for inputs + targets = torch.tensor(padded[1:]) # Shift +1 to the right for targets + + # New: Replace all but the first padding tokens in targets by ignore_index + mask = targets == pad_token_id + indices = torch.nonzero(mask).squeeze() + if indices.numel() > 1: + targets[indices[1:]] = ignore_index + + # New: Optionally truncate to maximum sequence length + if allowed_max_length is not None: + inputs = inputs[:allowed_max_length] + targets = 
targets[:allowed_max_length] + + inputs_lst.append(inputs) + targets_lst.append(targets) + + # Convert list of inputs and targets to tensors and transfer to target device + inputs_tensor = torch.stack(inputs_lst).to(device) + targets_tensor = torch.stack(targets_lst).to(device) + + return inputs_tensor, targets_tensor + + +def check_if_running(process_name): + running = False + for proc in psutil.process_iter(["name"]): + if process_name in proc.info["name"]: + running = True + break + return running + + +def query_model( + prompt, + model="llama3", + url="http://localhost:11434/api/chat" +): + # Create the data payload as a dictionary + data = { + "model": model, + "messages": [ + {"role": "user", "content": prompt} + ], + "options": { # Settings below are required for deterministic responses + "seed": 123, + "temperature": 0, + "num_ctx": 2048 + } + } + + # Convert the dictionary to a JSON formatted string and encode it to bytes + payload = json.dumps(data).encode("utf-8") + + # Create a request object, setting the method to POST and adding necessary headers + request = urllib.request.Request( + url, + data=payload, + method="POST" + ) + request.add_header("Content-Type", "application/json") + + # Send the request and capture the response + response_data = "" + with urllib.request.urlopen(request) as response: + # Read and decode the response + while True: + line = response.readline().decode("utf-8") + if not line: + break + response_json = json.loads(line) + response_data += response_json["message"]["content"] + + return response_data + + +def generate_model_scores(json_data, json_key, model="llama3"): + scores = [] + for entry in tqdm(json_data, desc="Scoring entries"): + prompt = ( + f"Given the input `{format_input(entry)}` " + f"and correct output `{entry['output']}`, " + f"score the model response `{entry[json_key]}`" + f" on a scale from 0 to 100, where 100 is the best score. " + f"Respond with the integer number only." + ) + score = query_model(prompt, model) + try: + scores.append(int(score)) + except ValueError: + print(f"Could not convert score: {score}") + continue + + return scores diff --git a/pkg/llms_from_scratch/tests/test_appendix_a.py b/pkg/llms_from_scratch/tests/test_appendix_a.py new file mode 100644 index 0000000..9840a69 --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_appendix_a.py @@ -0,0 +1,70 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from llms_from_scratch.appendix_a import NeuralNetwork, ToyDataset + +import torch +import torch.nn.functional as F +from torch.utils.data import DataLoader + + +def test_dataset(): + + X_train = torch.tensor([ + [-1.2, 3.1], + [-0.9, 2.9], + [-0.5, 2.6], + [2.3, -1.1], + [2.7, -1.5] + ]) + + y_train = torch.tensor([0, 0, 0, 1, 1]) + train_ds = ToyDataset(X_train, y_train) + + assert len(train_ds) == 5 + torch.manual_seed(123) + + train_loader = DataLoader( + dataset=train_ds, + batch_size=2, + shuffle=True, + num_workers=0 + ) + + torch.manual_seed(123) + model = NeuralNetwork(num_inputs=2, num_outputs=2) + optimizer = torch.optim.SGD(model.parameters(), lr=0.5) + + num_epochs = 3 + + for epoch in range(num_epochs): + + model.train() + for batch_idx, (features, labels) in enumerate(train_loader): + + logits = model(features) + + loss = F.cross_entropy(logits, labels) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}" + f" | Batch {batch_idx:03d}/{len(train_loader):03d}" + f" | Train/Val Loss: {loss:.2f}") + + model.eval() + with torch.no_grad(): + outputs = model(X_train) + + expected = torch.tensor([ + [2.8569, -4.1618], + [2.5382, -3.7548], + [2.0944, -3.1820], + [-1.4814, 1.4816], + [-1.7176, 1.7342] + ]) + assert torch.allclose(outputs, expected, atol=1e-4) # compare with a tolerance, since the expected values are rounded to 4 decimals diff --git a/pkg/llms_from_scratch/tests/test_appendix_d.py b/pkg/llms_from_scratch/tests/test_appendix_d.py new file mode 100644 index 0000000..949b69a --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_appendix_d.py @@ -0,0 +1,118 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from llms_from_scratch.ch02 import create_dataloader_v1 +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.appendix_d import train_model + +import os +import urllib + +import tiktoken +import torch +from torch.utils.data import Subset, DataLoader + + +def test_train(tmp_path): + + GPT_CONFIG_124M = { + "vocab_size": 50257, # Vocabulary size + "context_length": 256, # Shortened context length (orig: 1024) + "emb_dim": 768, # Embedding dimension + "n_heads": 12, # Number of attention heads + "n_layers": 12, # Number of layers + "drop_rate": 0.1, # Dropout rate + "qkv_bias": False # Query-key-value bias + } + + OTHER_SETTINGS = { + "learning_rate": 5e-4, + "num_epochs": 2, + "batch_size": 1, + "weight_decay": 0.1 + } + + torch.manual_seed(123) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + ############################## + # Download data if necessary + ############################## + + file_path = tmp_path / "the-verdict.txt" + url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" + + if not os.path.exists(file_path): + with urllib.request.urlopen(url) as response: + text_data = response.read().decode("utf-8") + with open(file_path, "w", encoding="utf-8") as file: + file.write(text_data) + else: + with open(file_path, "r", encoding="utf-8") as file: + text_data = file.read() + + ############################## + # Initialize model + ############################## + + model = GPTModel(GPT_CONFIG_124M) + model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes + + ############################## + # Set up dataloaders + ############################## + + # Train/validation ratio + train_ratio = 0.90 + split_idx = int(train_ratio * len(text_data)) + + train_loader = create_dataloader_v1( + text_data[:split_idx], + batch_size=OTHER_SETTINGS["batch_size"], + max_length=GPT_CONFIG_124M["context_length"], + stride=GPT_CONFIG_124M["context_length"], + drop_last=True, + shuffle=True, + num_workers=0 + ) + + val_loader = create_dataloader_v1( + text_data[split_idx:], + batch_size=OTHER_SETTINGS["batch_size"], + max_length=GPT_CONFIG_124M["context_length"], + stride=GPT_CONFIG_124M["context_length"], + drop_last=False, + shuffle=False, + num_workers=0 + ) + + ############################## + # Train model + ############################## + + tokenizer = tiktoken.get_encoding("gpt2") + + train_subset = Subset(train_loader.dataset, range(1)) + one_batch_train_loader = DataLoader(train_subset, batch_size=1) + val_subset = Subset(val_loader.dataset, range(1)) + one_batch_val_loader = DataLoader(val_subset, batch_size=1) + + peak_lr = 0.001 # this was originally set to 5e-4 in the book by mistake + optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.1) # the book accidentally omitted the lr assignment + tokenizer = tiktoken.get_encoding("gpt2") + + n_epochs = 6 + warmup_steps = 1 + + train_losses, val_losses, tokens_seen, lrs = train_model( + model, one_batch_train_loader, one_batch_val_loader, optimizer, device, n_epochs=n_epochs, + eval_freq=5, eval_iter=1, start_context="Every effort moves you", + tokenizer=tokenizer, warmup_steps=warmup_steps, + initial_lr=1e-5, min_lr=1e-5 + ) + + assert round(train_losses[0], 1) == 10.9 + assert round(val_losses[0], 1) == 
11.0 + assert train_losses[-1] < train_losses[0] diff --git a/pkg/llms_from_scratch/tests/test_appendix_e.py b/pkg/llms_from_scratch/tests/test_appendix_e.py new file mode 100644 index 0000000..72b1cf5 --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_appendix_e.py @@ -0,0 +1,150 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + + +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch06 import ( + download_and_unzip_spam_data, create_balanced_dataset, + random_split, SpamDataset, train_classifier_simple +) +from llms_from_scratch.appendix_e import replace_linear_with_lora + +from pathlib import Path +import urllib + +import pandas as pd +import tiktoken +import torch +from torch.utils.data import DataLoader, Subset + + +def test_train_classifier_lora(tmp_path): + + ######################################## + # Download and prepare dataset + ######################################## + + url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip" + zip_path = tmp_path / "sms_spam_collection.zip" + extracted_path = tmp_path / "sms_spam_collection" + data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv" + + try: + download_and_unzip_spam_data( + url, zip_path, extracted_path, data_file_path + ) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: + print(f"Primary URL failed: {e}. Trying backup URL...") + backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" + download_and_unzip_spam_data( + backup_url, zip_path, extracted_path, data_file_path + ) + + df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"]) + balanced_df = create_balanced_dataset(df) + balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1}) + + train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1) + train_df.to_csv(tmp_path / "train.csv", index=None) + validation_df.to_csv(tmp_path / "validation.csv", index=None) + test_df.to_csv(tmp_path / "test.csv", index=None) + + ######################################## + # Create data loaders + ######################################## + tokenizer = tiktoken.get_encoding("gpt2") + + train_dataset = SpamDataset( + csv_file=tmp_path / "train.csv", + max_length=None, + tokenizer=tokenizer + ) + + val_dataset = SpamDataset( + csv_file=tmp_path / "validation.csv", + max_length=train_dataset.max_length, + tokenizer=tokenizer + ) + + num_workers = 0 + batch_size = 8 + + torch.manual_seed(123) + + train_loader = DataLoader( + dataset=train_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + drop_last=True, + ) + + val_loader = DataLoader( + dataset=val_dataset, + batch_size=batch_size, + num_workers=num_workers, + drop_last=False, + ) + + ######################################## + # Load pretrained model + ######################################## + + # Small GPT model for testing purposes + BASE_CONFIG = { + "vocab_size": 50257, + "context_length": 120, + "drop_rate": 0.0, + "qkv_bias": False, + "emb_dim": 12, + "n_layers": 1, + "n_heads": 2 + } + model = GPTModel(BASE_CONFIG) + model.eval() + device = "cpu" + + ######################################## + # Modify and pretrained model + ######################################## + + for param in model.parameters(): + param.requires_grad = 
False + + torch.manual_seed(123) + + num_classes = 2 + model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes) + replace_linear_with_lora(model, rank=16, alpha=16) + model.to(device) + + for param in model.trf_blocks[-1].parameters(): + param.requires_grad = True + + for param in model.final_norm.parameters(): + param.requires_grad = True + + ######################################## + # Finetune modified model + ######################################## + + torch.manual_seed(123) + + optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) + + train_subset = Subset(train_loader.dataset, range(5)) + batch_train_loader = DataLoader(train_subset, batch_size=5) + val_subset = Subset(val_loader.dataset, range(5)) + batch_val_loader = DataLoader(val_subset, batch_size=5) + + num_epochs = 6 + train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( + model, batch_train_loader, batch_val_loader, optimizer, device, + num_epochs=num_epochs, eval_freq=1, eval_iter=1, + ) + + assert round(train_losses[0], 1) == 0.8 + assert round(val_losses[0], 1) == 0.8 + assert train_losses[-1] < train_losses[0] diff --git a/pkg/llms_from_scratch/tests/test_ch02.py b/pkg/llms_from_scratch/tests/test_ch02.py new file mode 100644 index 0000000..11d8a52 --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_ch02.py @@ -0,0 +1,54 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from llms_from_scratch.ch02 import create_dataloader_v1 + +import os +import urllib.request + +import pytest +import torch + + +@pytest.mark.parametrize("file_name", ["the-verdict.txt"]) +def test_dataloader(tmp_path, file_name): + + if not os.path.exists("the-verdict.txt"): + url = ("https://raw.githubusercontent.com/rasbt/" + "LLMs-from-scratch/main/ch02/01_main-chapter-code/" + "the-verdict.txt") + file_path = "the-verdict.txt" + urllib.request.urlretrieve(url, file_path) + + with open("the-verdict.txt", "r", encoding="utf-8") as f: + raw_text = f.read() + + vocab_size = 50257 + output_dim = 256 + context_length = 1024 + + token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim) + pos_embedding_layer = torch.nn.Embedding(context_length, output_dim) + + batch_size = 8 + max_length = 4 + dataloader = create_dataloader_v1( + raw_text, + batch_size=batch_size, + max_length=max_length, + stride=max_length + ) + + for batch in dataloader: + x, y = batch + + token_embeddings = token_embedding_layer(x) + pos_embeddings = pos_embedding_layer(torch.arange(max_length)) + + input_embeddings = token_embeddings + pos_embeddings + + break + + input_embeddings.shape == torch.Size([8, 4, 256]) diff --git a/pkg/llms_from_scratch/tests/test_ch03.py b/pkg/llms_from_scratch/tests/test_ch03.py new file mode 100644 index 0000000..91d2606 --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_ch03.py @@ -0,0 +1,22 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + + +from llms_from_scratch.ch03 import MultiHeadAttention +import torch + + +def test_mha(): + + context_length = 100 + d_in = 256 + d_out = 16 + + mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2) + + batch = torch.rand(8, 6, d_in) + context_vecs = mha(batch) + + assert context_vecs.shape == torch.Size([8, 6, d_out]) diff --git a/pkg/llms_from_scratch/tests/test_ch04.py b/pkg/llms_from_scratch/tests/test_ch04.py new file mode 100644 index 0000000..c84ad15 --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_ch04.py @@ -0,0 +1,50 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch04 import generate_text_simple + +import torch +import tiktoken + + +def test_GPTModel(): + GPT_CONFIG_124M = { + "vocab_size": 50257, # Vocabulary size + "context_length": 1024, # Context length + "emb_dim": 768, # Embedding dimension + "n_heads": 12, # Number of attention heads + "n_layers": 12, # Number of layers + "drop_rate": 0.1, # Dropout rate + "qkv_bias": False # Query-Key-Value bias + } + + torch.manual_seed(123) + model = GPTModel(GPT_CONFIG_124M) + model.eval() # disable dropout + + start_context = "Hello, I am" + + tokenizer = tiktoken.get_encoding("gpt2") + encoded = tokenizer.encode(start_context) + encoded_tensor = torch.tensor(encoded).unsqueeze(0) + + print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}") + print("\nInput text:", start_context) + print("Encoded input text:", encoded) + print("encoded_tensor.shape:", encoded_tensor.shape) + + out = generate_text_simple( + model=model, + idx=encoded_tensor, + max_new_tokens=10, + context_size=GPT_CONFIG_124M["context_length"] + ) + + expect = torch.tensor([ + [15496, 11, 314, 716, 27018, 24086, 47843, 30961, 42348, 7267, + 49706, 43231, 47062, 34657] + ]) + assert torch.equal(expect, out) diff --git a/pkg/llms_from_scratch/tests/test_ch05.py b/pkg/llms_from_scratch/tests/test_ch05.py new file mode 100644 index 0000000..617440f --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_ch05.py @@ -0,0 +1,115 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). 
+# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from llms_from_scratch.ch02 import create_dataloader_v1 +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch05 import train_model_simple + +import os +import urllib + +import pytest +import tiktoken +import torch +from torch.utils.data import Subset, DataLoader + + +@pytest.mark.parametrize("file_name", ["the-verdict.txt"]) +def test_train_simple(tmp_path, file_name): + + GPT_CONFIG_124M = { + "vocab_size": 50257, # Vocabulary size + "context_length": 256, # Shortened context length (orig: 1024) + "emb_dim": 768, # Embedding dimension + "n_heads": 12, # Number of attention heads + "n_layers": 12, # Number of layers + "drop_rate": 0.1, # Dropout rate + "qkv_bias": False # Query-key-value bias + } + + OTHER_SETTINGS = { + "learning_rate": 5e-4, + "num_epochs": 2, + "batch_size": 1, + "weight_decay": 0.1 + } + + torch.manual_seed(123) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + ############################## + # Download data if necessary + ############################## + + file_path = tmp_path / "the-verdict.txt" + url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt" + + if not os.path.exists(file_path): + with urllib.request.urlopen(url) as response: + text_data = response.read().decode('utf-8') + with open(file_path, "w", encoding="utf-8") as file: + file.write(text_data) + else: + with open(file_path, "r", encoding="utf-8") as file: + text_data = file.read() + + ############################## + # Initialize model + ############################## + + model = GPTModel(GPT_CONFIG_124M) + model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes + optimizer = torch.optim.AdamW( + model.parameters(), lr=OTHER_SETTINGS["learning_rate"], weight_decay=OTHER_SETTINGS["weight_decay"] + ) + + ############################## + # Set up dataloaders + ############################## + + # Train/validation ratio + train_ratio = 0.90 + split_idx = int(train_ratio * len(text_data)) + + train_loader = create_dataloader_v1( + text_data[:split_idx], + batch_size=OTHER_SETTINGS["batch_size"], + max_length=GPT_CONFIG_124M["context_length"], + stride=GPT_CONFIG_124M["context_length"], + drop_last=True, + shuffle=True, + num_workers=0 + ) + + val_loader = create_dataloader_v1( + text_data[split_idx:], + batch_size=OTHER_SETTINGS["batch_size"], + max_length=GPT_CONFIG_124M["context_length"], + stride=GPT_CONFIG_124M["context_length"], + drop_last=False, + shuffle=False, + num_workers=0 + ) + + ############################## + # Train model + ############################## + + tokenizer = tiktoken.get_encoding("gpt2") + + train_subset = Subset(train_loader.dataset, range(1)) + one_batch_train_loader = DataLoader(train_subset, batch_size=1) + val_subset = Subset(val_loader.dataset, range(1)) + one_batch_val_loader = DataLoader(val_subset, batch_size=1) + + train_losses, val_losses, tokens_seen = train_model_simple( + model, one_batch_train_loader, one_batch_val_loader, optimizer, device, + num_epochs=OTHER_SETTINGS["num_epochs"], eval_freq=1, eval_iter=1, + start_context="Every effort moves you", tokenizer=tokenizer + ) + + assert round(train_losses[0], 1) == 7.6 + assert round(val_losses[0], 1) == 10.1 + assert train_losses[-1] < train_losses[0] diff --git 
a/pkg/llms_from_scratch/tests/test_ch06.py b/pkg/llms_from_scratch/tests/test_ch06.py new file mode 100644 index 0000000..f2e3249 --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_ch06.py @@ -0,0 +1,148 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + + +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch06 import ( + download_and_unzip_spam_data, create_balanced_dataset, + random_split, SpamDataset, train_classifier_simple +) + +from pathlib import Path +import urllib + +import pandas as pd +import tiktoken +import torch +from torch.utils.data import DataLoader, Subset + + +def test_train_classifier(tmp_path): + + ######################################## + # Download and prepare dataset + ######################################## + + url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip" + zip_path = tmp_path / "sms_spam_collection.zip" + extracted_path = tmp_path / "sms_spam_collection" + data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv" + + try: + download_and_unzip_spam_data( + url, zip_path, extracted_path, data_file_path + ) + except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e: + print(f"Primary URL failed: {e}. Trying backup URL...") + backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" + download_and_unzip_spam_data( + backup_url, zip_path, extracted_path, data_file_path + ) + + df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"]) + balanced_df = create_balanced_dataset(df) + balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1}) + + train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1) + train_df.to_csv(tmp_path / "train.csv", index=None) + validation_df.to_csv(tmp_path / "validation.csv", index=None) + test_df.to_csv(tmp_path / "test.csv", index=None) + + ######################################## + # Create data loaders + ######################################## + tokenizer = tiktoken.get_encoding("gpt2") + + train_dataset = SpamDataset( + csv_file=tmp_path / "train.csv", + max_length=None, + tokenizer=tokenizer + ) + + val_dataset = SpamDataset( + csv_file=tmp_path / "validation.csv", + max_length=train_dataset.max_length, + tokenizer=tokenizer + ) + + num_workers = 0 + batch_size = 8 + + torch.manual_seed(123) + + train_loader = DataLoader( + dataset=train_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=num_workers, + drop_last=True, + ) + + val_loader = DataLoader( + dataset=val_dataset, + batch_size=batch_size, + num_workers=num_workers, + drop_last=False, + ) + + ######################################## + # Load pretrained model + ######################################## + + # Small GPT model for testing purposes + BASE_CONFIG = { + "vocab_size": 50257, + "context_length": 120, + "drop_rate": 0.0, + "qkv_bias": False, + "emb_dim": 12, + "n_layers": 1, + "n_heads": 2 + } + model = GPTModel(BASE_CONFIG) + model.eval() + device = "cpu" + + ######################################## + # Modify and pretrained model + ######################################## + + for param in model.parameters(): + param.requires_grad = False + + torch.manual_seed(123) + + num_classes = 2 + model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes) + 
model.to(device) + + for param in model.trf_blocks[-1].parameters(): + param.requires_grad = True + + for param in model.final_norm.parameters(): + param.requires_grad = True + + ######################################## + # Finetune modified model + ######################################## + + torch.manual_seed(123) + + optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0) + + train_subset = Subset(train_loader.dataset, range(5)) + batch_train_loader = DataLoader(train_subset, batch_size=5) + val_subset = Subset(val_loader.dataset, range(5)) + batch_val_loader = DataLoader(val_subset, batch_size=5) + + num_epochs = 5 + train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple( + model, batch_train_loader, batch_val_loader, optimizer, device, + num_epochs=num_epochs, eval_freq=1, eval_iter=1, + ) + + assert round(train_losses[0], 1) == 0.8 + assert round(val_losses[0], 1) == 0.8 + assert train_losses[-1] < train_losses[0] diff --git a/pkg/llms_from_scratch/tests/test_ch07.py b/pkg/llms_from_scratch/tests/test_ch07.py new file mode 100644 index 0000000..5ec9618 --- /dev/null +++ b/pkg/llms_from_scratch/tests/test_ch07.py @@ -0,0 +1,108 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +from llms_from_scratch.ch04 import GPTModel +from llms_from_scratch.ch05 import train_model_simple +from llms_from_scratch.ch07 import ( + download_and_load_file, InstructionDataset, format_input, custom_collate_fn +) + +from functools import partial + +import torch +from torch.utils.data import DataLoader +import tiktoken + + +def test_instruction_finetune(tmp_path): + + ####################################### + # Download and prepare dataset + ####################################### + file_path = tmp_path / "instruction-data.json" + url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json" + data = download_and_load_file(file_path, url) + + train_portion = int(len(data) * 0.85) # 85% for training + test_portion = int(len(data) * 0.1) # 10% for testing + + train_data = data[:train_portion] + test_data = data[train_portion:train_portion + test_portion] + val_data = data[train_portion + test_portion:] + + # Use very small subset for testing purposes + train_data = train_data[:15] + val_data = val_data[:15] + test_data = test_data[:15] + + tokenizer = tiktoken.get_encoding("gpt2") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=100) + + num_workers = 0 + batch_size = 8 + + torch.manual_seed(123) + + train_dataset = InstructionDataset(train_data, tokenizer) + train_loader = DataLoader( + train_dataset, + batch_size=batch_size, + collate_fn=customized_collate_fn, + shuffle=True, + drop_last=True, + num_workers=num_workers + ) + + val_dataset = InstructionDataset(val_data, tokenizer) + val_loader = DataLoader( + val_dataset, + batch_size=batch_size, + collate_fn=customized_collate_fn, + shuffle=False, + drop_last=False, + num_workers=num_workers + ) + + ####################################### + # Load pretrained model + ####################################### + + # Small GPT model for testing purposes + BASE_CONFIG = { + "vocab_size": 50257, + "context_length": 120, + 
"drop_rate": 0.0, + "qkv_bias": False, + "emb_dim": 12, + "n_layers": 1, + "n_heads": 2 + } + model = GPTModel(BASE_CONFIG) + model.eval() + device = "cpu" + CHOOSE_MODEL = "Small test model" + + print("Loaded model:", CHOOSE_MODEL) + print(50*"-") + + ####################################### + # Finetuning the model + ####################################### + + num_epochs = 10 + optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1) + + torch.manual_seed(123) + train_losses, val_losses, tokens_seen = train_model_simple( + model, train_loader, val_loader, optimizer, device, + num_epochs=num_epochs, eval_freq=5, eval_iter=5, + start_context=format_input(val_data[0]), tokenizer=tokenizer + ) + + assert round(train_losses[0], 1) == 10.9 + assert round(val_losses[0], 1) == 10.9 + assert train_losses[-1] < train_losses[0] diff --git a/pyproject.toml b/pyproject.toml index 2c0f3dd..e543439 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,10 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + [project] name = "llms-from-scratch" -version = "0.1.0" +version = "1.0.0" description = "Implement a ChatGPT-like LLM in PyTorch from scratch, step by step" readme = "README.md" requires-python = ">=3.10" @@ -14,17 +18,17 @@ dependencies = [ "numpy>=1.26,<2.1", "pandas>=2.2.1", "pip>=25.0.1", + "pytest>=8.3.5", ] -[tool.setuptools.packages] -find = {} - [tool.uv.sources] llms-from-scratch = { workspace = true } [dependency-groups] dev = [ + "build>=1.2.2.post1", "llms-from-scratch", + "twine>=6.1.0", ] [tool.ruff] @@ -37,3 +41,12 @@ ignore = [ "C406", "E226", "E402", "E702", "E703", "E722", "E731", "E741" ] + + +# `llms_from_scratch` PyPI package + +[tool.setuptools] +package-dir = {"" = "pkg"} + +[tool.setuptools.packages.find] +where = ["pkg"]