Add GPTModelFast (#584)

* Add GPTModelFast

* update
Sebastian Raschka
2025-03-27 14:00:25 -05:00
committed by GitHub
parent c9271ac427
commit e07a7abdd5
7 changed files with 204 additions and 61 deletions
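The tests below import the new `GPTModelFast` from `llms_from_scratch.ch04` and run it through the same checks as `GPTModel`; the class definition itself is not expanded in the visible hunks. As a rough, hypothetical sketch of what a "fast" drop-in variant typically looks like (assuming it replaces the from-scratch attention with PyTorch's fused `torch.nn.functional.scaled_dot_product_attention`; this is an illustration, not the commit's actual implementation):

```python
# Hypothetical sketch only -- the actual GPTModelFast code is not shown in this diff.
# Assumes PyTorch >= 2.0 for scaled_dot_product_attention.
import torch
import torch.nn as nn
import torch.nn.functional as F


class FastMultiHeadAttention(nn.Module):
    """Causal multi-head attention built on PyTorch's fused SDPA kernel."""

    def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.qkv = nn.Linear(d_in, 3 * d_out, bias=qkv_bias)
        self.proj = nn.Linear(d_out, d_out)
        self.dropout = dropout

    def forward(self, x):
        b, num_tokens, _ = x.shape
        # One matmul for queries, keys, and values, then split into heads
        qkv = self.qkv(x).view(b, num_tokens, 3, self.num_heads, self.head_dim)
        queries, keys, values = qkv.permute(2, 0, 3, 1, 4)  # each: (b, heads, tokens, head_dim)
        # Fused causal attention; no explicit mask buffer is needed
        context = F.scaled_dot_product_attention(
            queries, keys, values,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=True,
        )
        context = context.transpose(1, 2).reshape(b, num_tokens, self.num_heads * self.head_dim)
        return self.proj(context)
```

A model built on an attention block like this can keep the same constructor config (`vocab_size`, `emb_dim`, `n_heads`, ...) and forward signature as `GPTModel`, which is what lets the tests below parametrize over both classes.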

View File

@@ -4,7 +4,7 @@
 # Code: https://github.com/rasbt/LLMs-from-scratch
-from llms_from_scratch.ch03 import MultiHeadAttention
+from llms_from_scratch.ch03 import MultiHeadAttention, PyTorchMultiHeadAttention
 import torch
@@ -14,7 +14,15 @@ def test_mha():
     d_in = 256
     d_out = 16
-    mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
+    mha = MultiHeadAttention(d_in, d_out, context_length, dropout=0.0, num_heads=2)
     batch = torch.rand(8, 6, d_in)
     context_vecs = mha(batch)
     context_vecs.shape == torch.Size([8, 6, d_out])
+
+    # Test bonus class
+    mha = PyTorchMultiHeadAttention(d_in, d_out, num_heads=2)
+    batch = torch.rand(8, 6, d_in)
+    context_vecs = mha(batch)
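Taken out of the diff, the updated test boils down to a shape check for both attention variants. A self-contained version might look like the following (the concrete `context_length` value is assumed here, since that line is collapsed in the view above):

```python
import torch
from llms_from_scratch.ch03 import MultiHeadAttention, PyTorchMultiHeadAttention

d_in, d_out = 256, 16
context_length = 100  # assumed; the actual value sits in a collapsed line of the test
batch = torch.rand(8, 6, d_in)

mha = MultiHeadAttention(d_in, d_out, context_length, dropout=0.0, num_heads=2)
fast_mha = PyTorchMultiHeadAttention(d_in, d_out, num_heads=2)

# Both variants should map (batch, num_tokens, d_in) -> (batch, num_tokens, d_out)
assert mha(batch).shape == torch.Size([8, 6, d_out])
assert fast_mha(batch).shape == torch.Size([8, 6, d_out])
```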

View File

@@ -3,26 +3,29 @@
 # - https://www.manning.com/books/build-a-large-language-model-from-scratch
 # Code: https://github.com/rasbt/LLMs-from-scratch

-from llms_from_scratch.ch04 import GPTModel
+from llms_from_scratch.ch04 import GPTModel, GPTModelFast
 from llms_from_scratch.ch04 import generate_text_simple

+import pytest
 import torch
 import tiktoken


-def test_GPTModel():
-    GPT_CONFIG_124M = {
-        "vocab_size": 50257,     # Vocabulary size
-        "context_length": 1024,  # Context length
-        "emb_dim": 768,          # Embedding dimension
-        "n_heads": 12,           # Number of attention heads
-        "n_layers": 12,          # Number of layers
-        "drop_rate": 0.1,        # Dropout rate
-        "qkv_bias": False        # Query-Key-Value bias
-    }
+GPT_CONFIG_124M = {
+    "vocab_size": 50257,     # Vocabulary size
+    "context_length": 1024,  # Context length
+    "emb_dim": 768,          # Embedding dimension
+    "n_heads": 12,           # Number of attention heads
+    "n_layers": 12,          # Number of layers
+    "drop_rate": 0.1,        # Dropout rate
+    "qkv_bias": False        # Query-Key-Value bias
+}
+
+
+@pytest.mark.parametrize("ModelClass", [GPTModel, GPTModelFast])
+def test_gpt_model_variants(ModelClass):
     torch.manual_seed(123)
-    model = GPTModel(GPT_CONFIG_124M)
+    model = ModelClass(GPT_CONFIG_124M)
     model.eval()  # disable dropout

     start_context = "Hello, I am"
@@ -47,4 +50,4 @@ def test_GPTModel():
         [15496, 11, 314, 716, 27018, 24086, 47843, 30961, 42348, 7267,
          49706, 43231, 47062, 34657]
     ])

-    torch.equal(expect, out)
+    assert torch.equal(expect, out), "Generated output does not match expected output"
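The assertion above compares greedy-decoded token IDs against a fixed tensor, so `GPTModel` and `GPTModelFast` are expected to produce identical generations from the same seed. A standalone version of that generation flow, assuming the book's `generate_text_simple(model, idx, max_new_tokens, context_size)` signature (the prompt encodes to 4 tokens, so 10 new tokens yields the 14 IDs checked above):

```python
import tiktoken
import torch
from llms_from_scratch.ch04 import GPTModelFast, generate_text_simple

GPT_CONFIG_124M = {
    "vocab_size": 50257, "context_length": 1024, "emb_dim": 768,
    "n_heads": 12, "n_layers": 12, "drop_rate": 0.1, "qkv_bias": False,
}

torch.manual_seed(123)
model = GPTModelFast(GPT_CONFIG_124M)
model.eval()  # disable dropout so generation is deterministic

tokenizer = tiktoken.get_encoding("gpt2")
idx = torch.tensor(tokenizer.encode("Hello, I am")).unsqueeze(0)  # add batch dimension

out = generate_text_simple(
    model=model, idx=idx,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
print(out)                                        # token IDs, shape (1, 14)
print(tokenizer.decode(out.squeeze(0).tolist()))  # gibberish, since the weights are untrained
```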

View File

@@ -4,7 +4,7 @@
 # Code: https://github.com/rasbt/LLMs-from-scratch

 from llms_from_scratch.ch02 import create_dataloader_v1
-from llms_from_scratch.ch04 import GPTModel
+from llms_from_scratch.ch04 import GPTModel, GPTModelFast
 from llms_from_scratch.ch05 import train_model_simple

 import os
@@ -16,60 +16,47 @@ import torch
 from torch.utils.data import Subset, DataLoader


-@pytest.mark.parametrize("file_name", ["the-verdict.txt"])
-def test_train_simple(tmp_path, file_name):
-    GPT_CONFIG_124M = {
-        "vocab_size": 50257,
-        "context_length": 256,  # Shortened for test speed
-        "emb_dim": 768,
-        "n_heads": 12,
-        "n_layers": 12,
-        "drop_rate": 0.1,
-        "qkv_bias": False
-    }
+GPT_CONFIG_124M = {
+    "vocab_size": 50257,     # Vocabulary size
+    "context_length": 256,   # Shortened context length (orig: 1024)
+    "emb_dim": 768,          # Embedding dimension
+    "n_heads": 12,           # Number of attention heads
+    "n_layers": 12,          # Number of layers
+    "drop_rate": 0.1,        # Dropout rate
+    "qkv_bias": False        # Query-key-value bias
+}

-    OTHER_SETTINGS = {
-        "learning_rate": 5e-4,
-        "num_epochs": 2,
-        "batch_size": 1,
-        "weight_decay": 0.1
-    }
+OTHER_SETTINGS = {
+    "learning_rate": 5e-4,
+    "num_epochs": 2,
+    "batch_size": 1,
+    "weight_decay": 0.1
+}
+
+
+@pytest.mark.parametrize("ModelClass", [GPTModel, GPTModelFast])
+def test_train_simple(tmp_path, ModelClass):
     torch.manual_seed(123)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

     ##############################
     # Download data if necessary
     ##############################
     file_path = tmp_path / "the-verdict.txt"
     url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

     if not os.path.exists(file_path):
         with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode('utf-8')
-        with open(file_path, "w", encoding="utf-8") as file:
-            file.write(text_data)
+            text_data = response.read().decode("utf-8")
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.write(text_data)
     else:
-        with open(file_path, "r", encoding="utf-8") as file:
-            text_data = file.read()
-
-    ##############################
-    # Initialize model
-    ##############################
-    model = GPTModel(GPT_CONFIG_124M)
-    model.to(device)  # no assignment model = model.to(device) necessary for nn.Module classes
-    optimizer = torch.optim.AdamW(
-        model.parameters(), lr=OTHER_SETTINGS["learning_rate"], weight_decay=OTHER_SETTINGS["weight_decay"]
-    )
+        with open(file_path, "r", encoding="utf-8") as f:
+            text_data = f.read()

     ##############################
     # Set up dataloaders
     ##############################
     # Train/validation ratio
     train_ratio = 0.90
     split_idx = int(train_ratio * len(text_data))
@@ -93,17 +80,26 @@ def test_train_simple(tmp_path, file_name):
         num_workers=0
     )

-    ##############################
-    # Train model
-    ##############################
-    tokenizer = tiktoken.get_encoding("gpt2")
-
     # Limit to 1 batch for speed
     train_subset = Subset(train_loader.dataset, range(1))
     one_batch_train_loader = DataLoader(train_subset, batch_size=1)
     val_subset = Subset(val_loader.dataset, range(1))
     one_batch_val_loader = DataLoader(val_subset, batch_size=1)

+    ##############################
+    # Train model
+    ##############################
+    model = ModelClass(GPT_CONFIG_124M)
+    model.to(device)
+    optimizer = torch.optim.AdamW(
+        model.parameters(),
+        lr=OTHER_SETTINGS["learning_rate"],
+        weight_decay=OTHER_SETTINGS["weight_decay"]
+    )
+
+    tokenizer = tiktoken.get_encoding("gpt2")
+
     train_losses, val_losses, tokens_seen = train_model_simple(
         model, one_batch_train_loader, one_batch_val_loader, optimizer, device,
         num_epochs=OTHER_SETTINGS["num_epochs"], eval_freq=1, eval_iter=1,