Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2026-04-10 12:33:42 +00:00)
Commit 7114ccd10d (parent 85f2bc0a58), committed by GitHub
pkg/llms_from_scratch/tests/test_appendix_a.py (new file, 70 lines)
@@ -0,0 +1,70 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.appendix_a import NeuralNetwork, ToyDataset

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader


def test_dataset():

    X_train = torch.tensor([
        [-1.2, 3.1],
        [-0.9, 2.9],
        [-0.5, 2.6],
        [2.3, -1.1],
        [2.7, -1.5]
    ])

    y_train = torch.tensor([0, 0, 0, 1, 1])
    train_ds = ToyDataset(X_train, y_train)

    assert len(train_ds) == 5
    torch.manual_seed(123)

    train_loader = DataLoader(
        dataset=train_ds,
        batch_size=2,
        shuffle=True,
        num_workers=0
    )

    torch.manual_seed(123)
    model = NeuralNetwork(num_inputs=2, num_outputs=2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

    num_epochs = 3

    for epoch in range(num_epochs):

        model.train()
        for batch_idx, (features, labels) in enumerate(train_loader):

            logits = model(features)

            loss = F.cross_entropy(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
                  f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
                  f" | Train/Val Loss: {loss:.2f}")

    model.eval()
    with torch.no_grad():
        outputs = model(X_train)

    expected = torch.tensor([
        [2.8569, -4.1618],
        [2.5382, -3.7548],
        [2.0944, -3.1820],
        [-1.4814, 1.4816],
        [-1.7176, 1.7342]
    ])
    # the reference values are rounded to four decimals, so compare with a tolerance
    assert torch.allclose(outputs, expected, atol=1e-4)
pkg/llms_from_scratch/tests/test_appendix_d.py (new file, 118 lines)
@@ -0,0 +1,118 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch02 import create_dataloader_v1
from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.appendix_d import train_model

import os
import urllib.request

import tiktoken
import torch
from torch.utils.data import Subset, DataLoader


def test_train(tmp_path):

    GPT_CONFIG_124M = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 256,   # Shortened context length (orig: 1024)
        "emb_dim": 768,          # Embedding dimension
        "n_heads": 12,           # Number of attention heads
        "n_layers": 12,          # Number of layers
        "drop_rate": 0.1,        # Dropout rate
        "qkv_bias": False        # Query-key-value bias
    }

    OTHER_SETTINGS = {
        "learning_rate": 5e-4,
        "num_epochs": 2,
        "batch_size": 1,
        "weight_decay": 0.1
    }

    torch.manual_seed(123)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ##############################
    # Download data if necessary
    ##############################

    file_path = tmp_path / "the-verdict.txt"
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    ##############################
    # Initialize model
    ##############################

    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)  # no assignment model = model.to(device) necessary for nn.Module classes

    ##############################
    # Set up dataloaders
    ##############################

    # Train/validation ratio
    train_ratio = 0.90
    split_idx = int(train_ratio * len(text_data))

    train_loader = create_dataloader_v1(
        text_data[:split_idx],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=True,
        shuffle=True,
        num_workers=0
    )

    val_loader = create_dataloader_v1(
        text_data[split_idx:],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=False,
        shuffle=False,
        num_workers=0
    )

    ##############################
    # Train model
    ##############################

    tokenizer = tiktoken.get_encoding("gpt2")

    train_subset = Subset(train_loader.dataset, range(1))
    one_batch_train_loader = DataLoader(train_subset, batch_size=1)
    val_subset = Subset(val_loader.dataset, range(1))
    one_batch_val_loader = DataLoader(val_subset, batch_size=1)

    peak_lr = 0.001  # this was originally set to 5e-4 in the book by mistake
    optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.1)  # the book accidentally omitted the lr assignment

    n_epochs = 6
    warmup_steps = 1

    train_losses, val_losses, tokens_seen, lrs = train_model(
        model, one_batch_train_loader, one_batch_val_loader, optimizer, device, n_epochs=n_epochs,
        eval_freq=5, eval_iter=1, start_context="Every effort moves you",
        tokenizer=tokenizer, warmup_steps=warmup_steps,
        initial_lr=1e-5, min_lr=1e-5
    )

    assert round(train_losses[0], 1) == 10.9
    assert round(val_losses[0], 1) == 11.0
    assert train_losses[-1] < train_losses[0]
pkg/llms_from_scratch/tests/test_appendix_e.py (new file, 150 lines)
@@ -0,0 +1,150 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch06 import (
    download_and_unzip_spam_data, create_balanced_dataset,
    random_split, SpamDataset, train_classifier_simple
)
from llms_from_scratch.appendix_e import replace_linear_with_lora

from pathlib import Path
import urllib.error

import pandas as pd
import tiktoken
import torch
from torch.utils.data import DataLoader, Subset


def test_train_classifier_lora(tmp_path):

    ########################################
    # Download and prepare dataset
    ########################################

    url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
    zip_path = tmp_path / "sms_spam_collection.zip"
    extracted_path = tmp_path / "sms_spam_collection"
    data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

    try:
        download_and_unzip_spam_data(
            url, zip_path, extracted_path, data_file_path
        )
    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
        print(f"Primary URL failed: {e}. Trying backup URL...")
        backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
        download_and_unzip_spam_data(
            backup_url, zip_path, extracted_path, data_file_path
        )

    df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
    balanced_df = create_balanced_dataset(df)
    balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
    train_df.to_csv(tmp_path / "train.csv", index=None)
    validation_df.to_csv(tmp_path / "validation.csv", index=None)
    test_df.to_csv(tmp_path / "test.csv", index=None)

    ########################################
    # Create data loaders
    ########################################
    tokenizer = tiktoken.get_encoding("gpt2")

    train_dataset = SpamDataset(
        csv_file=tmp_path / "train.csv",
        max_length=None,
        tokenizer=tokenizer
    )

    val_dataset = SpamDataset(
        csv_file=tmp_path / "validation.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    ########################################
    # Load pretrained model
    ########################################

    # Small GPT model for testing purposes
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }
    model = GPTModel(BASE_CONFIG)
    model.eval()
    device = "cpu"

    ########################################
    # Modify pretrained model
    ########################################

    for param in model.parameters():
        param.requires_grad = False

    torch.manual_seed(123)

    num_classes = 2
    model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)
    replace_linear_with_lora(model, rank=16, alpha=16)
    model.to(device)

    for param in model.trf_blocks[-1].parameters():
        param.requires_grad = True

    for param in model.final_norm.parameters():
        param.requires_grad = True

    ########################################
    # Finetune modified model
    ########################################

    torch.manual_seed(123)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

    train_subset = Subset(train_loader.dataset, range(5))
    batch_train_loader = DataLoader(train_subset, batch_size=5)
    val_subset = Subset(val_loader.dataset, range(5))
    batch_val_loader = DataLoader(val_subset, batch_size=5)

    num_epochs = 6
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, batch_train_loader, batch_val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=1, eval_iter=1,
    )

    assert round(train_losses[0], 1) == 0.8
    assert round(val_losses[0], 1) == 0.8
    assert train_losses[-1] < train_losses[0]
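For context on the replace_linear_with_lora(model, rank=16, alpha=16) call above: the usual LoRA-injection pattern wraps every nn.Linear with a frozen base layer plus a trainable low-rank update. The following is a minimal sketch of that pattern, using hypothetical LoRALayer and LinearWithLoRA helpers rather than the package's actual classes:

import math
import torch


class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
        torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        # low-rank update: alpha * (x A B)
        return self.alpha * (x @ self.A @ self.B)


class LinearWithLoRA(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear  # original layer (typically frozen)
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x):
        return self.linear(x) + self.lora(x)


def replace_linear_with_lora(model, rank, alpha):
    # recursively swap every nn.Linear for a LoRA-augmented wrapper
    for name, module in model.named_children():
        if isinstance(module, torch.nn.Linear):
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            replace_linear_with_lora(module, rank, alpha)

Because the test freezes model.parameters() before the replacement, the freshly created A/B matrices remain trainable while the original Linear weights stay frozen.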
pkg/llms_from_scratch/tests/test_ch02.py (new file, 54 lines)
@@ -0,0 +1,54 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch02 import create_dataloader_v1

import os
import urllib.request

import pytest
import torch


@pytest.mark.parametrize("file_name", ["the-verdict.txt"])
def test_dataloader(tmp_path, file_name):

    if not os.path.exists(file_name):
        url = ("https://raw.githubusercontent.com/rasbt/"
               "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
               "the-verdict.txt")
        urllib.request.urlretrieve(url, file_name)

    with open(file_name, "r", encoding="utf-8") as f:
        raw_text = f.read()

    vocab_size = 50257
    output_dim = 256
    context_length = 1024

    token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
    pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

    batch_size = 8
    max_length = 4
    dataloader = create_dataloader_v1(
        raw_text,
        batch_size=batch_size,
        max_length=max_length,
        stride=max_length
    )

    for batch in dataloader:
        x, y = batch

        token_embeddings = token_embedding_layer(x)
        pos_embeddings = pos_embedding_layer(torch.arange(max_length))

        input_embeddings = token_embeddings + pos_embeddings

        break

    assert input_embeddings.shape == torch.Size([8, 4, 256])
pkg/llms_from_scratch/tests/test_ch03.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch03 import MultiHeadAttention

import torch


def test_mha():

    context_length = 100
    d_in = 256
    d_out = 16

    mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)

    batch = torch.rand(8, 6, d_in)
    context_vecs = mha(batch)

    assert context_vecs.shape == torch.Size([8, 6, d_out])
pkg/llms_from_scratch/tests/test_ch04.py (new file, 50 lines)
@@ -0,0 +1,50 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch04 import generate_text_simple

import torch
import tiktoken


def test_GPTModel():
    GPT_CONFIG_124M = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "emb_dim": 768,          # Embedding dimension
        "n_heads": 12,           # Number of attention heads
        "n_layers": 12,          # Number of layers
        "drop_rate": 0.1,        # Dropout rate
        "qkv_bias": False        # Query-Key-Value bias
    }

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.eval()  # disable dropout

    start_context = "Hello, I am"

    tokenizer = tiktoken.get_encoding("gpt2")
    encoded = tokenizer.encode(start_context)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)

    print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
    print("\nInput text:", start_context)
    print("Encoded input text:", encoded)
    print("encoded_tensor.shape:", encoded_tensor.shape)

    out = generate_text_simple(
        model=model,
        idx=encoded_tensor,
        max_new_tokens=10,
        context_size=GPT_CONFIG_124M["context_length"]
    )

    expect = torch.tensor([
        [15496, 11, 314, 716, 27018, 24086, 47843, 30961, 42348, 7267,
         49706, 43231, 47062, 34657]
    ])
    assert torch.equal(expect, out)
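The deterministic token ids above come from greedy decoding with a fixed seed. As a rough orientation, generate_text_simple follows the standard greedy loop; the sketch below (a hypothetical generate_text_simple_sketch, not necessarily the package's exact implementation) assumes a model that maps (batch, n_tokens) token ids to (batch, n_tokens, vocab_size) logits:

import torch


def generate_text_simple_sketch(model, idx, max_new_tokens, context_size):
    # idx: (batch, n_tokens) tensor of token ids
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]          # crop to the supported context window
        with torch.no_grad():
            logits = model(idx_cond)               # (batch, n_tokens, vocab_size)
        logits = logits[:, -1, :]                  # keep only the last position
        idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # greedy pick
        idx = torch.cat((idx, idx_next), dim=1)    # append and continue
    return idx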
pkg/llms_from_scratch/tests/test_ch05.py (new file, 115 lines)
@@ -0,0 +1,115 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch02 import create_dataloader_v1
from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import train_model_simple

import os
import urllib.request

import pytest
import tiktoken
import torch
from torch.utils.data import Subset, DataLoader


@pytest.mark.parametrize("file_name", ["the-verdict.txt"])
def test_train_simple(tmp_path, file_name):

    GPT_CONFIG_124M = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 256,   # Shortened context length (orig: 1024)
        "emb_dim": 768,          # Embedding dimension
        "n_heads": 12,           # Number of attention heads
        "n_layers": 12,          # Number of layers
        "drop_rate": 0.1,        # Dropout rate
        "qkv_bias": False        # Query-key-value bias
    }

    OTHER_SETTINGS = {
        "learning_rate": 5e-4,
        "num_epochs": 2,
        "batch_size": 1,
        "weight_decay": 0.1
    }

    torch.manual_seed(123)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ##############################
    # Download data if necessary
    ##############################

    file_path = tmp_path / file_name
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    ##############################
    # Initialize model
    ##############################

    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)  # no assignment model = model.to(device) necessary for nn.Module classes
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=OTHER_SETTINGS["learning_rate"], weight_decay=OTHER_SETTINGS["weight_decay"]
    )

    ##############################
    # Set up dataloaders
    ##############################

    # Train/validation ratio
    train_ratio = 0.90
    split_idx = int(train_ratio * len(text_data))

    train_loader = create_dataloader_v1(
        text_data[:split_idx],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=True,
        shuffle=True,
        num_workers=0
    )

    val_loader = create_dataloader_v1(
        text_data[split_idx:],
        batch_size=OTHER_SETTINGS["batch_size"],
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=False,
        shuffle=False,
        num_workers=0
    )

    ##############################
    # Train model
    ##############################

    tokenizer = tiktoken.get_encoding("gpt2")

    train_subset = Subset(train_loader.dataset, range(1))
    one_batch_train_loader = DataLoader(train_subset, batch_size=1)
    val_subset = Subset(val_loader.dataset, range(1))
    one_batch_val_loader = DataLoader(val_subset, batch_size=1)

    train_losses, val_losses, tokens_seen = train_model_simple(
        model, one_batch_train_loader, one_batch_val_loader, optimizer, device,
        num_epochs=OTHER_SETTINGS["num_epochs"], eval_freq=1, eval_iter=1,
        start_context="Every effort moves you", tokenizer=tokenizer
    )

    assert round(train_losses[0], 1) == 7.6
    assert round(val_losses[0], 1) == 10.1
    assert train_losses[-1] < train_losses[0]
pkg/llms_from_scratch/tests/test_ch06.py (new file, 148 lines)
@@ -0,0 +1,148 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch06 import (
    download_and_unzip_spam_data, create_balanced_dataset,
    random_split, SpamDataset, train_classifier_simple
)

from pathlib import Path
import urllib.error

import pandas as pd
import tiktoken
import torch
from torch.utils.data import DataLoader, Subset


def test_train_classifier(tmp_path):

    ########################################
    # Download and prepare dataset
    ########################################

    url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
    zip_path = tmp_path / "sms_spam_collection.zip"
    extracted_path = tmp_path / "sms_spam_collection"
    data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

    try:
        download_and_unzip_spam_data(
            url, zip_path, extracted_path, data_file_path
        )
    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
        print(f"Primary URL failed: {e}. Trying backup URL...")
        backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
        download_and_unzip_spam_data(
            backup_url, zip_path, extracted_path, data_file_path
        )

    df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
    balanced_df = create_balanced_dataset(df)
    balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
    train_df.to_csv(tmp_path / "train.csv", index=None)
    validation_df.to_csv(tmp_path / "validation.csv", index=None)
    test_df.to_csv(tmp_path / "test.csv", index=None)

    ########################################
    # Create data loaders
    ########################################
    tokenizer = tiktoken.get_encoding("gpt2")

    train_dataset = SpamDataset(
        csv_file=tmp_path / "train.csv",
        max_length=None,
        tokenizer=tokenizer
    )

    val_dataset = SpamDataset(
        csv_file=tmp_path / "validation.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    ########################################
    # Load pretrained model
    ########################################

    # Small GPT model for testing purposes
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }
    model = GPTModel(BASE_CONFIG)
    model.eval()
    device = "cpu"

    ########################################
    # Modify pretrained model
    ########################################

    for param in model.parameters():
        param.requires_grad = False

    torch.manual_seed(123)

    num_classes = 2
    model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)
    model.to(device)

    for param in model.trf_blocks[-1].parameters():
        param.requires_grad = True

    for param in model.final_norm.parameters():
        param.requires_grad = True

    ########################################
    # Finetune modified model
    ########################################

    torch.manual_seed(123)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

    train_subset = Subset(train_loader.dataset, range(5))
    batch_train_loader = DataLoader(train_subset, batch_size=5)
    val_subset = Subset(val_loader.dataset, range(5))
    batch_val_loader = DataLoader(val_subset, batch_size=5)

    num_epochs = 5
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, batch_train_loader, batch_val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=1, eval_iter=1,
    )

    assert round(train_losses[0], 1) == 0.8
    assert round(val_losses[0], 1) == 0.8
    assert train_losses[-1] < train_losses[0]
pkg/llms_from_scratch/tests/test_ch07.py (new file, 108 lines)
@@ -0,0 +1,108 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch05 import train_model_simple
from llms_from_scratch.ch07 import (
    download_and_load_file, InstructionDataset, format_input, custom_collate_fn
)

from functools import partial

import torch
from torch.utils.data import DataLoader
import tiktoken


def test_instruction_finetune(tmp_path):

    #######################################
    # Download and prepare dataset
    #######################################
    file_path = tmp_path / "instruction-data.json"
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
    data = download_and_load_file(file_path, url)

    train_portion = int(len(data) * 0.85)  # 85% for training
    test_portion = int(len(data) * 0.1)    # 10% for testing

    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]

    # Use very small subset for testing purposes
    train_data = train_data[:15]
    val_data = val_data[:15]
    test_data = test_data[:15]

    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=100)

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

    train_dataset = InstructionDataset(train_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers
    )

    val_dataset = InstructionDataset(val_data, tokenizer)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )

    #######################################
    # Load pretrained model
    #######################################

    # Small GPT model for testing purposes
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }
    model = GPTModel(BASE_CONFIG)
    model.eval()
    device = "cpu"
    CHOOSE_MODEL = "Small test model"

    print("Loaded model:", CHOOSE_MODEL)
    print(50*"-")

    #######################################
    # Finetuning the model
    #######################################

    num_epochs = 10
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

    torch.manual_seed(123)
    train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context=format_input(val_data[0]), tokenizer=tokenizer
    )

    assert round(train_losses[0], 1) == 10.9
    assert round(val_losses[0], 1) == 10.9
    assert train_losses[-1] < train_losses[0]
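As a usage sketch for the new test modules (assuming pytest is installed and the llms_from_scratch package is importable, for example after an editable install from the repository root), the suite could be invoked programmatically:

import pytest

# run the new tests quietly; the path is relative to the repository root
raise SystemExit(pytest.main(["-q", "pkg/llms_from_scratch/tests"]))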