mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Organized setup instructions (#115)
* Organized setup instructions * update tests * link checker action * raise error upon broken link * fix links * fix links * delete duplicated paragraph
This commit is contained in:
committed by
GitHub
parent
0b866c133f
commit
790d0808b2
182
appendix-A/01_main-chapter-code/DDP-script.py
Normal file
182
appendix-A/01_main-chapter-code/DDP-script.py
Normal file
@@ -0,0 +1,182 @@
|
||||
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
|
||||
# Source for "Build a Large Language Model From Scratch"
|
||||
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
|
||||
# Code: https://github.com/rasbt/LLMs-from-scratch
|
||||
|
||||
# Appendix A: Introduction to PyTorch (Part 3)
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# NEW imports:
|
||||
import os
|
||||
import torch.multiprocessing as mp
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.distributed import init_process_group, destroy_process_group
|
||||
|
||||
|
||||
# NEW: function to initialize a distributed process group (1 process / GPU)
|
||||
# this allows communication among processes
|
||||
def ddp_setup(rank, world_size):
    """Initialize the distributed process group for one worker process.

    Sets the rendezvous environment variables, joins the process group
    with the NCCL backend, and selects ``rank`` as the current CUDA device.

    Arguments:
        rank: a unique process ID
        world_size: total number of processes in the group
    """
    # Rendezvous endpoint shared by all processes:
    # - MASTER_ADDR: address of the machine running the rank:0 process
    #   (here, we assume all GPUs are on the same machine)
    # - MASTER_PORT: any free port on that machine
    os.environ.update({"MASTER_ADDR": "localhost", "MASTER_PORT": "12345"})

    # nccl: NVIDIA Collective Communication Library
    # Windows users may have to use "gloo" instead of "nccl" as backend
    init_process_group(backend="nccl", rank=rank, world_size=world_size)

    torch.cuda.set_device(rank)
|
||||
|
||||
|
||||
class ToyDataset(Dataset):
    """Map-style dataset pairing a feature tensor with a label tensor."""

    def __init__(self, X, y):
        # X holds one example per row; y holds the matching labels
        self.features, self.labels = X, y

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of examples, taken from the label tensor."""
        return self.labels.shape[0]
|
||||
|
||||
|
||||
class NeuralNetwork(torch.nn.Module):
    """Multilayer perceptron with two ReLU hidden layers (30 and 20 units)."""

    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        hidden_1, hidden_2 = 30, 20
        stack = [
            # 1st hidden layer
            torch.nn.Linear(num_inputs, hidden_1),
            torch.nn.ReLU(),
            # 2nd hidden layer
            torch.nn.Linear(hidden_1, hidden_2),
            torch.nn.ReLU(),
            # output layer
            torch.nn.Linear(hidden_2, num_outputs),
        ]
        self.layers = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Return the output-layer logits for the input batch ``x``."""
        return self.layers(x)
|
||||
|
||||
|
||||
def prepare_dataset():
    """Build the toy training and test data loaders for one DDP worker.

    The training loader draws its indices from a ``DistributedSampler``
    that is created without an explicit rank or world size.

    Returns:
        A ``(train_loader, test_loader)`` tuple.
    """
    train_ds = ToyDataset(
        torch.tensor([
            [-1.2, 3.1],
            [-0.9, 2.9],
            [-0.5, 2.6],
            [2.3, -1.1],
            [2.7, -1.5]
        ]),
        torch.tensor([0, 0, 0, 1, 1]),
    )
    test_ds = ToyDataset(
        torch.tensor([
            [-0.8, 2.8],
            [2.6, -1.6],
        ]),
        torch.tensor([0, 1]),
    )

    # NEW: the sampler hands each process its own share of the training set
    train_sampler = DistributedSampler(train_ds)
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=False,  # NEW: False because ordering is left to the sampler
        sampler=train_sampler,
        pin_memory=True,
        drop_last=True,
    )

    # The test loader uses no distributed sampler.
    test_loader = DataLoader(test_ds, batch_size=2, shuffle=False)

    return train_loader, test_loader
|
||||
|
||||
|
||||
# NEW: wrapper
|
||||
def main(rank, world_size, num_epochs):
    """Train and evaluate the toy classifier inside one DDP worker process.

    Arguments:
        rank: a unique process ID; also used as this process's device index
        world_size: total number of processes in the group
        num_epochs: number of passes over this process's training batches
    """
    ddp_setup(rank, world_size)  # NEW: initialize process groups

    train_loader, test_loader = prepare_dataset()
    model = NeuralNetwork(num_inputs=2, num_outputs=2)
    model.to(rank)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

    model = DDP(model, device_ids=[rank])  # NEW: wrap model with DDP
    # the core model is now accessible as model.module

    for epoch in range(num_epochs):
        # NEW: tell the DistributedSampler which epoch this is so that it
        # reshuffles; without this call every epoch reuses the same order
        train_loader.sampler.set_epoch(epoch)

        model.train()
        for features, labels in train_loader:

            features, labels = features.to(rank), labels.to(rank)  # New: use rank
            logits = model(features)
            loss = F.cross_entropy(logits, labels)  # Loss function

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # LOGGING
            print(f"[GPU{rank}] Epoch: {epoch+1:03d}/{num_epochs:03d}"
                  f" | Batchsize {labels.shape[0]:03d}"
                  f" | Train/Val Loss: {loss:.2f}")

    model.eval()
    train_acc = compute_accuracy(model, train_loader, device=rank)
    print(f"[GPU{rank}] Training accuracy", train_acc)
    test_acc = compute_accuracy(model, test_loader, device=rank)
    print(f"[GPU{rank}] Test accuracy", test_acc)

    destroy_process_group()  # NEW: cleanly exit distributed mode
|
||||
|
||||
|
||||
def compute_accuracy(model, dataloader, device):
    """Return the fraction of examples that ``model`` classifies correctly.

    The model is switched to evaluation mode and left in that mode.

    Arguments:
        model: module mapping a feature batch to logits; the predicted
            class is the argmax over dimension 1
        dataloader: iterable yielding ``(features, labels)`` tensor batches
        device: device the batches are moved to before the forward pass

    Returns:
        The accuracy as a Python float.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no examples, e.g. when
            every batch was dropped before reaching this function.
    """
    model = model.eval()
    correct = 0.0
    total_examples = 0

    for features, labels in dataloader:
        features, labels = features.to(device), labels.to(device)

        # gradients are not needed for evaluation
        with torch.no_grad():
            logits = model(features)
        predictions = torch.argmax(logits, dim=1)
        compare = labels == predictions
        correct += torch.sum(compare)
        total_examples += len(compare)

    if total_examples == 0:
        # Same exception type as the bare division would raise, but with a
        # message that points at the actual cause.
        raise ZeroDivisionError(
            "compute_accuracy received no examples: "
            "the dataloader did not yield any batches"
        )
    return (correct / total_examples).item()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    print("PyTorch version:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
    print("Number of GPUs available:", torch.cuda.device_count())

    # NOTE(review): this seeds the launcher process; presumably the spawned
    # workers do not inherit this RNG state -- confirm if reproducibility matters
    torch.manual_seed(123)

    num_epochs = 3
    world_size = torch.cuda.device_count()

    if world_size < 1:
        # With no GPUs, nprocs would be 0 and no worker would be started;
        # say so explicitly instead of exiting without running any training.
        print("No CUDA devices found: this script needs at least one GPU, "
              "so no training processes were started.")
    else:
        # NEW: spawn new processes
        # note that spawn will automatically pass the rank
        mp.spawn(main, args=(world_size, num_epochs), nprocs=world_size)
        # nprocs=world_size spawns one process per GPU
|
||||
1335
appendix-A/01_main-chapter-code/code-part1.ipynb
Normal file
1335
appendix-A/01_main-chapter-code/code-part1.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
484
appendix-A/01_main-chapter-code/code-part2.ipynb
Normal file
484
appendix-A/01_main-chapter-code/code-part2.ipynb
Normal file
@@ -0,0 +1,484 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font size=\"1\">\n",
|
||||
"Supplementary code for \"Build a Large Language Model From Scratch\": <a href=\"https://www.manning.com/books/build-a-large-language-model-from-scratch\">https://www.manning.com/books/build-a-large-language-model-from-scratch</a> by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
|
||||
"Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
|
||||
"</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "O9i6kzBsZVaZ"
|
||||
},
|
||||
"source": [
|
||||
"# Appendix A: Introduction to PyTorch (Part 2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "ppbG5d-NZezH"
|
||||
},
|
||||
"source": [
|
||||
"## A.9 Optimizing training performance with GPUs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "6jH0J_DPZhbn"
|
||||
},
|
||||
"source": [
|
||||
"### A.9.1 PyTorch computations on GPU devices"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "RM7kGhwMF_nO",
|
||||
"outputId": "ac60b048-b81f-4bb0-90fa-1ca474f04e9a"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.0.1+cu118\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"print(torch.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "OXLCKXhiUkZt",
|
||||
"outputId": "39fe5366-287e-47eb-cc34-3508d616c4f9"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(torch.cuda.is_available())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "MTTlfh53Va-T",
|
||||
"outputId": "f31d8bbe-577f-4db4-9939-02e66b9f96d1"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([5., 7., 9.])"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tensor_1 = torch.tensor([1., 2., 3.])\n",
|
||||
"tensor_2 = torch.tensor([4., 5., 6.])\n",
|
||||
"\n",
|
||||
"print(tensor_1 + tensor_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "Z4LwTNw7Vmmb",
|
||||
"outputId": "1c025c6a-e3ed-4c7c-f5fd-86c14607036e"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([5., 7., 9.], device='cuda:0')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tensor_1 = tensor_1.to(\"cuda\")\n",
|
||||
"tensor_2 = tensor_2.to(\"cuda\")\n",
|
||||
"\n",
|
||||
"print(tensor_1 + tensor_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 184
|
||||
},
|
||||
"id": "tKT6URN1Vuft",
|
||||
"outputId": "e6f01e7f-d9cf-44cb-cc6d-46fc7907d5c0"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "RuntimeError",
|
||||
"evalue": "ignored",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-7-4ff3c4d20fc3>\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtensor_1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtensor_1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"cpu\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor_1\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtensor_2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mRuntimeError\u001b[0m: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tensor_1 = tensor_1.to(\"cpu\")\n",
|
||||
"print(tensor_1 + tensor_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "c8j1cWDcWAMf"
|
||||
},
|
||||
"source": [
|
||||
"### A.9.2 Single-GPU training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"id": "GyY59cjieitv"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train = torch.tensor([\n",
|
||||
" [-1.2, 3.1],\n",
|
||||
" [-0.9, 2.9],\n",
|
||||
" [-0.5, 2.6],\n",
|
||||
" [2.3, -1.1],\n",
|
||||
" [2.7, -1.5]\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"y_train = torch.tensor([0, 0, 0, 1, 1])\n",
|
||||
"\n",
|
||||
"X_test = torch.tensor([\n",
|
||||
" [-0.8, 2.8],\n",
|
||||
" [2.6, -1.6],\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"y_test = torch.tensor([0, 1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"id": "v41gKqEJempa"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.utils.data import Dataset\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class ToyDataset(Dataset):\n",
|
||||
" def __init__(self, X, y):\n",
|
||||
" self.features = X\n",
|
||||
" self.labels = y\n",
|
||||
"\n",
|
||||
" def __getitem__(self, index):\n",
|
||||
" one_x = self.features[index]\n",
|
||||
" one_y = self.labels[index]\n",
|
||||
" return one_x, one_y\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return self.labels.shape[0]\n",
|
||||
"\n",
|
||||
"train_ds = ToyDataset(X_train, y_train)\n",
|
||||
"test_ds = ToyDataset(X_test, y_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {
|
||||
"id": "UPGVRuylep8Y"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from torch.utils.data import DataLoader\n",
|
||||
"\n",
|
||||
"torch.manual_seed(123)\n",
|
||||
"\n",
|
||||
"train_loader = DataLoader(\n",
|
||||
" dataset=train_ds,\n",
|
||||
" batch_size=2,\n",
|
||||
" shuffle=True,\n",
|
||||
" num_workers=1,\n",
|
||||
" drop_last=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"test_loader = DataLoader(\n",
|
||||
" dataset=test_ds,\n",
|
||||
" batch_size=2,\n",
|
||||
" shuffle=False,\n",
|
||||
" num_workers=1\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {
|
||||
"id": "drhg6IXofAXh"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class NeuralNetwork(torch.nn.Module):\n",
|
||||
" def __init__(self, num_inputs, num_outputs):\n",
|
||||
" super().__init__()\n",
|
||||
"\n",
|
||||
" self.layers = torch.nn.Sequential(\n",
|
||||
"\n",
|
||||
" # 1st hidden layer\n",
|
||||
" torch.nn.Linear(num_inputs, 30),\n",
|
||||
" torch.nn.ReLU(),\n",
|
||||
"\n",
|
||||
" # 2nd hidden layer\n",
|
||||
" torch.nn.Linear(30, 20),\n",
|
||||
" torch.nn.ReLU(),\n",
|
||||
"\n",
|
||||
" # output layer\n",
|
||||
" torch.nn.Linear(20, num_outputs),\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" logits = self.layers(x)\n",
|
||||
" return logits"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "7jaS5sqPWCY0",
|
||||
"outputId": "84c74615-38f2-48b8-eeda-b5912fed1d3a"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75\n",
|
||||
"Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65\n",
|
||||
"Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44\n",
|
||||
"Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13\n",
|
||||
"Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03\n",
|
||||
"Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch.nn.functional as F\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"torch.manual_seed(123)\n",
|
||||
"model = NeuralNetwork(num_inputs=2, num_outputs=2)\n",
|
||||
"\n",
|
||||
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # NEW\n",
|
||||
"model = model.to(device) # NEW\n",
|
||||
"\n",
|
||||
"optimizer = torch.optim.SGD(model.parameters(), lr=0.5)\n",
|
||||
"\n",
|
||||
"num_epochs = 3\n",
|
||||
"\n",
|
||||
"for epoch in range(num_epochs):\n",
|
||||
"\n",
|
||||
" model.train()\n",
|
||||
" for batch_idx, (features, labels) in enumerate(train_loader):\n",
|
||||
"\n",
|
||||
" features, labels = features.to(device), labels.to(device) # NEW\n",
|
||||
" logits = model(features)\n",
|
||||
" loss = F.cross_entropy(logits, labels) # Loss function\n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
" ### LOGGING\n",
|
||||
" print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
|
||||
" f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
|
||||
" f\" | Train/Val Loss: {loss:.2f}\")\n",
|
||||
"\n",
|
||||
" model.eval()\n",
|
||||
" # Optional model evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {
|
||||
"id": "4qrlmnPPe7FO"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def compute_accuracy(model, dataloader, device):\n",
|
||||
"\n",
|
||||
" model = model.eval()\n",
|
||||
" correct = 0.0\n",
|
||||
" total_examples = 0\n",
|
||||
"\n",
|
||||
" for idx, (features, labels) in enumerate(dataloader):\n",
|
||||
"\n",
|
||||
" features, labels = features.to(device), labels.to(device) # New\n",
|
||||
"\n",
|
||||
" with torch.no_grad():\n",
|
||||
" logits = model(features)\n",
|
||||
"\n",
|
||||
" predictions = torch.argmax(logits, dim=1)\n",
|
||||
" compare = labels == predictions\n",
|
||||
" correct += torch.sum(compare)\n",
|
||||
" total_examples += len(compare)\n",
|
||||
"\n",
|
||||
" return (correct / total_examples).item()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "1_-BfkfEf4HX",
|
||||
"outputId": "473bf21d-5880-4de3-fc8a-051d75315b94"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"compute_accuracy(model, train_loader, device=device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "iYtXKBGEgKss",
|
||||
"outputId": "508edd84-3fb7-4d04-cb23-9df0c3d24170"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"compute_accuracy(model, test_loader, device=device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### A.9.3 Training with multiple GPUs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"See [DDP-script.py](DDP-script.py)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/12.webp\" width=\"600px\">\n",
|
||||
"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/13.webp\" width=\"600px\">"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"gpuType": "T4",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
186
appendix-A/01_main-chapter-code/exercise-solutions.ipynb
Normal file
186
appendix-A/01_main-chapter-code/exercise-solutions.ipynb
Normal file
@@ -0,0 +1,186 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font size=\"1\">\n",
|
||||
"Supplementary code for \"Build a Large Language Model From Scratch\": <a href=\"https://www.manning.com/books/build-a-large-language-model-from-scratch\">https://www.manning.com/books/build-a-large-language-model-from-scratch</a> by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
|
||||
"Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
|
||||
"</font>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exercise A.3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"class NeuralNetwork(torch.nn.Module):\n",
|
||||
" def __init__(self, num_inputs, num_outputs):\n",
|
||||
" super().__init__()\n",
|
||||
"\n",
|
||||
" self.layers = torch.nn.Sequential(\n",
|
||||
" \n",
|
||||
" # 1st hidden layer\n",
|
||||
" torch.nn.Linear(num_inputs, 30),\n",
|
||||
" torch.nn.ReLU(),\n",
|
||||
"\n",
|
||||
" # 2nd hidden layer\n",
|
||||
" torch.nn.Linear(30, 20),\n",
|
||||
" torch.nn.ReLU(),\n",
|
||||
"\n",
|
||||
" # output layer\n",
|
||||
" torch.nn.Linear(20, num_outputs),\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" logits = self.layers(x)\n",
|
||||
" return logits"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total number of trainable model parameters: 752\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = NeuralNetwork(2, 2)\n",
|
||||
"\n",
|
||||
"num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
|
||||
"print(\"Total number of trainable model parameters:\", num_params)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exercise A.4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"id": "qGgnamiyLJxp"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"a = torch.rand(100, 200)\n",
|
||||
"b = torch.rand(200, 300)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "CvGvIeVkLzXE",
|
||||
"outputId": "44d027be-0787-4348-9c06-4e559d94d0e1"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"63.8 µs ± 8.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%timeit a @ b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"id": "OmRtZLa9L2ZG"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a, b = a.to(\"cuda\"), b.to(\"cuda\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "duLEhXDPL6k0",
|
||||
"outputId": "3486471d-fd62-446f-9855-2d01f41fd101"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"13.8 µs ± 425 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%timeit a @ b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "Zqqa-To2L749"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"gpuType": "V100",
|
||||
"machine_shape": "hm",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Reference in New Issue
Block a user