Organized setup instructions (#115)

* Organized setup instructions * update tests * link checker action * raise error upon broken link * fix links * fix links * delete duplicated paragraph
2026-04-10 12:33:42 +00:00 · 2024-04-10 22:09:46 -04:00
parent 0b866c133f
commit 790d0808b2
39 changed files with 75 additions and 49 deletions
--- a/appendix-A/01_main-chapter-code/DDP-script.py
+++ b/appendix-A/01_main-chapter-code/DDP-script.py
@@ -0,0 +1,182 @@
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+# Source for "Build a Large Language Model From Scratch"
+#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
+# Code: https://github.com/rasbt/LLMs-from-scratch
+
+# Appendix A: Introduction to PyTorch (Part 3)
+
+import torch
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+
+# NEW imports:
+import os
+import torch.multiprocessing as mp
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed import init_process_group, destroy_process_group
+
+
+# NEW: function to initialize a distributed process group (1 process / GPU)
+# this allows communication among processes
+def ddp_setup(rank, world_size):
+    """
+    Arguments:
+        rank: a unique process ID; also used as the CUDA device index
+        world_size: total number of processes in the group
+    """
+    # address of the machine running the rank:0 process
+    # here, we assume all GPUs are on the same machine
+    os.environ["MASTER_ADDR"] = "localhost"
+    # any free port on the machine
+    os.environ["MASTER_PORT"] = "12345"
+
+    # initialize process group
+    # Windows users may have to use "gloo" instead of "nccl" as backend
+    # nccl: NVIDIA Collective Communication Library
+    init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    torch.cuda.set_device(rank)
+
+
+class ToyDataset(Dataset):
+    def __init__(self, X, y):
+        self.features = X
+        self.labels = y
+
+    def __getitem__(self, index):
+        one_x = self.features[index]
+        one_y = self.labels[index]
+        return one_x, one_y
+
+    def __len__(self):
+        return self.labels.shape[0]
+
+
+class NeuralNetwork(torch.nn.Module):
+    def __init__(self, num_inputs, num_outputs):
+        super().__init__()
+
+        self.layers = torch.nn.Sequential(
+            # 1st hidden layer
+            torch.nn.Linear(num_inputs, 30),
+            torch.nn.ReLU(),
+
+            # 2nd hidden layer
+            torch.nn.Linear(30, 20),
+            torch.nn.ReLU(),
+
+            # output layer
+            torch.nn.Linear(20, num_outputs),
+        )
+
+    def forward(self, x):
+        logits = self.layers(x)
+        return logits
+
+
+def prepare_dataset():
+    X_train = torch.tensor([
+        [-1.2, 3.1],
+        [-0.9, 2.9],
+        [-0.5, 2.6],
+        [2.3, -1.1],
+        [2.7, -1.5]
+    ])
+    y_train = torch.tensor([0, 0, 0, 1, 1])
+
+    X_test = torch.tensor([
+        [-0.8, 2.8],
+        [2.6, -1.6],
+    ])
+    y_test = torch.tensor([0, 1])
+
+    train_ds = ToyDataset(X_train, y_train)
+    test_ds = ToyDataset(X_test, y_test)
+
+    train_loader = DataLoader(
+        dataset=train_ds,
+        batch_size=2,
+        shuffle=False,  # NEW: False because of DistributedSampler below
+        pin_memory=True,
+        drop_last=True,
+        # NEW: split the dataset across GPUs so each process loads a distinct subset:
+        sampler=DistributedSampler(train_ds)  # NEW
+    )
+    test_loader = DataLoader(
+        dataset=test_ds,
+        batch_size=2,
+        shuffle=False,
+    )
+    return train_loader, test_loader
+
+
+# NEW: wrapper that mp.spawn runs once per process (rank is passed automatically)
+def main(rank, world_size, num_epochs):
+
+    ddp_setup(rank, world_size)  # NEW: initialize process groups
+
+    train_loader, test_loader = prepare_dataset()
+    model = NeuralNetwork(num_inputs=2, num_outputs=2)
+    model.to(rank)
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
+
+    model = DDP(model, device_ids=[rank])  # NEW: wrap model with DDP
+    # the core model is now accessible as model.module
+
+    for epoch in range(num_epochs):
+
+        model.train()
+        for features, labels in train_loader:
+
+            features, labels = features.to(rank), labels.to(rank)  # New: use rank
+            logits = model(features)
+            loss = F.cross_entropy(logits, labels)  # Loss function
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            # LOGGING
+            print(f"[GPU{rank}] Epoch: {epoch+1:03d}/{num_epochs:03d}"
+                  f" | Batchsize {labels.shape[0]:03d}"
+                  f" | Train/Val Loss: {loss:.2f}")
+
+    model.eval()
+    train_acc = compute_accuracy(model, train_loader, device=rank)
+    print(f"[GPU{rank}] Training accuracy", train_acc)
+    test_acc = compute_accuracy(model, test_loader, device=rank)
+    print(f"[GPU{rank}] Test accuracy", test_acc)
+
+    destroy_process_group()  # NEW: cleanly exit distributed mode
+
+
+def compute_accuracy(model, dataloader, device):
+    model = model.eval()
+    correct = 0.0
+    total_examples = 0
+
+    for idx, (features, labels) in enumerate(dataloader):
+        features, labels = features.to(device), labels.to(device)
+
+        with torch.no_grad():
+            logits = model(features)
+        predictions = torch.argmax(logits, dim=1)
+        compare = labels == predictions
+        correct += torch.sum(compare)
+        total_examples += len(compare)
+    return (correct / total_examples).item()
+
+
+if __name__ == "__main__":
+    print("PyTorch version:", torch.__version__)
+    print("CUDA available:", torch.cuda.is_available())
+    print("Number of GPUs available:", torch.cuda.device_count())
+
+    torch.manual_seed(123)
+
+    # NEW: spawn new processes
+    # note that spawn will automatically pass the rank
+    num_epochs = 3
+    world_size = torch.cuda.device_count()
+    mp.spawn(main, args=(world_size, num_epochs), nprocs=world_size)
+    # nprocs=world_size spawns one process per GPU
--- a/appendix-A/01_main-chapter-code/code-part1.ipynb
+++ b/appendix-A/01_main-chapter-code/code-part1.ipynb
--- a/appendix-A/01_main-chapter-code/code-part2.ipynb
+++ b/appendix-A/01_main-chapter-code/code-part2.ipynb
@@ -0,0 +1,484 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<font size=\"1\">\n",
+    "Supplementary code for \"Build a Large Language Model From Scratch\": <a href=\"https://www.manning.com/books/build-a-large-language-model-from-scratch\">https://www.manning.com/books/build-a-large-language-model-from-scratch</a> by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
+    "Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
+    "</font>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "O9i6kzBsZVaZ"
+   },
+   "source": [
+    "# Appendix A: Introduction to PyTorch (Part 2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ppbG5d-NZezH"
+   },
+   "source": [
+    "## A.9 Optimizing training performance with GPUs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "6jH0J_DPZhbn"
+   },
+   "source": [
+    "### A.9.1 PyTorch computations on GPU devices"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "RM7kGhwMF_nO",
+    "outputId": "ac60b048-b81f-4bb0-90fa-1ca474f04e9a"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2.0.1+cu118\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "print(torch.__version__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "OXLCKXhiUkZt",
+    "outputId": "39fe5366-287e-47eb-cc34-3508d616c4f9"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(torch.cuda.is_available())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "MTTlfh53Va-T",
+    "outputId": "f31d8bbe-577f-4db4-9939-02e66b9f96d1"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([5., 7., 9.])"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tensor_1 = torch.tensor([1., 2., 3.])\n",
+    "tensor_2 = torch.tensor([4., 5., 6.])\n",
+    "\n",
+    "print(tensor_1 + tensor_2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Z4LwTNw7Vmmb",
+    "outputId": "1c025c6a-e3ed-4c7c-f5fd-86c14607036e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([5., 7., 9.], device='cuda:0')\n"
+     ]
+    }
+   ],
+   "source": [
+    "tensor_1 = tensor_1.to(\"cuda\")\n",
+    "tensor_2 = tensor_2.to(\"cuda\")\n",
+    "\n",
+    "print(tensor_1 + tensor_2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 184
+    },
+    "id": "tKT6URN1Vuft",
+    "outputId": "e6f01e7f-d9cf-44cb-cc6d-46fc7907d5c0"
+   },
+   "outputs": [
+    {
+     "ename": "RuntimeError",
+     "evalue": "ignored",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-7-4ff3c4d20fc3>\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mtensor_1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtensor_1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"cpu\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor_1\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mtensor_2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"
+     ]
+    }
+   ],
+   "source": [
+    "tensor_1 = tensor_1.to(\"cpu\")\n",
+    "print(tensor_1 + tensor_2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "c8j1cWDcWAMf"
+   },
+   "source": [
+    "### A.9.2 Single-GPU training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "id": "GyY59cjieitv"
+   },
+   "outputs": [],
+   "source": [
+    "X_train = torch.tensor([\n",
+    "    [-1.2, 3.1],\n",
+    "    [-0.9, 2.9],\n",
+    "    [-0.5, 2.6],\n",
+    "    [2.3, -1.1],\n",
+    "    [2.7, -1.5]\n",
+    "])\n",
+    "\n",
+    "y_train = torch.tensor([0, 0, 0, 1, 1])\n",
+    "\n",
+    "X_test = torch.tensor([\n",
+    "    [-0.8, 2.8],\n",
+    "    [2.6, -1.6],\n",
+    "])\n",
+    "\n",
+    "y_test = torch.tensor([0, 1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "id": "v41gKqEJempa"
+   },
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "\n",
+    "class ToyDataset(Dataset):\n",
+    "    def __init__(self, X, y):\n",
+    "        self.features = X\n",
+    "        self.labels = y\n",
+    "\n",
+    "    def __getitem__(self, index):\n",
+    "        one_x = self.features[index]\n",
+    "        one_y = self.labels[index]\n",
+    "        return one_x, one_y\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return self.labels.shape[0]\n",
+    "\n",
+    "train_ds = ToyDataset(X_train, y_train)\n",
+    "test_ds = ToyDataset(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "id": "UPGVRuylep8Y"
+   },
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import DataLoader\n",
+    "\n",
+    "torch.manual_seed(123)\n",
+    "\n",
+    "train_loader = DataLoader(\n",
+    "    dataset=train_ds,\n",
+    "    batch_size=2,\n",
+    "    shuffle=True,\n",
+    "    num_workers=1,\n",
+    "    drop_last=True\n",
+    ")\n",
+    "\n",
+    "test_loader = DataLoader(\n",
+    "    dataset=test_ds,\n",
+    "    batch_size=2,\n",
+    "    shuffle=False,\n",
+    "    num_workers=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "id": "drhg6IXofAXh"
+   },
+   "outputs": [],
+   "source": [
+    "class NeuralNetwork(torch.nn.Module):\n",
+    "    def __init__(self, num_inputs, num_outputs):\n",
+    "        super().__init__()\n",
+    "\n",
+    "        self.layers = torch.nn.Sequential(\n",
+    "\n",
+    "            # 1st hidden layer\n",
+    "            torch.nn.Linear(num_inputs, 30),\n",
+    "            torch.nn.ReLU(),\n",
+    "\n",
+    "            # 2nd hidden layer\n",
+    "            torch.nn.Linear(30, 20),\n",
+    "            torch.nn.ReLU(),\n",
+    "\n",
+    "            # output layer\n",
+    "            torch.nn.Linear(20, num_outputs),\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        logits = self.layers(x)\n",
+    "        return logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "7jaS5sqPWCY0",
+    "outputId": "84c74615-38f2-48b8-eeda-b5912fed1d3a"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch: 001/003 | Batch 000/002 | Train/Val Loss: 0.75\n",
+      "Epoch: 001/003 | Batch 001/002 | Train/Val Loss: 0.65\n",
+      "Epoch: 002/003 | Batch 000/002 | Train/Val Loss: 0.44\n",
+      "Epoch: 002/003 | Batch 001/002 | Train/Val Loss: 0.13\n",
+      "Epoch: 003/003 | Batch 000/002 | Train/Val Loss: 0.03\n",
+      "Epoch: 003/003 | Batch 001/002 | Train/Val Loss: 0.00\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch.nn.functional as F\n",
+    "\n",
+    "\n",
+    "torch.manual_seed(123)\n",
+    "model = NeuralNetwork(num_inputs=2, num_outputs=2)\n",
+    "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # NEW\n",
+    "model = model.to(device) # NEW\n",
+    "\n",
+    "optimizer = torch.optim.SGD(model.parameters(), lr=0.5)\n",
+    "\n",
+    "num_epochs = 3\n",
+    "\n",
+    "for epoch in range(num_epochs):\n",
+    "\n",
+    "    model.train()\n",
+    "    for batch_idx, (features, labels) in enumerate(train_loader):\n",
+    "\n",
+    "        features, labels = features.to(device), labels.to(device) # NEW\n",
+    "        logits = model(features)\n",
+    "        loss = F.cross_entropy(logits, labels) # Loss function\n",
+    "\n",
+    "        optimizer.zero_grad()\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "        ### LOGGING\n",
+    "        print(f\"Epoch: {epoch+1:03d}/{num_epochs:03d}\"\n",
+    "              f\" | Batch {batch_idx:03d}/{len(train_loader):03d}\"\n",
+    "              f\" | Train/Val Loss: {loss:.2f}\")\n",
+    "\n",
+    "    model.eval()\n",
+    "    # Optional model evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "id": "4qrlmnPPe7FO"
+   },
+   "outputs": [],
+   "source": [
+    "def compute_accuracy(model, dataloader, device):\n",
+    "\n",
+    "    model = model.eval()\n",
+    "    correct = 0.0\n",
+    "    total_examples = 0\n",
+    "\n",
+    "    for idx, (features, labels) in enumerate(dataloader):\n",
+    "\n",
+    "        features, labels = features.to(device), labels.to(device) # New\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            logits = model(features)\n",
+    "\n",
+    "        predictions = torch.argmax(logits, dim=1)\n",
+    "        compare = labels == predictions\n",
+    "        correct += torch.sum(compare)\n",
+    "        total_examples += len(compare)\n",
+    "\n",
+    "    return (correct / total_examples).item()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "1_-BfkfEf4HX",
+    "outputId": "473bf21d-5880-4de3-fc8a-051d75315b94"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1.0"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "compute_accuracy(model, train_loader, device=device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "iYtXKBGEgKss",
+    "outputId": "508edd84-3fb7-4d04-cb23-9df0c3d24170"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1.0"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "compute_accuracy(model, test_loader, device=device)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### A.9.3 Training with multiple GPUs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "See [DDP-script.py](DDP-script.py)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/12.webp\" width=\"600px\">\n",
+    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/appendix-a_compressed/13.webp\" width=\"600px\">"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/appendix-A/01_main-chapter-code/exercise-solutions.ipynb
+++ b/appendix-A/01_main-chapter-code/exercise-solutions.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<font size=\"1\">\n",
+    "Supplementary code for \"Build a Large Language Model From Scratch\": <a href=\"https://www.manning.com/books/build-a-large-language-model-from-scratch\">https://www.manning.com/books/build-a-large-language-model-from-scratch</a> by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
+    "Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
+    "</font>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise A.3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "class NeuralNetwork(torch.nn.Module):\n",
+    "    def __init__(self, num_inputs, num_outputs):\n",
+    "        super().__init__()\n",
+    "\n",
+    "        self.layers = torch.nn.Sequential(\n",
+    "                \n",
+    "            # 1st hidden layer\n",
+    "            torch.nn.Linear(num_inputs, 30),\n",
+    "            torch.nn.ReLU(),\n",
+    "\n",
+    "            # 2nd hidden layer\n",
+    "            torch.nn.Linear(30, 20),\n",
+    "            torch.nn.ReLU(),\n",
+    "\n",
+    "            # output layer\n",
+    "            torch.nn.Linear(20, num_outputs),\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        logits = self.layers(x)\n",
+    "        return logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of trainable model parameters: 752\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = NeuralNetwork(2, 2)\n",
+    "\n",
+    "num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
+    "print(\"Total number of trainable model parameters:\", num_params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise A.4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "qGgnamiyLJxp"
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "a = torch.rand(100, 200)\n",
+    "b = torch.rand(200, 300)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "CvGvIeVkLzXE",
+    "outputId": "44d027be-0787-4348-9c06-4e559d94d0e1"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "63.8 µs ± 8.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit a @ b"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "OmRtZLa9L2ZG"
+   },
+   "outputs": [],
+   "source": [
+    "a, b = a.to(\"cuda\"), b.to(\"cuda\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "duLEhXDPL6k0",
+    "outputId": "3486471d-fd62-446f-9855-2d01f41fd101"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "13.8 µs ± 425 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit a @ b"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Zqqa-To2L749"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "V100",
+   "machine_shape": "hm",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}