From 1d6f2c908413b7744bc9912023129333d92b3ca1 Mon Sep 17 00:00:00 2001
From: rasbt <mail@sebastianraschka.com>
Date: Sun, 11 Feb 2024 14:46:05 -0600
Subject: [PATCH] rearrange exercise order

---
 .../exercise-solutions.ipynb                  | 457 +++++++++---------
 1 file changed, 228 insertions(+), 229 deletions(-)

diff --git a/ch04/01_main-chapter-code/exercise-solutions.ipynb b/ch04/01_main-chapter-code/exercise-solutions.ipynb
index 5291396..5167827 100644
--- a/ch04/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch04/01_main-chapter-code/exercise-solutions.ipynb
@@ -10,15 +10,239 @@
   },
   {
    "cell_type": "markdown",
-   "id": "33dfa199-9aee-41d4-a64b-7e3811b9a616",
+   "id": "5fea8be3-30a1-4623-a6d7-b095c6c1092e",
    "metadata": {},
    "source": [
-    "# Exercise 4.1: Using separate dropout parameters"
+    "# Exercise 4.1: Parameters in the feed forward versus attention module"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
+   "id": "2751b0e5-ffd3-4be2-8db3-e20dd4d61d69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gpt import TransformerBlock\n",
+    "\n",
+    "GPT_CONFIG_124M = {\n",
+    "    \"vocab_size\": 50257,\n",
+    "    \"ctx_len\": 1024,\n",
+    "    \"emb_dim\": 768,\n",
+    "    \"n_heads\": 12,\n",
+    "    \"n_layers\": 12,\n",
+    "    \"drop_rate\": 0.1,\n",
+    "    \"qkv_bias\": False\n",
+    "}\n",
+    "\n",
+    "block = TransformerBlock(GPT_CONFIG_124M)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1bcaffd1-0cf6-4f8f-bd53-ab88a37f443e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of parameters in feed forward module: 4,722,432\n"
+     ]
+    }
+   ],
+   "source": [
+    "total_params = sum(p.numel() for p in block.ff.parameters())\n",
+    "print(f\"Total number of parameters in feed forward module: {total_params:,}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c1dd06c1-ab6c-4df7-ba73-f9cd54b31138",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total number of parameters in attention module: 2,360,064\n"
+     ]
+    }
+   ],
+   "source": [
+    "total_params = sum(p.numel() for p in block.att.parameters())\n",
+    "print(f\"Total number of parameters in attention module: {total_params:,}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15463dec-520a-47b4-b3ad-e180394fd076",
+   "metadata": {},
+   "source": [
+    "- The results above are for a single transformer block\n",
+    "- Optionally multiply by 12 (the `n_layers` value in `GPT_CONFIG_124M`) to capture all transformer blocks in the 124M GPT model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f7b7c7f-0fa1-4d30-ab44-e499edd55b6d",
+   "metadata": {},
+   "source": [
+    "# Exercise 4.2: Initialize larger GPT models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "310b2e05-3ec8-47fc-afd9-83bf03d4aad8",
+   "metadata": {},
+   "source": [
+    "- **GPT2-small** (the 124M configuration we already implemented):\n",
+    "    - \"emb_dim\" = 768\n",
+    "    - \"n_layers\" = 12\n",
+    "    - \"n_heads\" = 12\n",
+    "\n",
+    "- **GPT2-medium:**\n",
+    "    - \"emb_dim\" = 1024\n",
+    "    - \"n_layers\" = 24\n",
+    "    - \"n_heads\" = 16\n",
+    "\n",
+    "- **GPT2-large:**\n",
+    "    - \"emb_dim\" = 1280\n",
+    "    - \"n_layers\" = 36\n",
+    "    - \"n_heads\" = 20\n",
+    "\n",
+    "- **GPT2-XL:**\n",
+    "    - \"emb_dim\" = 1600\n",
+    "    - \"n_layers\" = 48\n",
+    "    - \"n_heads\" = 25"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "90185dea-81ca-4cdc-aef7-4aaf95cba946",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GPT_CONFIG_124M = {\n",
+    "    \"vocab_size\": 50257,\n",
+    "    \"ctx_len\": 1024,\n",
+    "    \"emb_dim\": 768,\n",
+    "    \"n_heads\": 12,\n",
+    "    \"n_layers\": 12,\n",
+    "    \"drop_rate\": 0.1,\n",
+    "    \"qkv_bias\": False\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def get_config(base_config, model_name=\"gpt2-small\"):\n",
+    "    GPT_CONFIG = base_config.copy()\n",
+    "\n",
+    "    if model_name == \"gpt2-small\":\n",
+    "        GPT_CONFIG[\"emb_dim\"] = 768\n",
+    "        GPT_CONFIG[\"n_layers\"] = 12\n",
+    "        GPT_CONFIG[\"n_heads\"] = 12\n",
+    "\n",
+    "    elif model_name == \"gpt2-medium\":\n",
+    "        GPT_CONFIG[\"emb_dim\"] = 1024\n",
+    "        GPT_CONFIG[\"n_layers\"] = 24\n",
+    "        GPT_CONFIG[\"n_heads\"] = 16\n",
+    "\n",
+    "    elif model_name == \"gpt2-large\":\n",
+    "        GPT_CONFIG[\"emb_dim\"] = 1280\n",
+    "        GPT_CONFIG[\"n_layers\"] = 36\n",
+    "        GPT_CONFIG[\"n_heads\"] = 20\n",
+    "\n",
+    "    elif model_name == \"gpt2-xl\":\n",
+    "        GPT_CONFIG[\"emb_dim\"] = 1600\n",
+    "        GPT_CONFIG[\"n_layers\"] = 48\n",
+    "        GPT_CONFIG[\"n_heads\"] = 25\n",
+    "\n",
+    "    else:\n",
+    "        raise ValueError(f\"Incorrect model name {model_name}\")\n",
+    "\n",
+    "    return GPT_CONFIG\n",
+    "\n",
+    "\n",
+    "def calculate_size(model): # based on chapter code\n",
+    "    \n",
+    "    total_params = sum(p.numel() for p in model.parameters())\n",
+    "    print(f\"Total number of parameters: {total_params:,}\")\n",
+    "\n",
+    "    total_params_gpt2 =  total_params - sum(p.numel() for p in model.out_head.parameters())\n",
+    "    print(f\"Number of trainable parameters considering weight tying: {total_params_gpt2:,}\")\n",
+    "    \n",
+    "    # Calculate the total size in bytes (assuming float32, 4 bytes per parameter)\n",
+    "    total_size_bytes = total_params * 4\n",
+    "    \n",
+    "    # Convert to megabytes\n",
+    "    total_size_mb = total_size_bytes / (1024 * 1024)\n",
+    "    \n",
+    "    print(f\"Total size of the model: {total_size_mb:.2f} MB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "2587e011-78a4-479c-a8fd-961cc40a5fd4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "gpt2-small:\n",
+      "Total number of parameters: 163,009,536\n",
+      "Number of trainable parameters considering weight tying: 124,412,160\n",
+      "Total size of the model: 621.83 MB\n",
+      "\n",
+      "\n",
+      "gpt2-medium:\n",
+      "Total number of parameters: 406,212,608\n",
+      "Number of trainable parameters considering weight tying: 354,749,440\n",
+      "Total size of the model: 1549.58 MB\n",
+      "\n",
+      "\n",
+      "gpt2-large:\n",
+      "Total number of parameters: 838,220,800\n",
+      "Number of trainable parameters considering weight tying: 773,891,840\n",
+      "Total size of the model: 3197.56 MB\n",
+      "\n",
+      "\n",
+      "gpt2-xl:\n",
+      "Total number of parameters: 1,637,792,000\n",
+      "Number of trainable parameters considering weight tying: 1,557,380,800\n",
+      "Total size of the model: 6247.68 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "from gpt import GPTModel\n",
+    "\n",
+    "\n",
+    "for model_abbrev in (\"small\", \"medium\", \"large\", \"xl\"):\n",
+    "    model_name = f\"gpt2-{model_abbrev}\"\n",
+    "    CONFIG = get_config(GPT_CONFIG_124M, model_name=model_name)\n",
+    "    model = GPTModel(CONFIG)\n",
+    "    print(f\"\\n\\n{model_name}:\")\n",
+    "    calculate_size(model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5f2306e-5dc8-498e-92ee-70ae7ec37ac1",
+   "metadata": {},
+   "source": [
+    "# Exercise 4.3: Using separate dropout parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "id": "5fee2cf5-61c3-4167-81b5-44ea155bbaf2",
    "metadata": {},
    "outputs": [],
@@ -39,7 +263,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 7,
    "id": "5aa1b0c1-d78a-48fc-ad08-4802458b43f7",
    "metadata": {},
    "outputs": [],
@@ -120,241 +344,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "id": "1d013d32-c275-4f42-be21-9010f1537227",
    "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
-    "import tiktoken\n",
     "\n",
     "torch.manual_seed(123)\n",
     "model = GPTModel(GPT_CONFIG_124M)"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5fea8be3-30a1-4623-a6d7-b095c6c1092e",
-   "metadata": {},
-   "source": [
-    "# Exercise 4.2: Parameters in the feed forward versus attention module"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "2751b0e5-ffd3-4be2-8db3-e20dd4d61d69",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from gpt import TransformerBlock\n",
-    "\n",
-    "GPT_CONFIG_124M = {\n",
-    "    \"vocab_size\": 50257,\n",
-    "    \"ctx_len\": 1024,\n",
-    "    \"emb_dim\": 768,\n",
-    "    \"n_heads\": 12,\n",
-    "    \"n_layers\": 12,\n",
-    "    \"drop_rate\": 0.1,\n",
-    "    \"qkv_bias\": False\n",
-    "}\n",
-    "\n",
-    "model = TransformerBlock(GPT_CONFIG_124M)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "1bcaffd1-0cf6-4f8f-bd53-ab88a37f443e",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Total number of parameters in feed forward module: 4,722,432\n"
-     ]
-    }
-   ],
-   "source": [
-    "total_params = sum(p.numel() for p in block.ff.parameters())\n",
-    "print(f\"Total number of parameters in feed forward module: {total_params:,}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "c1dd06c1-ab6c-4df7-ba73-f9cd54b31138",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Total number of parameters in feed forward module: 2,360,064\n"
-     ]
-    }
-   ],
-   "source": [
-    "total_params = sum(p.numel() for p in block.att.parameters())\n",
-    "print(f\"Total number of parameters in attention module: {total_params:,}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "15463dec-520a-47b4-b3ad-e180394fd076",
-   "metadata": {},
-   "source": [
-    "- The results above are for a single transformer block\n",
-    "- Optionally multiply by 12 to capture all transformer blocks in the 124M GPT model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0f7b7c7f-0fa1-4d30-ab44-e499edd55b6d",
-   "metadata": {},
-   "source": [
-    "# Exercise 4.3: Initialize larger GPT models"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "310b2e05-3ec8-47fc-afd9-83bf03d4aad8",
-   "metadata": {},
-   "source": [
-    "- **GPT2-small** (the 124M configuration we already implemented):\n",
-    "    - \"emb_dim\" = 768\n",
-    "    - \"n_layers\" = 12\n",
-    "    - \"n_heads\" = 12\n",
-    "\n",
-    "- **GPT2-medium:**\n",
-    "    - \"emb_dim\" = 1024\n",
-    "    - \"n_layers\" = 24\n",
-    "    - \"n_heads\" = 16\n",
-    "\n",
-    "- **GPT2-large:**\n",
-    "    - \"emb_dim\" = 1280\n",
-    "    - \"n_layers\" = 36\n",
-    "    - \"n_heads\" = 20\n",
-    "\n",
-    "- **GPT2-XL:**\n",
-    "    - \"emb_dim\" = 1600\n",
-    "    - \"n_layers\" = 48\n",
-    "    - \"n_heads\" = 25"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "90185dea-81ca-4cdc-aef7-4aaf95cba946",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "GPT_CONFIG_124M = {\n",
-    "    \"vocab_size\": 50257,\n",
-    "    \"ctx_len\": 1024,\n",
-    "    \"emb_dim\": 768,\n",
-    "    \"n_heads\": 12,\n",
-    "    \"n_layers\": 12,\n",
-    "    \"drop_rate\": 0.1,\n",
-    "    \"qkv_bias\": False\n",
-    "}\n",
-    "\n",
-    "\n",
-    "def get_config(base_config, model_name=\"gpt2-small\"):\n",
-    "    GPT_CONFIG = base_config.copy()\n",
-    "\n",
-    "    if model_name == \"gpt2-small\":\n",
-    "        GPT_CONFIG[\"emb_dim\"] = 768\n",
-    "        GPT_CONFIG[\"n_layers\"] = 12\n",
-    "        GPT_CONFIG[\"n_heads\"] = 12\n",
-    "\n",
-    "    elif model_name == \"gpt2-medium\":\n",
-    "        GPT_CONFIG[\"emb_dim\"] = 1024\n",
-    "        GPT_CONFIG[\"n_layers\"] = 24\n",
-    "        GPT_CONFIG[\"n_heads\"] = 16\n",
-    "\n",
-    "    elif model_name == \"gpt2-large\":\n",
-    "        GPT_CONFIG[\"emb_dim\"] = 1280\n",
-    "        GPT_CONFIG[\"n_layers\"] = 36\n",
-    "        GPT_CONFIG[\"n_heads\"] = 20\n",
-    "\n",
-    "    elif model_name == \"gpt2-xl\":\n",
-    "        GPT_CONFIG[\"emb_dim\"] = 1600\n",
-    "        GPT_CONFIG[\"n_layers\"] = 48\n",
-    "        GPT_CONFIG[\"n_heads\"] = 25\n",
-    "\n",
-    "    else:\n",
-    "        raise ValueError(f\"Incorrect model name {model_name}\")\n",
-    "\n",
-    "    return GPT_CONFIG\n",
-    "\n",
-    "\n",
-    "def calculate_size(model): # based on chapter code\n",
-    "    \n",
-    "    total_params = sum(p.numel() for p in model.parameters())\n",
-    "    print(f\"Total number of parameters: {total_params:,}\")\n",
-    "\n",
-    "    total_params_gpt2 =  total_params - sum(p.numel() for p in model.out_head.parameters())\n",
-    "    print(f\"Number of trainable parameters considering weight tying: {total_params_gpt2:,}\")\n",
-    "    \n",
-    "    # Calculate the total size in bytes (assuming float32, 4 bytes per parameter)\n",
-    "    total_size_bytes = total_params * 4\n",
-    "    \n",
-    "    # Convert to megabytes\n",
-    "    total_size_mb = total_size_bytes / (1024 * 1024)\n",
-    "    \n",
-    "    print(f\"Total size of the model: {total_size_mb:.2f} MB\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "2587e011-78a4-479c-a8fd-961cc40a5fd4",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "gpt2-small:\n",
-      "Total number of parameters: 163,009,536\n",
-      "Number of trainable parameters considering weight tying: 124,412,160\n",
-      "Total size of the model: 621.83 MB\n",
-      "\n",
-      "\n",
-      "gpt2-medium:\n",
-      "Total number of parameters: 406,212,608\n",
-      "Number of trainable parameters considering weight tying: 354,749,440\n",
-      "Total size of the model: 1549.58 MB\n",
-      "\n",
-      "\n",
-      "gpt2-large:\n",
-      "Total number of parameters: 838,220,800\n",
-      "Number of trainable parameters considering weight tying: 773,891,840\n",
-      "Total size of the model: 3197.56 MB\n",
-      "\n",
-      "\n",
-      "gpt2-xl:\n",
-      "Total number of parameters: 1,637,792,000\n",
-      "Number of trainable parameters considering weight tying: 1,557,380,800\n",
-      "Total size of the model: 6247.68 MB\n"
-     ]
-    }
-   ],
-   "source": [
-    "from gpt import GPTModel\n",
-    "\n",
-    "\n",
-    "for model_abbrev in (\"small\", \"medium\", \"large\", \"xl\"):\n",
-    "    model_name = f\"gpt2-{model_abbrev}\"\n",
-    "    CONFIG = get_config(GPT_CONFIG_124M, model_name=model_name)\n",
-    "    model = GPTModel(CONFIG)\n",
-    "    print(f\"\\n\\n{model_name}:\")\n",
-    "    calculate_size(model)"
-   ]
   }
  ],
  "metadata": {