From 1d6f2c908413b7744bc9912023129333d92b3ca1 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sun, 11 Feb 2024 14:46:05 -0600 Subject: [PATCH] rearrange exercise order --- .../exercise-solutions.ipynb | 457 +++++++++--------- 1 file changed, 228 insertions(+), 229 deletions(-) diff --git a/ch04/01_main-chapter-code/exercise-solutions.ipynb b/ch04/01_main-chapter-code/exercise-solutions.ipynb index 5291396..5167827 100644 --- a/ch04/01_main-chapter-code/exercise-solutions.ipynb +++ b/ch04/01_main-chapter-code/exercise-solutions.ipynb @@ -10,15 +10,239 @@ }, { "cell_type": "markdown", - "id": "33dfa199-9aee-41d4-a64b-7e3811b9a616", + "id": "5fea8be3-30a1-4623-a6d7-b095c6c1092e", "metadata": {}, "source": [ - "# Exercise 4.1: Using separate dropout parameters" + "# Exercise 4.1: Parameters in the feed forward versus attention module" ] }, { "cell_type": "code", "execution_count": 1, + "id": "2751b0e5-ffd3-4be2-8db3-e20dd4d61d69", + "metadata": {}, + "outputs": [], + "source": [ + "from gpt import TransformerBlock\n", + "\n", + "GPT_CONFIG_124M = {\n", + " \"vocab_size\": 50257,\n", + " \"ctx_len\": 1024,\n", + " \"emb_dim\": 768,\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 12,\n", + " \"drop_rate\": 0.1,\n", + " \"qkv_bias\": False\n", + "}\n", + "\n", + "block = TransformerBlock(GPT_CONFIG_124M)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1bcaffd1-0cf6-4f8f-bd53-ab88a37f443e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of parameters in feed forward module: 4,722,432\n" + ] + } + ], + "source": [ + "total_params = sum(p.numel() for p in block.ff.parameters())\n", + "print(f\"Total number of parameters in feed forward module: {total_params:,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c1dd06c1-ab6c-4df7-ba73-f9cd54b31138", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of parameters in attention module: 2,360,064\n" + ] + } + ], + "source": [ + "total_params = sum(p.numel() for p in block.att.parameters())\n", + "print(f\"Total number of parameters in attention module: {total_params:,}\")" + ] + }, + { + "cell_type": "markdown", + "id": "15463dec-520a-47b4-b3ad-e180394fd076", + "metadata": {}, + "source": [ + "- The results above are for a single transformer block\n", + "- Optionally multiply by 12 to capture all transformer blocks in the 124M GPT model" + ] + }, + { + "cell_type": "markdown", + "id": "0f7b7c7f-0fa1-4d30-ab44-e499edd55b6d", + "metadata": {}, + "source": [ + "# Exercise 4.2: Initialize larger GPT models" + ] + }, + { + "cell_type": "markdown", + "id": "310b2e05-3ec8-47fc-afd9-83bf03d4aad8", + "metadata": {}, + "source": [ + "- **GPT2-small** (the 124M configuration we already implemented):\n", + " - \"emb_dim\" = 768\n", + " - \"n_layers\" = 12\n", + " - \"n_heads\" = 12\n", + "\n", + "- **GPT2-medium:**\n", + " - \"emb_dim\" = 1024\n", + " - \"n_layers\" = 24\n", + " - \"n_heads\" = 16\n", + "\n", + "- **GPT2-large:**\n", + " - \"emb_dim\" = 1280\n", + " - \"n_layers\" = 36\n", + " - \"n_heads\" = 20\n", + "\n", + "- **GPT2-XL:**\n", + " - \"emb_dim\" = 1600\n", + " - \"n_layers\" = 48\n", + " - \"n_heads\" = 25" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "90185dea-81ca-4cdc-aef7-4aaf95cba946", + "metadata": {}, + "outputs": [], + "source": [ + "GPT_CONFIG_124M = {\n", + " \"vocab_size\": 50257,\n", + " \"ctx_len\": 1024,\n", + " \"emb_dim\": 768,\n", + " \"n_heads\": 12,\n", + " \"n_layers\": 12,\n", + " \"drop_rate\": 0.1,\n", + " \"qkv_bias\": False\n", + "}\n", + "\n", + "\n", + "def get_config(base_config, model_name=\"gpt2-small\"):\n", + " GPT_CONFIG = base_config.copy()\n", + "\n", + " if model_name == \"gpt2-small\":\n", + " GPT_CONFIG[\"emb_dim\"] = 768\n", + " GPT_CONFIG[\"n_layers\"] = 12\n", + " GPT_CONFIG[\"n_heads\"] = 12\n", + "\n", + " elif model_name == \"gpt2-medium\":\n", + " GPT_CONFIG[\"emb_dim\"] = 1024\n", + " GPT_CONFIG[\"n_layers\"] = 24\n", + " GPT_CONFIG[\"n_heads\"] = 16\n", + "\n", + " elif model_name == \"gpt2-large\":\n", + " GPT_CONFIG[\"emb_dim\"] = 1280\n", + " GPT_CONFIG[\"n_layers\"] = 36\n", + " GPT_CONFIG[\"n_heads\"] = 20\n", + "\n", + " elif model_name == \"gpt2-xl\":\n", + " GPT_CONFIG[\"emb_dim\"] = 1600\n", + " GPT_CONFIG[\"n_layers\"] = 48\n", + " GPT_CONFIG[\"n_heads\"] = 25\n", + "\n", + " else:\n", + " raise ValueError(f\"Incorrect model name {model_name}\")\n", + "\n", + " return GPT_CONFIG\n", + "\n", + "\n", + "def calculate_size(model): # based on chapter code\n", + " \n", + " total_params = sum(p.numel() for p in model.parameters())\n", + " print(f\"Total number of parameters: {total_params:,}\")\n", + "\n", + " total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())\n", + " print(f\"Number of trainable parameters considering weight tying: {total_params_gpt2:,}\")\n", + " \n", + " # Calculate the total size in bytes (assuming float32, 4 bytes per parameter)\n", + " total_size_bytes = total_params * 4\n", + " \n", + " # Convert to megabytes\n", + " total_size_mb = total_size_bytes / (1024 * 1024)\n", + " \n", + " print(f\"Total size of the model: {total_size_mb:.2f} MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2587e011-78a4-479c-a8fd-961cc40a5fd4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "gpt2-small:\n", + "Total number of parameters: 163,009,536\n", + "Number of trainable parameters considering weight tying: 124,412,160\n", + "Total size of the model: 621.83 MB\n", + "\n", + "\n", + "gpt2-medium:\n", + "Total number of parameters: 406,212,608\n", + "Number of trainable parameters considering weight tying: 354,749,440\n", + "Total size of the model: 1549.58 MB\n", + "\n", + "\n", + "gpt2-large:\n", + "Total number of parameters: 838,220,800\n", + "Number of trainable parameters considering weight tying: 773,891,840\n", + "Total size of the model: 3197.56 MB\n", + "\n", + "\n", + "gpt2-xl:\n", + "Total number of parameters: 1,637,792,000\n", + "Number of trainable parameters considering weight tying: 1,557,380,800\n", + "Total size of the model: 6247.68 MB\n" + ] + } + ], + "source": [ + "from gpt import GPTModel\n", + "\n", + "\n", + "for model_abbrev in (\"small\", \"medium\", \"large\", \"xl\"):\n", + " model_name = f\"gpt2-{model_abbrev}\"\n", + " CONFIG = get_config(GPT_CONFIG_124M, model_name=model_name)\n", + " model = GPTModel(CONFIG)\n", + " print(f\"\\n\\n{model_name}:\")\n", + " calculate_size(model)" + ] + }, + { + "cell_type": "markdown", + "id": "f5f2306e-5dc8-498e-92ee-70ae7ec37ac1", + "metadata": {}, + "source": [ + "# Exercise 4.3: Using separate dropout parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "5fee2cf5-61c3-4167-81b5-44ea155bbaf2", "metadata": {}, "outputs": [], @@ -39,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "id": "5aa1b0c1-d78a-48fc-ad08-4802458b43f7", "metadata": {}, "outputs": [], @@ -120,241 +344,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "1d013d32-c275-4f42-be21-9010f1537227", "metadata": {}, "outputs": [], "source": [ "import torch\n", - "import tiktoken\n", "\n", "torch.manual_seed(123)\n", "model = GPTModel(GPT_CONFIG_124M)" ] - }, - { - "cell_type": "markdown", - "id": "5fea8be3-30a1-4623-a6d7-b095c6c1092e", - "metadata": {}, - "source": [ - "# Exercise 4.2: Parameters in the feed forward versus attention module" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "2751b0e5-ffd3-4be2-8db3-e20dd4d61d69", - "metadata": {}, - "outputs": [], - "source": [ - "from gpt import TransformerBlock\n", - "\n", - "GPT_CONFIG_124M = {\n", - " \"vocab_size\": 50257,\n", - " \"ctx_len\": 1024,\n", - " \"emb_dim\": 768,\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 12,\n", - " \"drop_rate\": 0.1,\n", - " \"qkv_bias\": False\n", - "}\n", - "\n", - "model = TransformerBlock(GPT_CONFIG_124M)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "1bcaffd1-0cf6-4f8f-bd53-ab88a37f443e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total number of parameters in feed forward module: 4,722,432\n" - ] - } - ], - "source": [ - "total_params = sum(p.numel() for p in block.ff.parameters())\n", - "print(f\"Total number of parameters in feed forward module: {total_params:,}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "c1dd06c1-ab6c-4df7-ba73-f9cd54b31138", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total number of parameters in feed forward module: 2,360,064\n" - ] - } - ], - "source": [ - "total_params = sum(p.numel() for p in block.att.parameters())\n", - "print(f\"Total number of parameters in attention module: {total_params:,}\")" - ] - }, - { - "cell_type": "markdown", - "id": "15463dec-520a-47b4-b3ad-e180394fd076", - "metadata": {}, - "source": [ - "- The results above are for a single transformer block\n", - "- Optionally multiply by 12 to capture all transformer blocks in the 124M GPT model" - ] - }, - { - "cell_type": "markdown", - "id": "0f7b7c7f-0fa1-4d30-ab44-e499edd55b6d", - "metadata": {}, - "source": [ - "# Exercise 4.3: Initialize larger GPT models" - ] - }, - { - "cell_type": "markdown", - "id": "310b2e05-3ec8-47fc-afd9-83bf03d4aad8", - "metadata": {}, - "source": [ - "- **GPT2-small** (the 124M configuration we already implemented):\n", - " - \"emb_dim\" = 768\n", - " - \"n_layers\" = 12\n", - " - \"n_heads\" = 12\n", - "\n", - "- **GPT2-medium:**\n", - " - \"emb_dim\" = 1024\n", - " - \"n_layers\" = 24\n", - " - \"n_heads\" = 16\n", - "\n", - "- **GPT2-large:**\n", - " - \"emb_dim\" = 1280\n", - " - \"n_layers\" = 36\n", - " - \"n_heads\" = 20\n", - "\n", - "- **GPT2-XL:**\n", - " - \"emb_dim\" = 1600\n", - " - \"n_layers\" = 48\n", - " - \"n_heads\" = 25" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "90185dea-81ca-4cdc-aef7-4aaf95cba946", - "metadata": {}, - "outputs": [], - "source": [ - "GPT_CONFIG_124M = {\n", - " \"vocab_size\": 50257,\n", - " \"ctx_len\": 1024,\n", - " \"emb_dim\": 768,\n", - " \"n_heads\": 12,\n", - " \"n_layers\": 12,\n", - " \"drop_rate\": 0.1,\n", - " \"qkv_bias\": False\n", - "}\n", - "\n", - "\n", - "def get_config(base_config, model_name=\"gpt2-small\"):\n", - " GPT_CONFIG = base_config.copy()\n", - "\n", - " if model_name == \"gpt2-small\":\n", - " GPT_CONFIG[\"emb_dim\"] = 768\n", - " GPT_CONFIG[\"n_layers\"] = 12\n", - " GPT_CONFIG[\"n_heads\"] = 12\n", - "\n", - " elif model_name == \"gpt2-medium\":\n", - " GPT_CONFIG[\"emb_dim\"] = 1024\n", - " GPT_CONFIG[\"n_layers\"] = 24\n", - " GPT_CONFIG[\"n_heads\"] = 16\n", - "\n", - " elif model_name == \"gpt2-large\":\n", - " GPT_CONFIG[\"emb_dim\"] = 1280\n", - " GPT_CONFIG[\"n_layers\"] = 36\n", - " GPT_CONFIG[\"n_heads\"] = 20\n", - "\n", - " elif model_name == \"gpt2-xl\":\n", - " GPT_CONFIG[\"emb_dim\"] = 1600\n", - " GPT_CONFIG[\"n_layers\"] = 48\n", - " GPT_CONFIG[\"n_heads\"] = 25\n", - "\n", - " else:\n", - " raise ValueError(f\"Incorrect model name {model_name}\")\n", - "\n", - " return GPT_CONFIG\n", - "\n", - "\n", - "def calculate_size(model): # based on chapter code\n", - " \n", - " total_params = sum(p.numel() for p in model.parameters())\n", - " print(f\"Total number of parameters: {total_params:,}\")\n", - "\n", - " total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())\n", - " print(f\"Number of trainable parameters considering weight tying: {total_params_gpt2:,}\")\n", - " \n", - " # Calculate the total size in bytes (assuming float32, 4 bytes per parameter)\n", - " total_size_bytes = total_params * 4\n", - " \n", - " # Convert to megabytes\n", - " total_size_mb = total_size_bytes / (1024 * 1024)\n", - " \n", - " print(f\"Total size of the model: {total_size_mb:.2f} MB\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "2587e011-78a4-479c-a8fd-961cc40a5fd4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "gpt2-small:\n", - "Total number of parameters: 163,009,536\n", - "Number of trainable parameters considering weight tying: 124,412,160\n", - "Total size of the model: 621.83 MB\n", - "\n", - "\n", - "gpt2-medium:\n", - "Total number of parameters: 406,212,608\n", - "Number of trainable parameters considering weight tying: 354,749,440\n", - "Total size of the model: 1549.58 MB\n", - "\n", - "\n", - "gpt2-large:\n", - "Total number of parameters: 838,220,800\n", - "Number of trainable parameters considering weight tying: 773,891,840\n", - "Total size of the model: 3197.56 MB\n", - "\n", - "\n", - "gpt2-xl:\n", - "Total number of parameters: 1,637,792,000\n", - "Number of trainable parameters considering weight tying: 1,557,380,800\n", - "Total size of the model: 6247.68 MB\n" - ] - } - ], - "source": [ - "from gpt import GPTModel\n", - "\n", - "\n", - "for model_abbrev in (\"small\", \"medium\", \"large\", \"xl\"):\n", - " model_name = f\"gpt2-{model_abbrev}\"\n", - " CONFIG = get_config(GPT_CONFIG_124M, model_name=model_name)\n", - " model = GPTModel(CONFIG)\n", - " print(f\"\\n\\n{model_name}:\")\n", - " calculate_size(model)" - ] } ], "metadata": {