From 1cea30f9b177e0395ee6fa63d61af5239899dfe8 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 20 Dec 2025 18:41:34 -0600 Subject: [PATCH] upload saved nb --- .../memory-efficient-state-dict.ipynb | 2031 +++++++++-------- 1 file changed, 1030 insertions(+), 1001 deletions(-) diff --git a/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb b/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb index 6687dd9..a7036ca 100644 --- a/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb +++ b/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb @@ -1,1003 +1,1032 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "1E_HhLEeYqFG" - }, - "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", - "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", - "
\n", - "
\n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZuWudYFWYiH7" - }, - "source": [ - "# Memory-efficient Model Weight Loading" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qt0Qyg6ewUt6" - }, - "source": [ - "- This notebook provides tips for loading larger pretrained or finetuned models when GPU (or CPU) memory is limited\n", - "- Specifically, it focuses on cases where you saved the model using `torch.save(model.state_dict(), \"model.pth\")` (for example, in chapters 5-7) and want to load it in a new session later for continued pretraining or additional finetuning\n", - "- While the example uses an LLM, the methods explained in this notebook are general and apply to loading any PyTorch model, not just LLMs" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ji9LlnMlRISm" - }, - "source": [ - "" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SxQzFoS-IXdY", - "outputId": "9f8fd57a-91e7-489d-d86e-656df536c604" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "torch version: 2.9.0+cu126\n" - ] - } - ], - "source": [ - "from importlib.metadata import version\n", - "\n", - "pkgs = [\n", - " \"torch\",\n", - "]\n", - "for p in pkgs:\n", - " print(f\"{p} version: {version(p)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y47iQaQKyHap" - }, - "source": [ - " \n", - "## 1. Benchmark utilities" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nQeOEoo6yT0X" - }, - "source": [ - "- First, let's define some utility code to track VRAM (GPU memory)\n", - "- Later, we will also introduce a tool to track the main system RAM (CPU memory)\n", - "- The purpose of these functions will become clear when we apply them later" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "pEiqjYrVivgt" - }, - "outputs": [], - "source": [ - "import gc\n", - "import time\n", - "import torch\n", - "\n", - "\n", - "def start_memory_tracking():\n", - " \"\"\"Initialize GPU memory tracking.\"\"\"\n", - " if torch.cuda.is_available():\n", - " torch.cuda.reset_peak_memory_stats()\n", - " else:\n", - " print(\"This notebook is intended for CUDA GPUs but CUDA is not available.\")\n", - "\n", - "def print_memory_usage():\n", - " max_gpu_memory = torch.cuda.max_memory_allocated() / (1024 ** 3) # Convert bytes to GB\n", - " print(f\"Maximum GPU memory allocated: {max_gpu_memory:.1f} GB\")\n", - "\n", - "def cleanup():\n", - " gc.collect()\n", - " torch.cuda.empty_cache()\n", - " time.sleep(3) # some buffer time to allow memory to clear\n", - " torch.cuda.reset_peak_memory_stats()\n", - " max_memory_allocated = torch.cuda.max_memory_allocated(device) / (1024 ** 3)\n", - " print(f\"Maximum GPU memory allocated: {max_memory_allocated:.1f} GB\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z5oJwoc-kkXs" - }, - "source": [ - " \n", - "## 2. 
Model setup" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YfJE0vnMyr88" - }, - "source": [ - "- This code section sets up the model itself\n", - "- Here, we use the \"large\" GPT-2 model to make things more interesting (you may use the \"gpt2-small (124M)\" to lower the memory requirements and execution time of this notebook)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "tMuhCYaVI0w7" - }, - "outputs": [], - "source": [ - "from previous_chapters import GPTModel\n", - "# If the `previous_chapters.py` file is not available locally,\n", - "# you can import it from the `llms-from-scratch` PyPI package.\n", - "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", - "# E.g.,\n", - "# from llms_from_scratch.ch04 import GPTModel\n", - "\n", - "\n", - "\n", - "BASE_CONFIG = {\n", - " \"vocab_size\": 50257, # Vocabulary size\n", - " \"context_length\": 1024, # Context length\n", - " \"drop_rate\": 0.0, # Dropout rate\n", - " \"qkv_bias\": True # Query-key-value bias\n", - "}\n", - "\n", - "model_configs = {\n", - " \"gpt2-small (124M)\": {\"emb_dim\": 768, \"n_layers\": 12, \"n_heads\": 12},\n", - " \"gpt2-medium (355M)\": {\"emb_dim\": 1024, \"n_layers\": 24, \"n_heads\": 16},\n", - " \"gpt2-large (774M)\": {\"emb_dim\": 1280, \"n_layers\": 36, \"n_heads\": 20},\n", - " \"gpt2-xl (1558M)\": {\"emb_dim\": 1600, \"n_layers\": 48, \"n_heads\": 25},\n", - "}\n", - "\n", - "CHOOSE_MODEL = \"gpt2-xl (1558M)\"\n", - "\n", - "BASE_CONFIG.update(model_configs[CHOOSE_MODEL])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KWYoo1z5y8aX" - }, - "source": [ - "- Now, let's see the GPU memory functions in action:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GK3NEA3eJv3f", - "outputId": "434b51ca-7c8b-44dd-8a84-41ab48a290ff" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Maximum GPU memory allocated: 6.4 GB\n" - ] - } - ], - "source": [ - "start_memory_tracking()\n", - "\n", - "\n", - "model = GPTModel(BASE_CONFIG)\n", - "device = torch.device(\"cuda\")\n", - "model.to(device)\n", - "\n", - "print_memory_usage()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GIhwBEBxzBsF" - }, - "source": [ - "- Additionally, let's make sure that the model runs okay by passing in some example tensor" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "i_j6nZruUd7g" - }, - "outputs": [], - "source": [ - "# Test if the model works (no need to track memory here)\n", - "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", - "model.eval()\n", - "\n", - "with torch.no_grad():\n", - " model(test_input)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UgNb8c32zh4g" - }, - "source": [ - "- Next, imagine we were pretraining the model and saving it for later use\n", - "- We skip the actual pretraining here for simplicity and just save the initialized model (but the same concept applies)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "wUIXjcsimXU7" - }, - "outputs": [], - "source": [ - "# Training code would go here...\n", - "\n", - "model.train()\n", - "torch.save(model.state_dict(), \"model.pth\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s9tBS4HUzz1g" - }, - "source": [ - "- Lastly, we delete the model and example tensor in the Python session to reset the GPU memory" - ] 
- }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SqmTzztqKnTs", - "outputId": "218332da-8b66-4169-d876-8d72c68691fc" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Maximum GPU memory allocated: 0.0 GB\n" - ] - } - ], - "source": [ - "del model, test_input\n", - "cleanup()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7EnO8beUJ6Sb" - }, - "source": [ - " \n", - "## 3. Basic weight loading" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtAXKjsG0AVL" - }, - "source": [ - "- Now begins the interesting part where we load the pretrained model weights\n", - "- Let's see how much GPU memory is required to load the previously saved model" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wCrQNbSJJO9w", - "outputId": "2623b399-bce6-4506-ec0b-c3c94729b80f" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Maximum GPU memory allocated: 12.8 GB\n" - ] - } - ], - "source": [ - "# Then load pretrained weights\n", - "\n", - "start_memory_tracking()\n", - "\n", - "model = GPTModel(BASE_CONFIG)\n", - "model.to(device)\n", - "\n", - "model.load_state_dict(\n", - " torch.load(\"model.pth\", map_location=device, weights_only=True)\n", - ")\n", - "model.to(device)\n", - "model.eval();\n", - "\n", - "print_memory_usage()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4AGvOrcN0KdJ" - }, - "source": [ - "- Notice that the memory is 2x as large as in the previous session\n", - "- This is because we have the same model in memory twice, for a short period of time:\n", - " - The first time via `model.to(device)`\n", - " - The second time via the code line `model.load_state_dict(torch.load(\"model.pth\", map_location=device, weights_only=True))`; eventually, the loaded model weights will be copied into the model, and the `state_dict` will be discarded, but for a brief amount of time, we have both the main model and the loaded `state_dict` in memory\n", - "- The remaining sections focus on addressing this\n", - "- But first, let's test the model and reset the GPU memory\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DvlUn-nmmbuj", - "outputId": "7a9afbde-826f-4fb2-874d-feb6e8724834" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Maximum GPU memory allocated: 0.0 GB\n" - ] - } - ], - "source": [ - "# Test if the model works (no need to track memory here)\n", - "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", - "model.eval()\n", - "\n", - "with torch.no_grad():\n", - " model(test_input)\n", - "\n", - "del model, test_input\n", - "cleanup()" - ] - }, - { - "cell_type": "markdown", - "source": [ - "- Let's test another common pattern that is very popular in practice:" - ], - "metadata": { - "id": "IQ531-IuRuzD" - } - }, - { - "cell_type": "code", - "source": [ - "start_memory_tracking()\n", - "\n", - "model = GPTModel(BASE_CONFIG)\n", - "\n", - "model.load_state_dict(\n", - " torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True)\n", - ")\n", - "model.to(device)\n", - "model.eval();\n", - "\n", - "print_memory_usage()" - ], - "metadata": { - "id": "2m54kzX5RxLX" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# 
Test if the model works (no need to track memory here)\n", - "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", - "model.eval()\n", - "\n", - "with torch.no_grad():\n", - " model(test_input)\n", - "\n", - "del model, test_input, state_dict, param\n", - "cleanup()" - ], - "metadata": { - "id": "XWvQTRN4R2CM" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "- So, as peak memory is concerned, it doesn't make a difference whether we instantiate the model on the device first and then use `map_location=\"device\"` or load the weights into CPU memory first (`map_location=\"cpu\"`) and then move it to the device" - ], - "metadata": { - "id": "UGjBD6GASS_y" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RdPnW3iLLrjX" - }, - "source": [ - " \n", - "## 4. Loading weights sequentially" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FYqtUON602TD" - }, - "source": [ - "- One workaround for the problem of having the model weights in GPU memory twice, as highlighted in the previous section, is to load the model sequentially\n", - "- Below, we:\n", - " - first load the model into GPU memory\n", - " - then load the model weights into CPU memory\n", - " - and finally copy each parameter one by one into GPU memory\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DOIGTNWTmx9G", - "outputId": "145162e6-aaa6-4c2a-ed8f-f1cf068adb80" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum GPU memory allocated: 6.4 GB\n", - "Maximum GPU memory allocated: 6.7 GB\n" - ] - } - ], - "source": [ - "start_memory_tracking()\n", - "\n", - "model = GPTModel(BASE_CONFIG).to(device)\n", - "\n", - "state_dict = torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True)\n", - "\n", - "print_memory_usage()\n", - "\n", - "# Sequentially copy weights to the model's parameters\n", - "with torch.no_grad():\n", - " for name, param in model.named_parameters():\n", - " if name in state_dict:\n", - " param.copy_(state_dict[name].to(device))\n", - " else:\n", - " print(f\"Warning: {name} not found in state_dict.\")\n", - "\n", - "print_memory_usage()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Pn9xD_xL1ZzM" - }, - "source": [ - "- As we can see above, the memory usage is much lower than before\n", - "- Notice that the memory increases from 6.4 to 6.7 GB because initially, we only have the model in memory, and then we have the model plus 1 parameter tensor in memory (we temporarily move the parameter tensor to the GPU so we can assign it using `\".to\"` the model)\n", - "- Overall, this is a significant improvement\n", - "- Again, let's briefly test the model and then reset the GPU memory for the next section" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PRHnjA48nJgw", - "outputId": "dcd6b1b2-538f-4862-96a6-a5fcbf3326a4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum GPU memory allocated: 0.0 GB\n" - ] - } - ], - "source": [ - "# Test if the model works (no need to track memory here)\n", - "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", - "model.eval()\n", - "\n", - "with torch.no_grad():\n", - " model(test_input)\n", - "\n", - "del model, test_input, state_dict, param\n", - "cleanup()" - ] - }, - { - "cell_type": "markdown", - 
"metadata": { - "id": "5M92LK7usb-Z" - }, - "source": [ - " \n", - "## 5. Loading the model with low CPU memory" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R45qgeB613e2" - }, - "source": [ - "- In the previous session, we reduced GPU memory use by loading the weights (`state_dict`) into CPU memory first before copying them one-by-one into the model\n", - "- However, what do we do if we have limited CPU memory?\n", - "- This section uses PyTorch's so-called `\"meta\"` device approach to load a model on machines with large GPU memory but small CPU memory\n", - "- But first, let's define a convenience function to monitor CPU memory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BrcWy0q-3Bbe" - }, - "outputs": [], - "source": [ - "import os\n", - "import psutil\n", - "from threading import Thread\n", - "\n", - "\n", - "def memory_usage_in_gb(func, *args, **kwargs):\n", - " process = psutil.Process(os.getpid())\n", - "\n", - " # Measure the baseline memory usage before running the function\n", - " baseline_mem = process.memory_info().rss / 1024 ** 3 # in GB\n", - "\n", - " # Start monitoring memory in a separate thread\n", - " mem_usage = []\n", - " done = False\n", - "\n", - " def monitor_memory():\n", - " while not done:\n", - " mem_usage.append(process.memory_info().rss / 1024 ** 3) # Convert to GB\n", - " time.sleep(0.1)\n", - "\n", - " t = Thread(target=monitor_memory)\n", - " t.start()\n", - "\n", - " # Run the function\n", - " func(*args, **kwargs)\n", - "\n", - " # Stop monitoring\n", - " done = True\n", - " t.join()\n", - "\n", - " peak_mem_usage_gb = max(mem_usage) - baseline_mem\n", - " return peak_mem_usage_gb\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ayy30Ytd5hjF" - }, - "source": [ - "- To start with, let's track the CPU memory of the sequential weight loading approach from the previous section" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rCkV6IbQtpVn", - "outputId": "26c0435a-1e3d-4e8f-fbe2-f9655bad61b4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum GPU memory allocated: 6.4 GB\n", - "Maximum GPU memory allocated: 6.7 GB\n", - "-> Maximum CPU memory allocated: 6.3 GB\n" - ] - } - ], - "source": [ - "def load_sequentially():\n", - " start_memory_tracking()\n", - "\n", - " model = GPTModel(BASE_CONFIG).to(device)\n", - "\n", - " state_dict = torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True)\n", - "\n", - " print_memory_usage()\n", - "\n", - " # Sequentially copy weights to the model's parameters\n", - " with torch.no_grad():\n", - " for name, param in model.named_parameters():\n", - " if name in state_dict:\n", - " param.copy_(state_dict[name].to(device))\n", - " else:\n", - " print(f\"Warning: {name} not found in state_dict.\")\n", - "\n", - " print_memory_usage()\n", - "\n", - "\n", - "peak_memory_used = memory_usage_in_gb(load_sequentially)\n", - "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UWrmnCML5oKy" - }, - "source": [ - "- Now, suppose we have a machine with low CPU memory but large GPU memory\n", - "- We can trade off CPU memory and GPU memory usage by introducing PyTorch's so-called \"meta\" device\n", - "- PyTorch's meta device is a special device type that allows you to create tensors without allocating actual memory for their 
data, effectively creating \"meta\" tensors\n", - "- This is useful for tasks like model analysis or architecture definition, where you need tensor shapes and types without the overhead of memory allocation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PBErC_5Yt8ly", - "outputId": "8799db06-191c-47c4-92fa-fbb95d685aa9" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum GPU memory allocated: 12.8 GB\n", - "Maximum GPU memory allocated: 12.8 GB\n", - "-> Maximum CPU memory allocated: 1.3 GB\n" - ] - } - ], - "source": [ - "def load_sequentially_with_meta():\n", - " start_memory_tracking()\n", - "\n", - " with torch.device(\"meta\"):\n", - " model = GPTModel(BASE_CONFIG)\n", - "\n", - " model = model.to_empty(device=device)\n", - "\n", - " state_dict = torch.load(\"model.pth\", map_location=device, weights_only=True)\n", - "\n", - " print_memory_usage()\n", - "\n", - " # Sequentially copy weights to the model's parameters\n", - " with torch.no_grad():\n", - " for name, param in model.named_parameters():\n", - " if name in state_dict:\n", - " param.copy_(state_dict[name])\n", - " else:\n", - " print(f\"Warning: {name} not found in state_dict.\")\n", - "\n", - " print_memory_usage()\n", - "\n", - "peak_memory_used = memory_usage_in_gb(load_sequentially_with_meta)\n", - "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VpnCABp75-VQ" - }, - "source": [ - "- As we can see above, by creating the model on the meta-device and loading the weights directly into GPU memory, we effectively reduced the CPU memory requirements\n", - "- One might ask: \"Is the sequential weight loading still necessary then, and how does that compare to the original approach?\"\n", - "- Let's check the simple PyTorch weight loading approach for comparison (from the first weight loading section in this notebook):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4f-bqBNRuR39", - "outputId": "f7c0a901-b404-433a-9b93-2bbfa8183c56" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum GPU memory allocated: 12.8 GB\n", - "-> Maximum CPU memory allocated: 4.4 GB\n" - ] - } - ], - "source": [ - "def baseline():\n", - " start_memory_tracking()\n", - "\n", - " model = GPTModel(BASE_CONFIG)\n", - " model.to(device)\n", - "\n", - " model.load_state_dict(torch.load(\"model.pth\", map_location=device, weights_only=True))\n", - " model.to(device)\n", - " model.eval();\n", - "\n", - " print_memory_usage()\n", - "\n", - "peak_memory_used = memory_usage_in_gb(baseline)\n", - "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NKAjxbX86xnb" - }, - "source": [ - "- As we can see above, the \"simple\" weight loading without the meta device uses more memory\n", - "- In other words, if you have a machine with limited CPU memory, you can use the meta device approach to directly load the model weights into GPU memory to reduce peak CPU memory usage" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jvDVFpcaRISr" - }, - "source": [ - " \n", - "## 6. 
Using `mmap=True` (recommmended)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w3H5gPygRISr" - }, - "source": [ - "- As an intermediate or advanced `torch.load` user, you may wonder how these approaches compare to the `mmap=True` setting in PyTorch\n", - "- The `mmap=True` setting in PyTorch enables memory-mapped file I/O, which allows the tensor to access data directly from disk storage, thus reducing memory usage by not loading the entire file into RAM if RAM is limited\n", - "- Also, see the helpful comment by [mikaylagawarecki](https://github.com/rasbt/LLMs-from-scratch/issues/402)\n", - "- At first glance, it may look less efficient than the sequential approaches above:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GKwV0AMNemuR", - "outputId": "e207f2bf-5c87-498e-80fe-e8c4016ac711" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum GPU memory allocated: 6.4 GB\n", - "-> Maximum CPU memory allocated: 5.9 GB\n" - ] - } - ], - "source": [ - "def best_practices():\n", - " with torch.device(\"meta\"):\n", - " model = GPTModel(BASE_CONFIG)\n", - "\n", - " model.load_state_dict(\n", - " torch.load(\"model.pth\", map_location=device, weights_only=True, mmap=True),\n", - " assign=True\n", - " )\n", - "\n", - " print_memory_usage()\n", - "\n", - "peak_memory_used = memory_usage_in_gb(best_practices)\n", - "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pGC0rBv4RISr" - }, - "source": [ - "- The reason why the CPU RAM usage is so high is that there's enough CPU RAM available on this machine\n", - "- However, if you were to run this on a machine with limited CPU RAM, the `mmap` approach would use less memory" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fd11QM8pRISr" - }, - "source": [ - " \n", - "## 7. 
Other methods" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0U2Y6eo8RISr" - }, - "source": [ - "- This notebook is focused on simple, built-in methods for loading weights in PyTorch\n", - "- The recommended approach for limited CPU memory cases is the `mmap=True` approach explained enough\n", - "- Alternatively, one other option is a brute-force approach that saves and loads each weight tensor separately:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2CgPEZUIb00w" - }, - "outputs": [], - "source": [ - "model = GPTModel(BASE_CONFIG)\n", - "# Assume `model` is your trained model\n", - "state_dict = model.state_dict()\n", - "\n", - "# Create a directory to store individual parameter files\n", - "os.makedirs(\"model_parameters\", exist_ok=True)\n", - "\n", - "# Save each parameter tensor separately\n", - "for name, param in state_dict.items():\n", - " torch.save(param.cpu(), f\"model_parameters/{name}.pt\")\n", - "\n", - "del model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gTsmtJK-b4yy", - "outputId": "d361e2d3-e34c-48d7-9047-846c9bfd291e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maximum GPU memory allocated: 6.4 GB\n", - "Maximum GPU memory allocated: 6.4 GB\n", - "-> Maximum CPU memory allocated: 0.3 GB\n" - ] - } - ], - "source": [ - "def load_individual_weights():\n", - "\n", - " start_memory_tracking()\n", - "\n", - " with torch.device(\"meta\"):\n", - " model = GPTModel(BASE_CONFIG)\n", - "\n", - " model = model.to_empty(device=device)\n", - "\n", - " print_memory_usage()\n", - " param_dir = \"model_parameters\"\n", - "\n", - " with torch.no_grad():\n", - " for name, param in model.named_parameters():\n", - " weight_path = os.path.join(param_dir, f\"{name}.pt\")\n", - " if os.path.exists(weight_path):\n", - " param_data = torch.load(weight_path, map_location=\"cpu\", weights_only=True)\n", - " param.copy_(param_data)\n", - " del param_data # Free memory\n", - " else:\n", - " print(f\"Warning: {name} not found in {param_dir}.\")\n", - "\n", - " print_memory_usage()\n", - "\n", - "\n", - "peak_memory_used = memory_usage_in_gb(load_individual_weights)\n", - "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "1E_HhLEeYqFG" + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", + "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", + "
\n", + "
\n", + "\n", + "
" + ] }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "id": "ZuWudYFWYiH7" + }, + "source": [ + "# Memory-efficient Model Weight Loading" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qt0Qyg6ewUt6" + }, + "source": [ + "- This notebook provides tips for loading larger pretrained or finetuned models when GPU (or CPU) memory is limited\n", + "- Specifically, it focuses on cases where you saved the model using `torch.save(model.state_dict(), \"model.pth\")` (for example, in chapters 5-7) and want to load it in a new session later for continued pretraining or additional finetuning\n", + "- While the example uses an LLM, the methods explained in this notebook are general and apply to loading any PyTorch model, not just LLMs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ji9LlnMlRISm" + }, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SxQzFoS-IXdY", + "outputId": "9f8fd57a-91e7-489d-d86e-656df536c604" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch version: 2.9.1+cu130\n" + ] + } + ], + "source": [ + "from importlib.metadata import version\n", + "\n", + "pkgs = [\n", + " \"torch\",\n", + "]\n", + "for p in pkgs:\n", + " print(f\"{p} version: {version(p)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y47iQaQKyHap" + }, + "source": [ + " \n", + "## 1. Benchmark utilities" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nQeOEoo6yT0X" + }, + "source": [ + "- First, let's define some utility code to track VRAM (GPU memory)\n", + "- Later, we will also introduce a tool to track the main system RAM (CPU memory)\n", + "- The purpose of these functions will become clear when we apply them later" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "pEiqjYrVivgt" + }, + "outputs": [], + "source": [ + "import gc\n", + "import time\n", + "import torch\n", + "\n", + "\n", + "def start_memory_tracking():\n", + " \"\"\"Initialize GPU memory tracking.\"\"\"\n", + " if torch.cuda.is_available():\n", + " torch.cuda.reset_peak_memory_stats()\n", + " else:\n", + " print(\"This notebook is intended for CUDA GPUs but CUDA is not available.\")\n", + "\n", + "def print_memory_usage():\n", + " max_gpu_memory = torch.cuda.max_memory_allocated() / (1024 ** 3) # Convert bytes to GB\n", + " print(f\"Maximum GPU memory allocated: {max_gpu_memory:.1f} GB\")\n", + "\n", + "def cleanup():\n", + " gc.collect()\n", + " torch.cuda.empty_cache()\n", + " time.sleep(3) # some buffer time to allow memory to clear\n", + " torch.cuda.reset_peak_memory_stats()\n", + " max_memory_allocated = torch.cuda.max_memory_allocated(device) / (1024 ** 3)\n", + " print(f\"Maximum GPU memory allocated: {max_memory_allocated:.1f} GB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z5oJwoc-kkXs" + }, + "source": [ + " \n", + "## 2. 
Model setup" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YfJE0vnMyr88" + }, + "source": [ + "- This code section sets up the model itself\n", + "- Here, we use the \"large\" GPT-2 model to make things more interesting (you may use the \"gpt2-small (124M)\" to lower the memory requirements and execution time of this notebook)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "tMuhCYaVI0w7" + }, + "outputs": [], + "source": [ + "from previous_chapters import GPTModel\n", + "# If the `previous_chapters.py` file is not available locally,\n", + "# you can import it from the `llms-from-scratch` PyPI package.\n", + "# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg\n", + "# E.g.,\n", + "# from llms_from_scratch.ch04 import GPTModel\n", + "\n", + "\n", + "\n", + "BASE_CONFIG = {\n", + " \"vocab_size\": 50257, # Vocabulary size\n", + " \"context_length\": 1024, # Context length\n", + " \"drop_rate\": 0.0, # Dropout rate\n", + " \"qkv_bias\": True # Query-key-value bias\n", + "}\n", + "\n", + "model_configs = {\n", + " \"gpt2-small (124M)\": {\"emb_dim\": 768, \"n_layers\": 12, \"n_heads\": 12},\n", + " \"gpt2-medium (355M)\": {\"emb_dim\": 1024, \"n_layers\": 24, \"n_heads\": 16},\n", + " \"gpt2-large (774M)\": {\"emb_dim\": 1280, \"n_layers\": 36, \"n_heads\": 20},\n", + " \"gpt2-xl (1558M)\": {\"emb_dim\": 1600, \"n_layers\": 48, \"n_heads\": 25},\n", + "}\n", + "\n", + "CHOOSE_MODEL = \"gpt2-xl (1558M)\"\n", + "\n", + "BASE_CONFIG.update(model_configs[CHOOSE_MODEL])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KWYoo1z5y8aX" + }, + "source": [ + "- Now, let's see the GPU memory functions in action:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GK3NEA3eJv3f", + "outputId": "434b51ca-7c8b-44dd-8a84-41ab48a290ff" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/rasbt/jupyterlab/reasoning/.venv/lib/python3.12/site-packages/torch/cuda/__init__.py:283: UserWarning: \n", + " Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.\n", + " Minimum and Maximum cuda capability supported by this version of PyTorch is\n", + " (8.0) - (12.0)\n", + " \n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 6.4 GB\n" + ] + } + ], + "source": [ + "start_memory_tracking()\n", + "\n", + "\n", + "model = GPTModel(BASE_CONFIG)\n", + "device = torch.device(\"cuda\")\n", + "model.to(device)\n", + "\n", + "print_memory_usage()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GIhwBEBxzBsF" + }, + "source": [ + "- Additionally, let's make sure that the model runs okay by passing in some example tensor" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "i_j6nZruUd7g" + }, + "outputs": [], + "source": [ + "# Test if the model works (no need to track memory here)\n", + "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", + "model.eval()\n", + "\n", + "with torch.no_grad():\n", + " model(test_input)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UgNb8c32zh4g" + }, + "source": [ + "- Next, imagine we were pretraining the model and saving it for later use\n", + "- We skip the actual pretraining here for simplicity and just save the initialized model (but the same concept applies)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + 
"metadata": { + "id": "wUIXjcsimXU7" + }, + "outputs": [], + "source": [ + "# Training code would go here...\n", + "\n", + "model.train()\n", + "torch.save(model.state_dict(), \"model.pth\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s9tBS4HUzz1g" + }, + "source": [ + "- Lastly, we delete the model and example tensor in the Python session to reset the GPU memory" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SqmTzztqKnTs", + "outputId": "218332da-8b66-4169-d876-8d72c68691fc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 0.0 GB\n" + ] + } + ], + "source": [ + "del model, test_input\n", + "cleanup()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7EnO8beUJ6Sb" + }, + "source": [ + " \n", + "## 3. Basic weight loading" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtAXKjsG0AVL" + }, + "source": [ + "- Now begins the interesting part where we load the pretrained model weights\n", + "- Let's see how much GPU memory is required to load the previously saved model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wCrQNbSJJO9w", + "outputId": "2623b399-bce6-4506-ec0b-c3c94729b80f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 12.8 GB\n" + ] + } + ], + "source": [ + "# Then load pretrained weights\n", + "\n", + "start_memory_tracking()\n", + "\n", + "model = GPTModel(BASE_CONFIG)\n", + "model.to(device)\n", + "\n", + "model.load_state_dict(\n", + " torch.load(\"model.pth\", map_location=device, weights_only=True)\n", + ")\n", + "model.to(device)\n", + "model.eval();\n", + "\n", + "print_memory_usage()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4AGvOrcN0KdJ" + }, + "source": [ + "- Notice that the memory is 2x as large as in the previous session\n", + "- This is because we have the same model in memory twice, for a short period of time:\n", + " - The first time via `model.to(device)`\n", + " - The second time via the code line `model.load_state_dict(torch.load(\"model.pth\", map_location=device, weights_only=True))`; eventually, the loaded model weights will be copied into the model, and the `state_dict` will be discarded, but for a brief amount of time, we have both the main model and the loaded `state_dict` in memory\n", + "- The remaining sections focus on addressing this\n", + "- But first, let's test the model and reset the GPU memory\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DvlUn-nmmbuj", + "outputId": "7a9afbde-826f-4fb2-874d-feb6e8724834" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 0.0 GB\n" + ] + } + ], + "source": [ + "# Test if the model works (no need to track memory here)\n", + "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", + "model.eval()\n", + "\n", + "with torch.no_grad():\n", + " model(test_input)\n", + "\n", + "del model, test_input\n", + "cleanup()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IQ531-IuRuzD" + }, + "source": [ + "- Let's test another common pattern that is very popular in practice:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { 
+ "id": "2m54kzX5RxLX" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 6.4 GB\n" + ] + } + ], + "source": [ + "start_memory_tracking()\n", + "\n", + "model = GPTModel(BASE_CONFIG)\n", + "\n", + "model.load_state_dict(\n", + " torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True)\n", + ")\n", + "model.to(device)\n", + "model.eval();\n", + "\n", + "print_memory_usage()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "XWvQTRN4R2CM" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 0.0 GB\n" + ] + } + ], + "source": [ + "# Test if the model works (no need to track memory here)\n", + "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", + "model.eval()\n", + "\n", + "with torch.no_grad():\n", + " model(test_input)\n", + "\n", + "del model, test_input\n", + "cleanup()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UGjBD6GASS_y" + }, + "source": [ + "- So, as peak memory is concerned, it doesn't make a difference whether we instantiate the model on the device first and then use `map_location=\"device\"` or load the weights into CPU memory first (`map_location=\"cpu\"`) and then move it to the device" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RdPnW3iLLrjX" + }, + "source": [ + " \n", + "## 4. Loading weights sequentially" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FYqtUON602TD" + }, + "source": [ + "- One workaround for the problem of having the model weights in GPU memory twice, as highlighted in the previous section, is to load the model sequentially\n", + "- Below, we:\n", + " - first load the model into GPU memory\n", + " - then load the model weights into CPU memory\n", + " - and finally copy each parameter one by one into GPU memory\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DOIGTNWTmx9G", + "outputId": "145162e6-aaa6-4c2a-ed8f-f1cf068adb80" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 6.4 GB\n", + "Maximum GPU memory allocated: 6.7 GB\n" + ] + } + ], + "source": [ + "start_memory_tracking()\n", + "\n", + "model = GPTModel(BASE_CONFIG).to(device)\n", + "\n", + "state_dict = torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True)\n", + "\n", + "print_memory_usage()\n", + "\n", + "# Sequentially copy weights to the model's parameters\n", + "with torch.no_grad():\n", + " for name, param in model.named_parameters():\n", + " if name in state_dict:\n", + " param.copy_(state_dict[name].to(device))\n", + " else:\n", + " print(f\"Warning: {name} not found in state_dict.\")\n", + "\n", + "print_memory_usage()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pn9xD_xL1ZzM" + }, + "source": [ + "- As we can see above, the memory usage is much lower than before\n", + "- Notice that the memory increases from 6.4 to 6.7 GB because initially, we only have the model in memory, and then we have the model plus 1 parameter tensor in memory (we temporarily move the parameter tensor to the GPU so we can assign it using `\".to\"` the model)\n", + "- Overall, this is a significant improvement\n", + "- Again, let's briefly test the model and then reset the GPU memory for the next section" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PRHnjA48nJgw", + "outputId": "dcd6b1b2-538f-4862-96a6-a5fcbf3326a4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 0.0 GB\n" + ] + } + ], + "source": [ + "# Test if the model works (no need to track memory here)\n", + "test_input = torch.tensor([[1, 2, 3]]).to(device)\n", + "model.eval()\n", + "\n", + "with torch.no_grad():\n", + " model(test_input)\n", + "\n", + "del model, test_input, state_dict, param\n", + "cleanup()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5M92LK7usb-Z" + }, + "source": [ + " \n", + "## 5. Loading the model with low CPU memory" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R45qgeB613e2" + }, + "source": [ + "- In the previous session, we reduced GPU memory use by loading the weights (`state_dict`) into CPU memory first before copying them one-by-one into the model\n", + "- However, what do we do if we have limited CPU memory?\n", + "- This section uses PyTorch's so-called `\"meta\"` device approach to load a model on machines with large GPU memory but small CPU memory\n", + "- But first, let's define a convenience function to monitor CPU memory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BrcWy0q-3Bbe" + }, + "outputs": [], + "source": [ + "import os\n", + "import psutil\n", + "from threading import Thread\n", + "\n", + "\n", + "def memory_usage_in_gb(func, *args, **kwargs):\n", + " process = psutil.Process(os.getpid())\n", + "\n", + " # Measure the baseline memory usage before running the function\n", + " baseline_mem = process.memory_info().rss / 1024 ** 3 # in GB\n", + "\n", + " # Start monitoring memory in a separate thread\n", + " mem_usage = []\n", + " done = False\n", + "\n", + " def monitor_memory():\n", + " while not done:\n", + " mem_usage.append(process.memory_info().rss / 1024 ** 3) # Convert to GB\n", + " time.sleep(0.1)\n", + "\n", + " t = Thread(target=monitor_memory)\n", + " t.start()\n", + "\n", + " # Run the function\n", + " func(*args, **kwargs)\n", + "\n", + " # Stop monitoring\n", + " done = True\n", + " t.join()\n", + "\n", + " peak_mem_usage_gb = max(mem_usage) - baseline_mem\n", + " return peak_mem_usage_gb\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ayy30Ytd5hjF" + }, + "source": [ + "- To start with, let's track the CPU memory of the sequential weight loading approach from the previous section" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rCkV6IbQtpVn", + "outputId": "26c0435a-1e3d-4e8f-fbe2-f9655bad61b4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 6.4 GB\n", + "Maximum GPU memory allocated: 6.7 GB\n", + "-> Maximum CPU memory allocated: 6.3 GB\n" + ] + } + ], + "source": [ + "def load_sequentially():\n", + " start_memory_tracking()\n", + "\n", + " model = GPTModel(BASE_CONFIG).to(device)\n", + "\n", + " state_dict = torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True)\n", + "\n", + " print_memory_usage()\n", + "\n", + " # Sequentially copy weights to the model's parameters\n", + " with torch.no_grad():\n", + " for name, param in model.named_parameters():\n", + " if name in state_dict:\n", + " param.copy_(state_dict[name].to(device))\n", + " else:\n", + " print(f\"Warning: {name} not found in state_dict.\")\n", + 
"\n", + " print_memory_usage()\n", + "\n", + "\n", + "peak_memory_used = memory_usage_in_gb(load_sequentially)\n", + "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UWrmnCML5oKy" + }, + "source": [ + "- Now, suppose we have a machine with low CPU memory but large GPU memory\n", + "- We can trade off CPU memory and GPU memory usage by introducing PyTorch's so-called \"meta\" device\n", + "- PyTorch's meta device is a special device type that allows you to create tensors without allocating actual memory for their data, effectively creating \"meta\" tensors\n", + "- This is useful for tasks like model analysis or architecture definition, where you need tensor shapes and types without the overhead of memory allocation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PBErC_5Yt8ly", + "outputId": "8799db06-191c-47c4-92fa-fbb95d685aa9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 12.8 GB\n", + "Maximum GPU memory allocated: 12.8 GB\n", + "-> Maximum CPU memory allocated: 1.3 GB\n" + ] + } + ], + "source": [ + "def load_sequentially_with_meta():\n", + " start_memory_tracking()\n", + "\n", + " with torch.device(\"meta\"):\n", + " model = GPTModel(BASE_CONFIG)\n", + "\n", + " model = model.to_empty(device=device)\n", + "\n", + " state_dict = torch.load(\"model.pth\", map_location=device, weights_only=True)\n", + "\n", + " print_memory_usage()\n", + "\n", + " # Sequentially copy weights to the model's parameters\n", + " with torch.no_grad():\n", + " for name, param in model.named_parameters():\n", + " if name in state_dict:\n", + " param.copy_(state_dict[name])\n", + " else:\n", + " print(f\"Warning: {name} not found in state_dict.\")\n", + "\n", + " print_memory_usage()\n", + "\n", + "peak_memory_used = memory_usage_in_gb(load_sequentially_with_meta)\n", + "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VpnCABp75-VQ" + }, + "source": [ + "- As we can see above, by creating the model on the meta-device and loading the weights directly into GPU memory, we effectively reduced the CPU memory requirements\n", + "- One might ask: \"Is the sequential weight loading still necessary then, and how does that compare to the original approach?\"\n", + "- Let's check the simple PyTorch weight loading approach for comparison (from the first weight loading section in this notebook):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4f-bqBNRuR39", + "outputId": "f7c0a901-b404-433a-9b93-2bbfa8183c56" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 12.8 GB\n", + "-> Maximum CPU memory allocated: 4.4 GB\n" + ] + } + ], + "source": [ + "def baseline():\n", + " start_memory_tracking()\n", + "\n", + " model = GPTModel(BASE_CONFIG)\n", + " model.to(device)\n", + "\n", + " model.load_state_dict(torch.load(\"model.pth\", map_location=device, weights_only=True))\n", + " model.to(device)\n", + " model.eval();\n", + "\n", + " print_memory_usage()\n", + "\n", + "peak_memory_used = memory_usage_in_gb(baseline)\n", + "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "NKAjxbX86xnb" + }, + "source": [ + "- As we can see above, the \"simple\" weight loading without the meta device uses more memory\n", + "- In other words, if you have a machine with limited CPU memory, you can use the meta device approach to directly load the model weights into GPU memory to reduce peak CPU memory usage" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jvDVFpcaRISr" + }, + "source": [ + " \n", + "## 6. Using `mmap=True` (recommmended)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w3H5gPygRISr" + }, + "source": [ + "- As an intermediate or advanced `torch.load` user, you may wonder how these approaches compare to the `mmap=True` setting in PyTorch\n", + "- The `mmap=True` setting in PyTorch enables memory-mapped file I/O, which allows the tensor to access data directly from disk storage, thus reducing memory usage by not loading the entire file into RAM if RAM is limited\n", + "- Also, see the helpful comment by [mikaylagawarecki](https://github.com/rasbt/LLMs-from-scratch/issues/402)\n", + "- At first glance, it may look less efficient than the sequential approaches above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GKwV0AMNemuR", + "outputId": "e207f2bf-5c87-498e-80fe-e8c4016ac711" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum GPU memory allocated: 6.4 GB\n", + "-> Maximum CPU memory allocated: 5.9 GB\n" + ] + } + ], + "source": [ + "def best_practices():\n", + " with torch.device(\"meta\"):\n", + " model = GPTModel(BASE_CONFIG)\n", + "\n", + " model.load_state_dict(\n", + " torch.load(\"model.pth\", map_location=device, weights_only=True, mmap=True),\n", + " assign=True\n", + " )\n", + "\n", + " print_memory_usage()\n", + "\n", + "peak_memory_used = memory_usage_in_gb(best_practices)\n", + "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pGC0rBv4RISr" + }, + "source": [ + "- The reason why the CPU RAM usage is so high is that there's enough CPU RAM available on this machine\n", + "- However, if you were to run this on a machine with limited CPU RAM, the `mmap` approach would use less memory" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fd11QM8pRISr" + }, + "source": [ + " \n", + "## 7. 
Other methods"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0U2Y6eo8RISr"
+   },
+   "source": [
+    "- This notebook is focused on simple, built-in methods for loading weights in PyTorch\n",
+    "- The recommended approach for limited CPU memory cases is the `mmap=True` approach explained above\n",
+    "- Alternatively, one other option is a brute-force approach that saves and loads each weight tensor separately:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2CgPEZUIb00w"
+   },
+   "outputs": [],
+   "source": [
+    "model = GPTModel(BASE_CONFIG)\n",
+    "# Assume `model` is your trained model\n",
+    "state_dict = model.state_dict()\n",
+    "\n",
+    "# Create a directory to store individual parameter files\n",
+    "os.makedirs(\"model_parameters\", exist_ok=True)\n",
+    "\n",
+    "# Save each parameter tensor separately\n",
+    "for name, param in state_dict.items():\n",
+    "    torch.save(param.cpu(), f\"model_parameters/{name}.pt\")\n",
+    "\n",
+    "del model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "gTsmtJK-b4yy",
+    "outputId": "d361e2d3-e34c-48d7-9047-846c9bfd291e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Maximum GPU memory allocated: 6.4 GB\n",
+      "Maximum GPU memory allocated: 6.4 GB\n",
+      "-> Maximum CPU memory allocated: 0.3 GB\n"
+     ]
+    }
+   ],
+   "source": [
+    "def load_individual_weights():\n",
+    "\n",
+    "    start_memory_tracking()\n",
+    "\n",
+    "    with torch.device(\"meta\"):\n",
+    "        model = GPTModel(BASE_CONFIG)\n",
+    "\n",
+    "    model = model.to_empty(device=device)\n",
+    "\n",
+    "    print_memory_usage()\n",
+    "    param_dir = \"model_parameters\"\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        for name, param in model.named_parameters():\n",
+    "            weight_path = os.path.join(param_dir, f\"{name}.pt\")\n",
+    "            if os.path.exists(weight_path):\n",
+    "                param_data = torch.load(weight_path, map_location=\"cpu\", weights_only=True)\n",
+    "                param.copy_(param_data)\n",
+    "                del param_data # Free memory\n",
+    "            else:\n",
+    "                print(f\"Warning: {name} not found in {param_dir}.\")\n",
+    "\n",
+    "    print_memory_usage()\n",
+    "\n",
+    "\n",
+    "peak_memory_used = memory_usage_in_gb(load_individual_weights)\n",
+    "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
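
For quick reference, the loading pattern that section 6 of the notebook recommends condenses to just a few lines. The following is a minimal sketch (not part of the committed notebook itself), assuming `GPTModel`, `BASE_CONFIG`, `device`, and the saved `"model.pth"` checkpoint are set up exactly as in the notebook above:

    import torch

    # Build the architecture on the meta device: only shapes/dtypes, no real allocations yet
    with torch.device("meta"):
        model = GPTModel(BASE_CONFIG)

    # mmap=True memory-maps the checkpoint instead of reading it all into CPU RAM up front;
    # assign=True places the loaded tensors directly into the model instead of copying them
    # into pre-allocated parameters, so the weights are never held in memory twice
    model.load_state_dict(
        torch.load("model.pth", map_location=device, weights_only=True, mmap=True),
        assign=True,
    )
    model.eval()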