mirror of https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
add mmap=True comparison
@@ -752,7 +752,99 @@
    "metadata": {},
    "source": [
     " \n",
-    "## 6. Other methods"
+    "## 6. Using `mmap=True`"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- As an intermediate or advanced `torch.load` user, you may wonder how these approaches compare to the `mmap=True` setting in PyTorch\n",
+    "- The `mmap=True` setting in PyTorch enables memory-mapped file I/O, which allows the tensor to access data directly from disk storage, thus reducing memory usage by not loading the entire file into RAM\n",
+    "- However, in practice, I found it to be less efficient than the sequential approaches above"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "7AX3vPrpv5c_",
+    "outputId": "e6fca10b-55c3-4e89-8674-075df5ce26e7"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Maximum GPU memory allocated: 6.4 GB\n",
+      "-> Maximum CPU memory allocated: 9.9 GB\n"
+     ]
+    }
+   ],
+   "source": [
+    "def baseline_mmap():\n",
+    "    start_memory_tracking()\n",
+    "\n",
+    "    model = GPTModel(BASE_CONFIG) # load model on CPU\n",
+    "\n",
+    "    model.load_state_dict(\n",
+    "        torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True, mmap=True)\n",
+    "    )\n",
+    "    model.to(device) # Move model to GPU\n",
+    "    model.eval();\n",
+    "\n",
+    "    print_memory_usage()\n",
+    "\n",
+    "peak_memory_used = memory_usage_in_gb(baseline_mmap)\n",
+    "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "KUyK3QVRwmjR",
+    "outputId": "a77c191a-2f9e-4ae5-be19-8ce128e704e9"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Maximum GPU memory allocated: 12.8 GB\n",
+      "-> Maximum CPU memory allocated: 7.0 GB\n"
+     ]
+    }
+   ],
+   "source": [
+    "def baseline_mmap_2():\n",
+    "    start_memory_tracking()\n",
+    "\n",
+    "    model = GPTModel(BASE_CONFIG).to(device) # create model directly on the GPU\n",
+    "\n",
+    "    model.load_state_dict(\n",
+    "        torch.load(\"model.pth\", map_location=device, weights_only=True, mmap=True)\n",
+    "    )\n",
+    "    model.eval();\n",
+    "\n",
+    "    print_memory_usage()\n",
+    "\n",
+    "peak_memory_used = memory_usage_in_gb(baseline_mmap_2)\n",
+    "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    " \n",
+    "## 7. Other methods"
+   ]
+  },
   {
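
A note on the two new cells: baseline_mmap builds the model on the CPU, loads the memory-mapped checkpoint there, and only then moves the model to the GPU, whereas baseline_mmap_2 creates the model on the GPU and maps the checkpoint straight onto it. The recorded outputs show the trade-off: 6.4 GB GPU / 9.9 GB CPU for the first variant versus 12.8 GB GPU / 7.0 GB CPU for the second, presumably because in the second case the GPU briefly holds both the freshly initialized weights and the loaded checkpoint copy.

Both cells depend on GPTModel, BASE_CONFIG, and the helpers start_memory_tracking, print_memory_usage, and memory_usage_in_gb defined earlier in the notebook. For trying mmap=True outside the notebook, a minimal self-contained sketch follows; the stand-in model and the psutil-based RSS probe are illustrative assumptions, not part of this commit, so absolute numbers will differ from the outputs above.

    import os

    import psutil  # assumption: psutil is installed; used only as a rough RSS probe
    import torch
    import torch.nn as nn


    def cpu_memory_in_gb():
        # Resident set size of the current process in GB, a coarse stand-in
        # for the notebook's memory_usage_in_gb helper
        return psutil.Process(os.getpid()).memory_info().rss / 1024**3


    # Stand-in for GPTModel(BASE_CONFIG): any module with a sizable state dict
    model = nn.Sequential(*[nn.Linear(2048, 2048) for _ in range(16)])
    torch.save(model.state_dict(), "sketch_model.pth")

    before = cpu_memory_in_gb()

    # mmap=True (available since PyTorch 2.1) memory-maps the checkpoint
    # instead of reading it into RAM up front; tensor data is paged in from
    # disk as it is accessed
    state_dict = torch.load(
        "sketch_model.pth", map_location="cpu", weights_only=True, mmap=True
    )
    model.load_state_dict(state_dict)

    after = cpu_memory_in_gb()
    print(f"CPU memory increase during load: {after - before:.2f} GB")

One plausible reason mmap=True trails the sequential approaches from the earlier sections is that tensor data is pulled in from disk via page faults on first access, which can be slower than one large sequential read of the checkpoint file.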