diff --git a/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb b/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb
index a8b3d35..3782527 100644
--- a/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb
+++ b/ch05/08_memory_efficient_weight_loading/memory-efficient-state-dict.ipynb
@@ -752,7 +752,141 @@
    "metadata": {},
    "source": [
     "&nbsp;\n",
-    "## 6. Other methods"
+    "## 6. Using `mmap=True`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- As an intermediate or advanced `torch.load` user, you may wonder how the approaches above compare to the `mmap=True` setting in PyTorch\n",
+    "- The `mmap=True` setting in `torch.load` enables memory-mapped file I/O, which lets tensors read their data directly from disk storage on demand instead of loading the entire file into RAM, thus reducing memory usage\n",
+    "- However, in practice, I found it to be less memory-efficient than the sequential approaches above"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "7AX3vPrpv5c_",
+    "outputId": "e6fca10b-55c3-4e89-8674-075df5ce26e7"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Maximum GPU memory allocated: 6.4 GB\n",
+      "-> Maximum CPU memory allocated: 9.9 GB\n"
+     ]
+    }
+   ],
+   "source": [
+    "def baseline_mmap():\n",
+    "    start_memory_tracking()\n",
+    "\n",
+    "    model = GPTModel(BASE_CONFIG)  # Instantiate model on CPU\n",
+    "\n",
+    "    model.load_state_dict(\n",
+    "        torch.load(\"model.pth\", map_location=\"cpu\", weights_only=True, mmap=True)\n",
+    "    )\n",
+    "    model.to(device)  # Move model to GPU\n",
+    "    model.eval();\n",
+    "\n",
+    "    print_memory_usage()\n",
+    "\n",
+    "peak_memory_used = memory_usage_in_gb(baseline_mmap)\n",
+    "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "KUyK3QVRwmjR",
+    "outputId": "a77c191a-2f9e-4ae5-be19-8ce128e704e9"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Maximum GPU memory allocated: 12.8 GB\n",
+      "-> Maximum CPU memory allocated: 7.0 GB\n"
+     ]
+    }
+   ],
+   "source": [
+    "def baseline_mmap_2():\n",
+    "    start_memory_tracking()\n",
+    "\n",
+    "    model = GPTModel(BASE_CONFIG).to(device)  # Instantiate model directly on GPU\n",
+    "\n",
+    "    model.load_state_dict(\n",
+    "        torch.load(\"model.pth\", map_location=device, weights_only=True, mmap=True)\n",
+    "    )\n",
+    "    model.eval();\n",
+    "\n",
+    "    print_memory_usage()\n",
+    "\n",
+    "peak_memory_used = memory_usage_in_gb(baseline_mmap_2)\n",
+    "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")"
+   ]
+  },
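+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- If you want to experiment further, `mmap=True` can also be combined with the sequential, per-parameter copying used earlier in this notebook\n",
+    "- The cell below is a rough sketch of this idea (the `mmap_sequential` name and the copying loop are illustrative, and the result is not benchmarked here); because the state dict is memory-mapped, each tensor is only paged in from disk at the moment it is copied to the GPU"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def mmap_sequential():\n",
+    "    # Sketch: combine mmap=True with sequential, per-parameter copying\n",
+    "    # (illustrative and not benchmarked here)\n",
+    "    start_memory_tracking()\n",
+    "\n",
+    "    model = GPTModel(BASE_CONFIG).to(device)\n",
+    "\n",
+    "    state_dict = torch.load(\n",
+    "        \"model.pth\", map_location=\"cpu\", weights_only=True, mmap=True\n",
+    "    )\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        for name, param in model.named_parameters():\n",
+    "            if name in state_dict:\n",
+    "                # The memory-mapped tensor is paged in from disk here\n",
+    "                param.copy_(state_dict[name].to(device))\n",
+    "            else:\n",
+    "                print(f\"Warning: {name} not found in state_dict.\")\n",
+    "\n",
+    "    del state_dict  # Drop references to the memory-mapped tensors\n",
+    "    model.eval()\n",
+    "\n",
+    "    print_memory_usage()\n",
+    "\n",
+    "peak_memory_used = memory_usage_in_gb(mmap_sequential)\n",
+    "print(f\"-> Maximum CPU memory allocated: {peak_memory_used:.1f} GB\")"
+   ]
+  },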
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "&nbsp;\n",
+    "## 7. Other methods"
    ]
   },
   {