From 145322ded8311fab6f88dd731704d6473d8f50df Mon Sep 17 00:00:00 2001 From: casinca <47400729+casinca@users.noreply.github.com> Date: Tue, 29 Jul 2025 00:29:44 +0200 Subject: [PATCH] [Minor] Qwen3 typo & optim (#758) * typo * remove weight dict after loading --- ch05/11_qwen3/README.md | 2 +- ch05/11_qwen3/standalone-qwen3.ipynb | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ch05/11_qwen3/README.md b/ch05/11_qwen3/README.md index 33e8b10..ea77b87 100644 --- a/ch05/11_qwen3/README.md +++ b/ch05/11_qwen3/README.md @@ -255,7 +255,7 @@ The following table shows a performance comparison on an A100 for consequent `ge | Qwen3Model compiled | 107 | 1.99 GB |   -#### Pro tip 2: speed up inference with compilation +#### Pro tip 2: speed up inference with KV cache You can significantly boost inference performance using the KV cache `Qwen3Model` drop-in replacement when running the model on a CPU. (See my [Understanding and Coding the KV Cache in LLMs from Scratch](https://magazine.sebastianraschka.com/p/coding-the-kv-cache-in-llms) article to learn more about KV caches.) diff --git a/ch05/11_qwen3/standalone-qwen3.ipynb b/ch05/11_qwen3/standalone-qwen3.ipynb index 9dcff2f..d56655d 100644 --- a/ch05/11_qwen3/standalone-qwen3.ipynb +++ b/ch05/11_qwen3/standalone-qwen3.ipynb @@ -822,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "699cb1b8-a67d-49fb-80a6-0dad9d81f392", "metadata": { "colab": { @@ -936,7 +936,8 @@ " weights_dict.update(shard)\n", "\n", "load_weights_into_qwen(model, QWEN3_CONFIG, weights_dict)\n", - "model.to(device);" + "model.to(device)\n", + "del weights_dict" ] }, { @@ -1187,7 +1188,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -1201,7 +1202,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.6" } }, "nbformat": 4,