From be5e2a33310b9e6d24dc96f3478c97ae15ff0daf Mon Sep 17 00:00:00 2001
From: Sebastian Raschka <mail@sebastianraschka.com>
Date: Tue, 17 Feb 2026 19:44:56 -0500
Subject: [PATCH] Readability and code quality improvements (#959)

* Consistent dataset naming

* consistent section headers
---
 .gitignore                                    |  3 ++
 ch02/01_main-chapter-code/ch02.ipynb          | 12 ++++-
 .../exercise-solutions.ipynb                  |  8 +--
 .../compare-bpe-tiktoken.ipynb                | 24 +++++----
 ch03/01_main-chapter-code/ch03.ipynb          | 22 ++++++--
 .../exercise-solutions.ipynb                  | 11 ++--
 .../mha-implementations.ipynb                 | 53 +++++++++++++------
 .../tests/test_mha_implementations.py         |  8 +--
 ch04/01_main-chapter-code/ch04.ipynb          | 16 ++++++
 .../exercise-solutions.ipynb                  | 11 ++--
 ch04/04_gqa/memory_estimator_gqa.py           | 14 ++---
 ch04/04_gqa/plot_memory_estimates_gqa.py      |  6 +--
 ch04/05_mla/memory_estimator_mla.py           | 20 +++----
 ch04/05_mla/plot_memory_estimates_mla.py      | 20 +++----
 ch04/06_swa/memory_estimator_swa.py           | 20 +++----
 ch04/06_swa/plot_memory_estimates_swa.py      | 36 ++++++-------
 ch04/07_moe/memory_estimator_moe.py           | 22 ++++----
 ch04/07_moe/plot_memory_estimates_moe.py      | 16 +++---
 .../plot_memory_estimates_gated_deltanet.py   | 26 ++++-----
 ch05/01_main-chapter-code/ch05.ipynb          | 20 ++++---
 .../exercise-solutions.ipynb                  | 20 ++++---
 .../converting-gpt-to-llama2.ipynb            |  8 +--
 .../converting-llama2-to-llama3.ipynb         | 10 ++--
 ch05/07_gpt_to_llama/standalone-llama32.ipynb |  6 +--
 ch05/07_gpt_to_llama/tests/test_llama32_nb.py | 12 ++---
 .../standalone-qwen3-moe-plus-kvcache.ipynb   | 10 ++--
 ch05/11_qwen3/standalone-qwen3-moe.ipynb      | 10 ++--
 .../standalone-qwen3-plus-kvcache.ipynb       | 12 ++---
 ch05/11_qwen3/standalone-qwen3.ipynb          | 10 ++--
 ch05/11_qwen3/tests/test_qwen3_kvcache_nb.py  | 12 ++---
 ch05/11_qwen3/tests/test_qwen3_nb.py          | 12 ++---
 .../standalone-gemma3-plus-kvcache.ipynb      | 14 ++---
 ch05/12_gemma3/standalone-gemma3.ipynb        | 14 ++---
 ch05/12_gemma3/tests/test_gemma3_kv_nb.py     | 12 ++---
 ch05/12_gemma3/tests/test_gemma3_nb.py        | 12 ++---
 .../standalone-olmo3-plus-kv-cache.ipynb      |  8 +--
 ch05/13_olmo3/standalone-olmo3.ipynb          |  8 +--
 ch05/13_olmo3/tests/olmo3_layer_debugger.py   | 10 ++--
 ch05/13_olmo3/tests/test_olmo3_kvcache_nb.py  | 12 ++---
 ch05/13_olmo3/tests/test_olmo3_nb.py          | 12 ++---
 ch06/01_main-chapter-code/ch06.ipynb          | 25 ++++++---
 .../additional_experiments.py                 | 16 +++---
 .../sklearn-baseline.ipynb                    |  3 +-
 .../train_bert_hf_spam.py                     | 14 ++---
 ch07/01_main-chapter-code/ch07.ipynb          | 30 +++++++++--
 .../create-passive-voice-entries.ipynb        |  8 +--
 .../dpo-from-scratch.ipynb                    |  2 +-
 .../reflection-gpt4.ipynb                     | 26 ++++-----
 48 files changed, 419 insertions(+), 297 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8cdcb67..943f8f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Reports
+reports/
+
 # Configs and keys
 .chainlit
 ch05/07_gpt_to_llama/config.json
diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index 272e4d7..53241d1 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -79,6 +79,7 @@
    "id": "2417139b-2357-44d2-bd67-23f5d7f52ae7",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 2.1 Understanding word embeddings"
    ]
   },
@@ -128,6 +129,7 @@
    "id": "eddbb984-8d23-40c5-bbfa-c3c379e7eec3",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 2.2 Tokenizing text"
    ]
   },
@@ -445,6 +447,7 @@
    "id": "0b5ce8fe-3a07-4f2a-90f1-a0321ce3a231",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 2.3 Converting tokens into token IDs"
    ]
   },
@@ -738,6 +741,7 @@
    "id": "4b821ef8-4d53-43b6-a2b2-aef808c343c7",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 2.4 Adding special context tokens"
    ]
   },
@@ -1013,6 +1017,7 @@
    "id": "5c4ba34b-170f-4e71-939b-77aabb776f14",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 2.5 BytePair encoding"
    ]
   },
@@ -1528,6 +1533,7 @@
    "id": "2cd2fcda-2fda-4aa8-8bc8-de1e496f9db1",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 2.7 Creating token embeddings"
    ]
   },
@@ -1715,6 +1721,7 @@
    "id": "c393d270-b950-4bc8-99ea-97d74f2ea0f6",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 2.8 Encoding word positions"
    ]
   },
@@ -1945,7 +1952,8 @@
    "id": "63230f2e-258f-4497-9e2e-8deee4530364",
    "metadata": {},
    "source": [
-    "# Summary and takeaways"
+    "&nbsp;\n",
+    "## Summary and takeaways"
    ]
   },
   {
@@ -1977,7 +1985,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch02/01_main-chapter-code/exercise-solutions.ipynb b/ch02/01_main-chapter-code/exercise-solutions.ipynb
index 77b3774..be23260 100644
--- a/ch02/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch02/01_main-chapter-code/exercise-solutions.ipynb
@@ -63,7 +63,8 @@
    "id": "6f678e62-7bcb-4405-86ae-dce94f494303",
    "metadata": {},
    "source": [
-    "# Exercise 2.1"
+    "&nbsp;\n",
+    "## Exercise 2.1"
    ]
   },
   {
@@ -273,7 +274,8 @@
    "id": "29e5034a-95ed-46d8-9972-589354dc9fd4",
    "metadata": {},
    "source": [
-    "# Exercise 2.2"
+    "&nbsp;\n",
+    "## Exercise 2.2"
    ]
   },
   {
@@ -407,7 +409,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
index 75f335f..bd42bb1 100644
--- a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
+++ b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
@@ -54,7 +54,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## Using BPE from `tiktoken`"
+    "## 1. Using BPE from `tiktoken`"
    ]
   },
   {
@@ -157,7 +157,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## Using the original BPE implementation used in GPT-2"
+    "## 2. Using the original BPE implementation used in GPT-2"
    ]
   },
   {
@@ -247,7 +247,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## Using the BPE via Hugging Face transformers"
+    "## 3. Using the BPE via Hugging Face transformers"
    ]
   },
   {
@@ -355,7 +355,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## Using my own from-scratch BPE tokenizer"
+    "## 4. Using my own from-scratch BPE tokenizer"
    ]
   },
   {
@@ -449,7 +449,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## A quick performance benchmark"
+    "## 5. A quick performance benchmark"
    ]
   },
   {
@@ -468,7 +468,8 @@
    "id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e",
    "metadata": {},
    "source": [
-    "### Original OpenAI GPT-2 tokenizer"
+    "&nbsp;\n",
+    "### 5.1 Original OpenAI GPT-2 tokenizer"
    ]
   },
   {
@@ -494,7 +495,8 @@
    "id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90",
    "metadata": {},
    "source": [
-    "### Tiktoken OpenAI GPT-2 tokenizer"
+    "&nbsp;\n",
+    "### 5.2 Tiktoken OpenAI GPT-2 tokenizer"
    ]
   },
   {
@@ -520,7 +522,8 @@
    "id": "0c748de8-273e-42df-b078-3a510106da60",
    "metadata": {},
    "source": [
-    "### Hugging Face OpenAI GPT-2 tokenizer"
+    "&nbsp;\n",
+    "### 5.3 Hugging Face OpenAI GPT-2 tokenizer"
    ]
   },
   {
@@ -614,7 +617,8 @@
    "id": "91ac2876-f36e-498c-bd75-8597a39f2d4b",
    "metadata": {},
    "source": [
-    "### My own GPT-2 tokenizer (for educational purposes)"
+    "&nbsp;\n",
+    "### 5.4 My own GPT-2 tokenizer (for educational purposes)"
    ]
   },
   {
@@ -652,7 +656,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch03/01_main-chapter-code/ch03.ipynb b/ch03/01_main-chapter-code/ch03.ipynb
index c01d0f2..5a7823c 100644
--- a/ch03/01_main-chapter-code/ch03.ipynb
+++ b/ch03/01_main-chapter-code/ch03.ipynb
@@ -85,6 +85,7 @@
    "id": "ecc4dcee-34ea-4c05-9085-2f8887f70363",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 3.1 The problem with modeling long sequences"
    ]
   },
@@ -127,6 +128,7 @@
    "id": "3602c585-b87a-41c7-a324-c5e8298849df",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 3.2 Capturing data dependencies with attention mechanisms"
    ]
   },
@@ -168,6 +170,7 @@
    "id": "5efe05ff-b441-408e-8d66-cde4eb3397e3",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 3.3 Attending to different parts of the input with self-attention"
    ]
   },
@@ -176,6 +179,7 @@
    "id": "6d9af516-7c37-4400-ab53-34936d5495a9",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.3.1 A simple self-attention mechanism without trainable weights"
    ]
   },
@@ -216,7 +220,7 @@
    "id": "ff856c58-8382-44c7-827f-798040e6e697",
    "metadata": {},
    "source": [
-    "- By convention, the unnormalized attention weights are referred to as **\"attention scores\"** whereas the normalized attention scores, which sum to 1, are referred to as **\"attention weights\"**\n"
+    "- By convention, the unnormalized attention weights are referred to as **\"attention scores\"** whereas the normalized attention scores, which sum to 1, are referred to as **\"attention weights\"**"
    ]
   },
   {
@@ -503,6 +507,7 @@
    "id": "5a454262-40eb-430e-9ca4-e43fb8d6cd89",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.3.2 Computing attention weights for all input tokens"
    ]
   },
@@ -739,6 +744,7 @@
    "id": "a303b6fb-9f7e-42bb-9fdb-2adabf0a6525",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 3.4 Implementing self-attention with trainable weights"
    ]
   },
@@ -763,6 +769,7 @@
    "id": "2b90a77e-d746-4704-9354-1ddad86e6298",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.4.1 Computing the attention weights step by step"
    ]
   },
@@ -1046,6 +1053,7 @@
    "id": "9d7b2907-e448-473e-b46c-77735a7281d8",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.4.2 Implementing a compact SelfAttention class"
    ]
   },
@@ -1179,6 +1187,7 @@
    "id": "c5025b37-0f2c-4a67-a7cb-1286af7026ab",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 3.5 Hiding future words with causal attention"
    ]
   },
@@ -1203,6 +1212,7 @@
    "id": "82f405de-cd86-4e72-8f3c-9ea0354946ba",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.5.1 Applying a causal attention mask"
    ]
   },
@@ -1455,6 +1465,7 @@
    "id": "7636fc5f-6bc6-461e-ac6a-99ec8e3c0912",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.5.2 Masking additional attention weights with dropout"
    ]
   },
@@ -1554,6 +1565,7 @@
    "id": "cdc14639-5f0f-4840-aa9d-8eb36ea90fb7",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.5.3 Implementing a compact causal self-attention class"
    ]
   },
@@ -1679,6 +1691,7 @@
    "id": "c8bef90f-cfd4-4289-b0e8-6a00dc9be44c",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 3.6 Extending single-head attention to multi-head attention"
    ]
   },
@@ -1687,6 +1700,7 @@
    "id": "11697757-9198-4a1c-9cee-f450d8bbd3b9",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.6.1 Stacking multiple single-head attention layers"
    ]
   },
@@ -1776,6 +1790,7 @@
    "id": "6836b5da-ef82-4b4c-bda1-72a462e48d4e",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 3.6.2 Implementing multi-head attention with weight splits"
    ]
   },
@@ -2032,7 +2047,8 @@
    "id": "dec671bf-7938-4304-ad1e-75d9920e7f43",
    "metadata": {},
    "source": [
-    "# Summary and takeaways"
+    "&nbsp;\n",
+    "## Summary and takeaways"
    ]
   },
   {
@@ -2061,7 +2077,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch03/01_main-chapter-code/exercise-solutions.ipynb b/ch03/01_main-chapter-code/exercise-solutions.ipynb
index b0537b9..f35431d 100644
--- a/ch03/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch03/01_main-chapter-code/exercise-solutions.ipynb
@@ -54,7 +54,8 @@
    "id": "33dfa199-9aee-41d4-a64b-7e3811b9a616",
    "metadata": {},
    "source": [
-    "# Exercise 3.1"
+    "&nbsp;\n",
+    "## Exercise 3.1"
    ]
   },
   {
@@ -209,7 +210,8 @@
    "id": "33543edb-46b5-4b01-8704-f7f101230544",
    "metadata": {},
    "source": [
-    "# Exercise 3.2"
+    "&nbsp;\n",
+    "## Exercise 3.2"
    ]
   },
   {
@@ -266,7 +268,8 @@
    "id": "92bdabcb-06cf-4576-b810-d883bbd313ba",
    "metadata": {},
    "source": [
-    "# Exercise 3.3"
+    "&nbsp;\n",
+    "## Exercise 3.3"
    ]
   },
   {
@@ -339,7 +342,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
index 1df1f1e..8dc4b57 100644
--- a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
+++ b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
@@ -117,7 +117,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 1) CausalAttention MHA wrapper class from chapter 3"
+    "## 1. CausalAttention MHA wrapper class from chapter 3"
    ]
   },
   {
@@ -208,7 +208,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 2) The multi-head attention class from chapter 3"
+    "## 2. The multi-head attention class from chapter 3"
    ]
   },
   {
@@ -311,7 +311,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 3) An alternative multi-head attention with combined weights"
+    "## 3. An alternative multi-head attention with combined weights"
    ]
   },
   {
@@ -435,7 +435,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 4) Multi-head attention with Einsum\n",
+    "## 4. Multi-head attention with Einsum\n",
     "\n",
     "- Implementing multi-head attention using Einstein summation via [`torch.einsum`](https://pytorch.org/docs/stable/generated/torch.einsum.html)"
    ]
@@ -567,7 +567,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 5) Multi-head attention with PyTorch's scaled dot product attention and FlashAttention"
+    "## 5. Multi-head attention with PyTorch's scaled dot product attention and FlashAttention"
    ]
   },
   {
@@ -676,7 +676,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 6) PyTorch's scaled dot product attention without FlashAttention\n",
+    "## 6. PyTorch's scaled dot product attention without FlashAttention\n",
     "\n",
     "- This is similar to above, except that we disable FlashAttention by passing an explicit causal mask"
    ]
@@ -785,7 +785,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 7) Using PyTorch's torch.nn.MultiheadAttention"
+    "## 7. Using PyTorch's torch.nn.MultiheadAttention"
    ]
   },
   {
@@ -883,7 +883,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 8) Using PyTorch's torch.nn.MultiheadAttention with `scaled_dot_product_attention`"
+    "## 8. Using PyTorch's torch.nn.MultiheadAttention with `scaled_dot_product_attention`"
    ]
   },
   {
@@ -948,7 +948,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## 9) Using PyTorch's FlexAttention\n",
+    "## 9. Using PyTorch's FlexAttention\n",
     "\n",
     "- See [FlexAttention: The Flexibility of PyTorch with the Performance of FlashAttention](https://pytorch.org/blog/flexattention/) to learn more about FlexAttention\n",
     "- FlexAttention caveat: It currently doesn't support dropout\n",
@@ -1108,7 +1108,18 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## Quick speed comparison (M3 Macbook Air CPU)"
+    "## 10. Quick speed comparisons"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "992e28f4-a6b9-4dd3-9705-30d0b9f4b5f0",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "&nbsp;\n",
+    "\n",
+    "### 10.1 Speed comparisons on M3 Macbook Air CPU"
    ]
   },
   {
@@ -1361,7 +1372,7 @@
     "<br>\n",
     "&nbsp;\n",
     "\n",
-    "## Quick speed comparison (Nvidia A100 GPU)"
+    "### 10.2 Quick speed comparison on Nvidia A100 GPU"
    ]
   },
   {
@@ -1643,7 +1654,18 @@
     "&nbsp;\n",
     "\n",
     "\n",
-    "# Visualizations"
+    "## 11. Visualizations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6baf5ce-45ac-4e26-9523-5c32b82dc784",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "&nbsp;\n",
+    "\n",
+    "### 11.1 Visualization utility functions"
    ]
   },
   {
@@ -1752,7 +1774,8 @@
     "id": "4df834dc"
    },
    "source": [
-    "## Speed comparison (Nvidia A100 GPU) with warmup (forward pass only)"
+    "&nbsp;\n",
+    "### 11.2 Speed comparison (Nvidia A100 GPU) with warmup (forward pass only)"
    ]
   },
   {
@@ -1834,7 +1857,7 @@
     "&nbsp;\n",
     "\n",
     "\n",
-    "## Speed comparison (Nvidia A100 GPU) with warmup (forward and backward pass)"
+    "### 11.3 Speed comparison (Nvidia A100 GPU) with warmup (forward and backward pass)"
    ]
   },
   {
@@ -1920,7 +1943,7 @@
     "&nbsp;\n",
     "\n",
     "\n",
-    "## Speed comparison (Nvidia A100 GPU) with warmup and compilation (forward and backward pass)"
+    "### 11.4 Speed comparison (Nvidia A100 GPU) with warmup and compilation (forward and backward pass)"
    ]
   },
   {
diff --git a/ch03/02_bonus_efficient-multihead-attention/tests/test_mha_implementations.py b/ch03/02_bonus_efficient-multihead-attention/tests/test_mha_implementations.py
index 03985ea..b8f642c 100644
--- a/ch03/02_bonus_efficient-multihead-attention/tests/test_mha_implementations.py
+++ b/ch03/02_bonus_efficient-multihead-attention/tests/test_mha_implementations.py
@@ -7,7 +7,7 @@ from llms_from_scratch.utils import import_definitions_from_notebook
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "mha-implementations.ipynb")
     return mod
@@ -31,12 +31,12 @@ def copy_weights(from_mha, to_mha):
         (1024, 512, 2, 4, 8, 789),   # d_in > d_out
     ],
 )
-def test_mha_einsum_matches_ch03(d_in, d_out, batch, seq_len, num_heads, seed, nb_imports):
+def test_mha_einsum_matches_ch03(d_in, d_out, batch, seq_len, num_heads, seed, import_notebook_defs):
     torch.manual_seed(seed)
 
     x = torch.randn(batch, seq_len, d_in)
 
-    mha_linear = nb_imports.Ch03_MHA(
+    mha_linear = import_notebook_defs.Ch03_MHA(
         d_in=d_in,
         d_out=d_out,
         context_length=seq_len,
@@ -45,7 +45,7 @@ def test_mha_einsum_matches_ch03(d_in, d_out, batch, seq_len, num_heads, seed, n
         qkv_bias=False,
     ).eval()
 
-    mha_einsum = nb_imports.MHAEinsum(
+    mha_einsum = import_notebook_defs.MHAEinsum(
         d_in=d_in,
         d_out=d_out,
         context_length=seq_len,
diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb
index 7355fc4..59bb28c 100644
--- a/ch04/01_main-chapter-code/ch04.ipynb
+++ b/ch04/01_main-chapter-code/ch04.ipynb
@@ -73,6 +73,7 @@
    "id": "53fe99ab-0bcf-4778-a6b5-6db81fb826ef",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 4.1 Coding an LLM architecture"
    ]
   },
@@ -323,6 +324,7 @@
    "id": "f8332a00-98da-4eb4-b882-922776a89917",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 4.2 Normalizing activations with layer normalization"
    ]
   },
@@ -606,6 +608,7 @@
    "id": "11190e7d-8c29-4115-824a-e03702f9dd54",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 4.3 Implementing a feed forward network with GELU activations"
    ]
   },
@@ -789,6 +792,7 @@
    "id": "4ffcb905-53c7-4886-87d2-4464c5fecf89",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 4.4 Adding shortcut connections"
    ]
   },
@@ -950,6 +954,7 @@
    "id": "cae578ca-e564-42cf-8635-a2267047cdff",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 4.5 Connecting attention and linear layers in a transformer block"
    ]
   },
@@ -1068,6 +1073,7 @@
    "id": "46618527-15ac-4c32-ad85-6cfea83e006e",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 4.6 Coding the GPT model"
    ]
   },
@@ -1332,6 +1338,7 @@
    "id": "da5d9bc0-95ab-45d4-9378-417628d86e35",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 4.7 Generating text"
    ]
   },
@@ -1519,11 +1526,20 @@
    "id": "a35278b6-9e5c-480f-83e5-011a1173648f",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## Summary and takeaways\n",
     "\n",
     "- See the [./gpt.py](./gpt.py) script, a self-contained script containing the GPT model we implement in this Jupyter notebook\n",
     "- You can find the exercise solutions in [./exercise-solutions.ipynb](./exercise-solutions.ipynb)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4821ac83-ef84-42c4-a327-32bf2820a8e5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/ch04/01_main-chapter-code/exercise-solutions.ipynb b/ch04/01_main-chapter-code/exercise-solutions.ipynb
index 7f514d0..fad48ba 100644
--- a/ch04/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch04/01_main-chapter-code/exercise-solutions.ipynb
@@ -53,7 +53,8 @@
    "id": "5fea8be3-30a1-4623-a6d7-b095c6c1092e",
    "metadata": {},
    "source": [
-    "# Exercise 4.1: Parameters in the feed forward versus attention module"
+    "&nbsp;\n",
+    "## Exercise 4.1: Parameters in the feed forward versus attention module"
    ]
   },
   {
@@ -182,7 +183,8 @@
    "id": "0f7b7c7f-0fa1-4d30-ab44-e499edd55b6d",
    "metadata": {},
    "source": [
-    "# Exercise 4.2: Initialize larger GPT models"
+    "&nbsp;\n",
+    "## Exercise 4.2: Initialize larger GPT models"
    ]
   },
   {
@@ -329,7 +331,8 @@
    "id": "f5f2306e-5dc8-498e-92ee-70ae7ec37ac1",
    "metadata": {},
    "source": [
-    "# Exercise 4.3: Using separate dropout parameters"
+    "&nbsp;\n",
+    "## Exercise 4.3: Using separate dropout parameters"
    ]
   },
   {
@@ -451,7 +454,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch04/04_gqa/memory_estimator_gqa.py b/ch04/04_gqa/memory_estimator_gqa.py
index 276cc31..380b4a2 100644
--- a/ch04/04_gqa/memory_estimator_gqa.py
+++ b/ch04/04_gqa/memory_estimator_gqa.py
@@ -18,13 +18,13 @@ DTYPE_BYTES = {
 }
 
 
-def bytes_convert(n):
+def convert_bytes(n):
     gb = n / (1000 ** 3)
     return f"{gb:,.2f} GB"
 
 
-def kv_bytes_total(batch, context_length, emb_dim, n_heads,
-                   n_kv_heads, n_layers, bytes_per_elem):
+def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads,
+                             n_kv_heads, n_layers, bytes_per_elem):
     head_dim = math.ceil(emb_dim / n_heads)
     per_layer = batch * context_length * head_dim * n_kv_heads * 2 * bytes_per_elem
     return per_layer * n_layers
@@ -58,7 +58,7 @@ def main():
     n_kv_heads_mha = cfg["n_heads"]
     n_kv_heads_gqa = cfg["n_heads"] // cfg["n_kv_groups"]
 
-    total_mha = kv_bytes_total(
+    total_mha = calc_kv_bytes_total(
         args.batch_size,
         cfg["context_length"],
         cfg["emb_dim"],
@@ -68,7 +68,7 @@ def main():
         bytes_per_elem,
     )
 
-    total_gqa = kv_bytes_total(
+    total_gqa = calc_kv_bytes_total(
         args.batch_size,
         cfg["context_length"],
         cfg["emb_dim"],
@@ -91,8 +91,8 @@ def main():
     print()
 
     print("==== KV-cache totals across all layers ====")
-    print(f"MHA total KV cache  : {bytes_convert(total_mha)}")
-    print(f"GQA total KV cache  : {bytes_convert(total_gqa)}")
+    print(f"MHA total KV cache  : {convert_bytes(total_mha)}")
+    print(f"GQA total KV cache  : {convert_bytes(total_gqa)}")
     print(f"Ratio (MHA / GQA)   : {ratio:,.2f}x")
     print(f"Savings (GQA vs MHA): {savings*100:,.2f}%")
 
diff --git a/ch04/04_gqa/plot_memory_estimates_gqa.py b/ch04/04_gqa/plot_memory_estimates_gqa.py
index f114180..53b8a94 100644
--- a/ch04/04_gqa/plot_memory_estimates_gqa.py
+++ b/ch04/04_gqa/plot_memory_estimates_gqa.py
@@ -8,7 +8,7 @@
 import matplotlib.pyplot as plt
 
 # Import from ./memory_estimator.py
-from memory_estimator_gqa import kv_bytes_total, DTYPE_BYTES
+from memory_estimator_gqa import calc_kv_bytes_total, DTYPE_BYTES
 
 
 def bytes_convert(n):
@@ -36,7 +36,7 @@ def plot_abs_kv_vs_context_multi_groups():
 
     mha_gb = []
     for L in context_lengths:
-        total_mha = kv_bytes_total(
+        total_mha = calc_kv_bytes_total(
             batch_size, L, emb_dim, n_heads,
             n_heads,  # MHA: n_kv_heads = n_heads
             n_layers, bytes_per_elem
@@ -52,7 +52,7 @@ def plot_abs_kv_vs_context_multi_groups():
         n_kv_heads = n_heads // g
         gqa_gb = []
         for L in context_lengths:
-            total_gqa = kv_bytes_total(
+            total_gqa = calc_kv_bytes_total(
                 batch_size, L, emb_dim, n_heads,
                 n_kv_heads, n_layers, bytes_per_elem
             )
diff --git a/ch04/05_mla/memory_estimator_mla.py b/ch04/05_mla/memory_estimator_mla.py
index f9ab9f5..bf903c8 100644
--- a/ch04/05_mla/memory_estimator_mla.py
+++ b/ch04/05_mla/memory_estimator_mla.py
@@ -17,20 +17,20 @@ DTYPE_BYTES = {
 }
 
 
-def bytes_convert(n):
+def convert_bytes(n):
     gb = n / (1000 ** 3)
     return f"{gb:,.2f} GB"
 
 
-def kv_bytes_total(batch, context_length, emb_dim, n_heads,
-                   n_kv_heads, n_layers, bytes_per_elem):
+def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads,
+                             n_kv_heads, n_layers, bytes_per_elem):
     # Generic KV-cache: per-head dim is embed_dim / n_heads, times 2 for K and V
     head_dim = math.ceil(emb_dim / n_heads)
     per_layer = batch * context_length * head_dim * n_kv_heads * 2 * bytes_per_elem
     return per_layer * n_layers
 
 
-def mla_bytes_total(batch, context_length, n_layers, latent_dim, bytes_per_elem):
+def calc_mla_bytes_total(batch, context_length, n_layers, latent_dim, bytes_per_elem):
     # Simple MLA (per-token compressed latent)
     # bytes ≈ batch × seqlen × n_layers × latent_dim × bytes_per_elem
     return batch * context_length * n_layers * latent_dim * bytes_per_elem
@@ -66,7 +66,7 @@ def main():
     n_kv_heads_mha = cfg["n_heads"]
     n_kv_heads_gqa = cfg["n_heads"] // cfg["n_kv_groups"]
 
-    total_mha = kv_bytes_total(
+    total_mha = calc_kv_bytes_total(
         args.batch_size,
         cfg["context_length"],
         cfg["emb_dim"],
@@ -76,7 +76,7 @@ def main():
         bytes_per_elem,
     )
 
-    total_gqa = kv_bytes_total(
+    total_gqa = calc_kv_bytes_total(
         args.batch_size,
         cfg["context_length"],
         cfg["emb_dim"],
@@ -86,7 +86,7 @@ def main():
         bytes_per_elem,
     )
 
-    total_mla = mla_bytes_total(
+    total_mla = calc_mla_bytes_total(
         args.batch_size,
         cfg["context_length"],
         cfg["n_layers"],
@@ -110,9 +110,9 @@ def main():
     print()
 
     print("==== KV-cache totals across all layers ====")
-    print(f"MHA total KV cache  : {bytes_convert(total_mha)}")
-    print(f"GQA total KV cache  : {bytes_convert(total_gqa)}")
-    print(f"MLA total KV cache  : {bytes_convert(total_mla)}")
+    print(f"MHA total KV cache  : {convert_bytes(total_mha)}")
+    print(f"GQA total KV cache  : {convert_bytes(total_gqa)}")
+    print(f"MLA total KV cache  : {convert_bytes(total_mla)}")
     print(f"Ratio (MHA / GQA)   : {ratio:,.2f}x")
     print(f"Savings (GQA vs MHA): {savings*100:,.2f}%")
     print(f"Ratio (MHA / MLA)   : {ratio_mha_mla:,.2f}x")
diff --git a/ch04/05_mla/plot_memory_estimates_mla.py b/ch04/05_mla/plot_memory_estimates_mla.py
index e4c4208..1120519 100644
--- a/ch04/05_mla/plot_memory_estimates_mla.py
+++ b/ch04/05_mla/plot_memory_estimates_mla.py
@@ -15,18 +15,18 @@ DTYPE_BYTES = {
 }
 
 
-def bytes_to_gb(n_bytes):
+def convert_bytes_to_gb(n_bytes):
     return n_bytes / (1000. ** 3)
 
 
-def kv_bytes_total_mha(batch, context_length, emb_dim, n_heads,
-                       n_layers, bytes_per_elem):
+def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_heads,
+                                 n_layers, bytes_per_elem):
     head_dim = emb_dim / n_heads
     per_layer = batch * context_length * head_dim * n_heads * 2 * bytes_per_elem
     return per_layer * n_layers
 
 
-def kv_bytes_total_mla(batch, context_length, n_layers, latent_dim, bytes_per_elem):
+def calc_kv_bytes_total_mla(batch, context_length, n_layers, latent_dim, bytes_per_elem):
     return batch * context_length * n_layers * latent_dim * bytes_per_elem
 
 
@@ -45,27 +45,27 @@ def plot_abs_kv_vs_context_multiple():
 
     mha_gb = []
     for L in context_lengths:
-        total_mha = kv_bytes_total_mha(
+        total_mha = calc_kv_bytes_total_mha(
             batch_size, L, emb_dim, n_heads, n_layers, bytes_per_elem
         )
-        mha_gb.append(bytes_to_gb(total_mha))
+        mha_gb.append(convert_bytes_to_gb(total_mha))
 
     latent_dims = [1024, 512, 256, 64]
     plt.figure()
     plt.plot(context_lengths, mha_gb, marker="o", label="MHA (KV total)")
 
     L_ref = context_lengths[-1]
-    total_mha_ref = kv_bytes_total_mha(batch_size, L_ref, emb_dim, n_heads, n_layers, bytes_per_elem)
+    total_mha_ref = calc_kv_bytes_total_mha(batch_size, L_ref, emb_dim, n_heads, n_layers, bytes_per_elem)
 
     for latent_dim in latent_dims:
         mla_gb = []
         for L in context_lengths:
-            total_mla = kv_bytes_total_mla(
+            total_mla = calc_kv_bytes_total_mla(
                 batch_size, L, n_layers, latent_dim, bytes_per_elem
             )
-            mla_gb.append(bytes_to_gb(total_mla))
+            mla_gb.append(convert_bytes_to_gb(total_mla))
 
-        total_mla_ref = kv_bytes_total_mla(batch_size, L_ref, n_layers, latent_dim, bytes_per_elem)
+        total_mla_ref = calc_kv_bytes_total_mla(batch_size, L_ref, n_layers, latent_dim, bytes_per_elem)
         comp = total_mha_ref / total_mla_ref if total_mla_ref != 0 else float("inf")
 
         plt.plot(context_lengths, mla_gb, marker="o",
diff --git a/ch04/06_swa/memory_estimator_swa.py b/ch04/06_swa/memory_estimator_swa.py
index 2686c28..2401433 100644
--- a/ch04/06_swa/memory_estimator_swa.py
+++ b/ch04/06_swa/memory_estimator_swa.py
@@ -17,12 +17,12 @@ DTYPE_BYTES = {
 }
 
 
-def bytes_convert(n):
+def convert_bytes(n):
     gb = n / (1000 ** 3)
     return f"{gb:,.2f} GB"
 
 
-def kv_bytes_per_layer(batch, context_length, head_dim, n_kv_heads, bytes_per_elem):
+def calc_kv_bytes_per_layer(batch, context_length, head_dim, n_kv_heads, bytes_per_elem):
     # KV = batch * tokens * head_dim * n_kv_heads * 2 (K,V) * bytes
     return batch * context_length * head_dim * n_kv_heads * 2 * bytes_per_elem
 
@@ -64,10 +64,10 @@ def estimate_totals(context_length, sliding_window_size, emb_dim, n_heads, n_lay
     L = context_length
 
     # Per-layer costs
-    per_mha_full = kv_bytes_per_layer(batch_size, L, head_dim, n_kv_heads_mha, bytes_per_elem)
-    per_gqa_full = kv_bytes_per_layer(batch_size, L, head_dim, n_kv_heads_gqa, bytes_per_elem)
-    per_mha_swa = kv_bytes_per_layer(batch_size, eff_W, head_dim, n_kv_heads_mha, bytes_per_elem)
-    per_gqa_swa = kv_bytes_per_layer(batch_size, eff_W, head_dim, n_kv_heads_gqa, bytes_per_elem)
+    per_mha_full = calc_kv_bytes_per_layer(batch_size, L, head_dim, n_kv_heads_mha, bytes_per_elem)
+    per_gqa_full = calc_kv_bytes_per_layer(batch_size, L, head_dim, n_kv_heads_gqa, bytes_per_elem)
+    per_mha_swa = calc_kv_bytes_per_layer(batch_size, eff_W, head_dim, n_kv_heads_mha, bytes_per_elem)
+    per_gqa_swa = calc_kv_bytes_per_layer(batch_size, eff_W, head_dim, n_kv_heads_gqa, bytes_per_elem)
 
     # Totals
     total_mha_allfull = per_mha_full * n_layers
@@ -140,10 +140,10 @@ def main():
     print()
 
     print("==== KV-cache totals across all layers ====")
-    print(f"MHA KV total           : {bytes_convert(res['total_mha_allfull'])}")
-    print(f"GQA KV total           : {bytes_convert(res['total_gqa_allfull'])}")
-    print(f"MHA + SWA (ratio {args.swa_ratio})  : {bytes_convert(res['total_mixed_mha'])}")
-    print(f"GQA + SWA (ratio {args.swa_ratio})  : {bytes_convert(res['total_mixed_gqa'])}")
+    print(f"MHA KV total           : {convert_bytes(res['total_mha_allfull'])}")
+    print(f"GQA KV total           : {convert_bytes(res['total_gqa_allfull'])}")
+    print(f"MHA + SWA (ratio {args.swa_ratio})  : {convert_bytes(res['total_mixed_mha'])}")
+    print(f"GQA + SWA (ratio {args.swa_ratio})  : {convert_bytes(res['total_mixed_gqa'])}")
     print()
 
 
diff --git a/ch04/06_swa/plot_memory_estimates_swa.py b/ch04/06_swa/plot_memory_estimates_swa.py
index b75f0cf..9636bf6 100644
--- a/ch04/06_swa/plot_memory_estimates_swa.py
+++ b/ch04/06_swa/plot_memory_estimates_swa.py
@@ -24,7 +24,7 @@ DTYPE_BYTES = {
 }
 
 
-def bytes_to_gb(n_bytes):
+def convert_bytes_to_gb(n_bytes):
     return n_bytes / (1000.0 ** 3)
 
 
@@ -39,22 +39,22 @@ def parse_ratio(ratio_str):
         raise ValueError("--swa_ratio must be in the form 'a:b' with nonnegative integers and a+b>0")
 
 
-def kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, bytes_per_elem):
+def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, bytes_per_elem):
     # For MHA, n_kv_heads = n_heads, which cancels out:
     # total = B * L * E * 2 (K,V) * bytes * n_layers
     return batch * context_length * emb_dim * 2 * bytes_per_elem * n_layers
 
 
-def kv_bytes_total_gqa(
+def calc_kv_bytes_total_gqa(
     batch, context_length, emb_dim, n_layers, bytes_per_elem, n_kv_groups
 ):
     # For GQA, n_kv_heads = n_heads / n_kv_groups
     # => scale the MHA total by 1 / n_kv_groups
-    base = kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, bytes_per_elem)
+    base = calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, bytes_per_elem)
     return base / n_kv_groups
 
 
-def kv_bytes_total_mha_swa(
+def calc_kv_bytes_total_mha_swa(
     batch, context_length, emb_dim, n_layers, bytes_per_elem, window, swa_ratio
 ):
     # Split layers into SWA vs Full
@@ -63,16 +63,16 @@ def kv_bytes_total_mha_swa(
     n_swa_layers = int(round(n_layers * (a / total_blocks)))
     n_full_layers = n_layers - n_swa_layers
 
-    total_full = kv_bytes_total_mha(
+    total_full = calc_kv_bytes_total_mha(
         batch, context_length, emb_dim, n_full_layers, bytes_per_elem
     )
-    total_swa = kv_bytes_total_mha(
+    total_swa = calc_kv_bytes_total_mha(
         batch, window, emb_dim, n_swa_layers, bytes_per_elem
     )
     return total_full + total_swa
 
 
-def kv_bytes_total_gqa_swa(
+def calc_kv_bytes_total_gqa_swa(
     batch,
     context_length,
     emb_dim,
@@ -87,7 +87,7 @@ def kv_bytes_total_gqa_swa(
     n_swa_layers = int(round(n_layers * (a / total_blocks)))
     n_full_layers = n_layers - n_swa_layers
 
-    total_full = kv_bytes_total_gqa(
+    total_full = calc_kv_bytes_total_gqa(
         batch,
         context_length,
         emb_dim,
@@ -95,7 +95,7 @@ def kv_bytes_total_gqa_swa(
         bytes_per_elem,
         n_kv_groups,
     )
-    total_swa = kv_bytes_total_gqa(
+    total_swa = calc_kv_bytes_total_gqa(
         batch, window, emb_dim, n_swa_layers, bytes_per_elem, n_kv_groups
     )
     return total_full + total_swa
@@ -144,10 +144,10 @@ def main():
         ] = []
 
     for L in context_lengths:
-        total_mha = kv_bytes_total_mha(
+        total_mha = calc_kv_bytes_total_mha(
             batch_size, L, emb_dim, n_layers, bytes_per_elem
         )
-        total_mha_swa = kv_bytes_total_mha_swa(
+        total_mha_swa = calc_kv_bytes_total_mha_swa(
             batch_size,
             L,
             emb_dim,
@@ -156,16 +156,16 @@ def main():
             window=args.sliding_window_size,
             swa_ratio=args.swa_ratio,
         )
-        series["MHA (KV total)"].append(bytes_to_gb(total_mha))
+        series["MHA (KV total)"].append(convert_bytes_to_gb(total_mha))
         series[
             f"SWA on MHA (ratio {args.swa_ratio}, W={args.sliding_window_size})"
-        ].append(bytes_to_gb(total_mha_swa))
+        ].append(convert_bytes_to_gb(total_mha_swa))
 
         if valid_g4:
-            total_gqa = kv_bytes_total_gqa(
+            total_gqa = calc_kv_bytes_total_gqa(
                 batch_size, L, emb_dim, n_layers, bytes_per_elem, n_kv_groups=kv_groups
             )
-            total_gqa_swa = kv_bytes_total_gqa_swa(
+            total_gqa_swa = calc_kv_bytes_total_gqa_swa(
                 batch_size,
                 L,
                 emb_dim,
@@ -175,10 +175,10 @@ def main():
                 window=args.sliding_window_size,
                 swa_ratio=args.swa_ratio,
             )
-            series["GQA kv_groups=4 (full)"].append(bytes_to_gb(total_gqa))
+            series["GQA kv_groups=4 (full)"].append(convert_bytes_to_gb(total_gqa))
             series[
                 f"SWA on GQA kv_groups=4 (ratio {args.swa_ratio}, W={args.sliding_window_size})"
-            ].append(bytes_to_gb(total_gqa_swa))
+            ].append(convert_bytes_to_gb(total_gqa_swa))
 
     plt.figure(figsize=(10, 5))
     x = np.array(context_lengths, dtype=float)
diff --git a/ch04/07_moe/memory_estimator_moe.py b/ch04/07_moe/memory_estimator_moe.py
index 6fa7839..7c3bb2b 100644
--- a/ch04/07_moe/memory_estimator_moe.py
+++ b/ch04/07_moe/memory_estimator_moe.py
@@ -14,7 +14,7 @@ DTYPE_BYTES = {
 }
 
 
-def bytes_convert(n):
+def convert_bytes(n):
     gb = n / (1000 ** 3)
     return f"{gb:,.2f} GB"
 
@@ -28,19 +28,19 @@ def get_num_param_matrices(ffn_type):
         raise ValueError("--ffn_type must be 'gelu' or 'swiglu'")
 
 
-def ffn_params(emb_dim, hidden_dim, ffn_type):
+def calc_ffn_params(emb_dim, hidden_dim, ffn_type):
     return get_num_param_matrices(ffn_type) * emb_dim * hidden_dim
 
 
-def router_params(emb_dim, num_experts):
+def calc_router_params(emb_dim, num_experts):
     return emb_dim * num_experts
 
 
 def estimate_params_and_hidden(
     emb_dim, hidden_dim, ffn_type, num_experts, match_dense=False
 ):
-    P_dense = ffn_params(emb_dim, hidden_dim, ffn_type)
-    R = router_params(emb_dim, num_experts)
+    P_dense = calc_ffn_params(emb_dim, hidden_dim, ffn_type)
+    R = calc_router_params(emb_dim, num_experts)
 
     if match_dense:
         num_param_matrices = get_num_param_matrices(ffn_type)
@@ -52,7 +52,7 @@ def estimate_params_and_hidden(
     else:
         moe_hidden_dim = hidden_dim
 
-    per_expert_params = ffn_params(emb_dim, moe_hidden_dim, ffn_type)
+    per_expert_params = calc_ffn_params(emb_dim, moe_hidden_dim, ffn_type)
     moe_total = num_experts * per_expert_params + R
 
     return {
@@ -110,15 +110,15 @@ def main():
 
     print("==== Model weights (parameters) ====")
     print(f"{'Dense FFN params':23}: {res['dense_params']:,} "
-          f"({bytes_convert(res['dense_params'] * bytes_per_elem)})")
+          f"({convert_bytes(res['dense_params'] * bytes_per_elem)})")
     print(f"{'Per-expert params':23}: {res['per_expert_params']:,} "
-          f"({bytes_convert(res['per_expert_params'] * bytes_per_elem)})")
+          f"({convert_bytes(res['per_expert_params'] * bytes_per_elem)})")
     print(f"{'Router params':23}: {res['router']:,} "
-          f"({bytes_convert(res['router'] * bytes_per_elem)})")
+          f"({convert_bytes(res['router'] * bytes_per_elem)})")
     print(f"{'MoE TOTAL params':23}: {res['moe_total']:,} "
-          f"({bytes_convert(res['moe_total'] * bytes_per_elem)})")
+          f"({convert_bytes(res['moe_total'] * bytes_per_elem)})")
     print(f"{'MoE ACTIVE/Token':23}: {moe_active_params_per_token:,} "
-          f"({bytes_convert(moe_active_params_per_token * bytes_per_elem)})")
+          f"({convert_bytes(moe_active_params_per_token * bytes_per_elem)})")
     print(f"{'moe_hidden_dim':23}: {res['moe_hidden_dim']}")
     print()
 
diff --git a/ch04/07_moe/plot_memory_estimates_moe.py b/ch04/07_moe/plot_memory_estimates_moe.py
index 089e1c5..10346bb 100644
--- a/ch04/07_moe/plot_memory_estimates_moe.py
+++ b/ch04/07_moe/plot_memory_estimates_moe.py
@@ -6,14 +6,14 @@
 
 import argparse
 import matplotlib.pyplot as plt
-from ffn_moe_memory_estimator import (
+from memory_estimator_moe import (
     estimate_params_and_hidden,
-    ffn_params,
-    router_params,
+    calc_ffn_params,
+    calc_router_params,
 )
 
 
-def moe_active_and_total(
+def calc_moe_active_and_total(
     emb_dim,
     hidden_dim,
     ffn_type,
@@ -22,8 +22,8 @@ def moe_active_and_total(
     match_dense=True,
 ):
     if match_dense:
-        dense_params = ffn_params(emb_dim, hidden_dim, ffn_type)
-        router = router_params(emb_dim, num_experts)
+        dense_params = calc_ffn_params(emb_dim, hidden_dim, ffn_type)
+        router = calc_router_params(emb_dim, num_experts)
         if dense_params <= router:
             match_dense = False
 
@@ -52,11 +52,11 @@ def plot_active_params_vs_experts(
     experts = [1, 2, 4, 8, 16, 32, 64, 128, 192, 256, 384, 512]
     experts = [e for e in experts if e <= max_experts]
 
-    dense_active = ffn_params(emb_dim, hidden_dim, ffn_type)
+    dense_active = calc_ffn_params(emb_dim, hidden_dim, ffn_type)
     moe_active = []
     moe_total = []
     for e in experts:
-        active, total = moe_active_and_total(
+        active, total = calc_moe_active_and_total(
             emb_dim=emb_dim,
             hidden_dim=hidden_dim,
             ffn_type=ffn_type,
diff --git a/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py b/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py
index 2ddcb9d..86d7a1e 100644
--- a/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py
+++ b/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py
@@ -17,21 +17,21 @@ DTYPE_BYTES = {
 }
 
 
-def kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, bytes_per_elem, n_heads):
+def calc_kv_bytes_total_mha(batch, context_length, emb_dim, n_layers, bytes_per_elem, n_heads):
     # Full attention (MHA)
     d_head = emb_dim // n_heads
     per_layer = batch * context_length * n_heads * d_head * 2 * bytes_per_elem
     return per_layer * n_layers
 
 
-def kv_bytes_total_deltanet_no_conv(batch, emb_dim, n_layers, bytes_per_elem, n_heads):
+def calc_kv_bytes_total_deltanet_no_conv(batch, emb_dim, n_layers, bytes_per_elem, n_heads):
     # Simple Gated DeltaNet (no convolutional mixing)
     d_head = emb_dim // n_heads
     per_layer = batch * n_heads * d_head * d_head * bytes_per_elem
     return per_layer * n_layers
 
 
-def gb(x):
+def convert_to_gb(x):
     return x / 1e9
 
 
@@ -52,13 +52,13 @@ def main():
 
     # 1) Full attention only
     mha_bytes = np.array([
-        kv_bytes_total_mha(args.batch, int(t), args.emb_dim, args.n_layers,
-                           bytes_per_elem, args.n_heads)
+        calc_kv_bytes_total_mha(args.batch, int(t), args.emb_dim, args.n_layers,
+                                     bytes_per_elem, args.n_heads)
         for t in ctx
     ], dtype=float)
 
     # 2) DeltaNet only
-    dnet_bytes_const = kv_bytes_total_deltanet_no_conv(
+    dnet_bytes_const = calc_kv_bytes_total_deltanet_no_conv(
         args.batch, args.emb_dim, args.n_layers,
         bytes_per_elem, args.n_heads
     )
@@ -68,17 +68,17 @@ def main():
     n_mha_layers = args.n_layers / 4
     n_dnet_layers = args.n_layers - n_mha_layers
     mix_bytes = np.array([
-        kv_bytes_total_mha(args.batch, int(t), args.emb_dim, n_mha_layers,
-                           bytes_per_elem, args.n_heads)
-        + kv_bytes_total_deltanet_no_conv(args.batch, args.emb_dim, n_dnet_layers,
-                                          bytes_per_elem, args.n_heads)
+        calc_kv_bytes_total_mha(args.batch, int(t), args.emb_dim, n_mha_layers,
+                                     bytes_per_elem, args.n_heads)
+        + calc_kv_bytes_total_deltanet_no_conv(args.batch, args.emb_dim, n_dnet_layers,
+                                                    bytes_per_elem, args.n_heads)
         for t in ctx
     ], dtype=float)
 
     # Convert to GB
-    mha_gb = gb(mha_bytes)
-    dnet_gb = gb(dnet_bytes)
-    mix_gb = gb(mix_bytes)
+    mha_gb = convert_to_gb(mha_bytes)
+    dnet_gb = convert_to_gb(dnet_bytes)
+    mix_gb = convert_to_gb(mix_bytes)
 
     # Plot
     fig, ax = plt.subplots(figsize=(7, 4.5))
diff --git a/ch05/01_main-chapter-code/ch05.ipynb b/ch05/01_main-chapter-code/ch05.ipynb
index 97c2a4d..a80a115 100644
--- a/ch05/01_main-chapter-code/ch05.ipynb
+++ b/ch05/01_main-chapter-code/ch05.ipynb
@@ -101,6 +101,7 @@
     "id": "0d824183-145c-4865-89e1-1f0d0a338f19"
    },
    "source": [
+    "&nbsp;\n",
     "## 5.1 Evaluating generative text models"
    ]
   },
@@ -121,6 +122,7 @@
     "id": "bdc1cf3f-82d8-46c7-9ecc-58979ce87cdd"
    },
    "source": [
+    "&nbsp;\n",
     "### 5.1.1 Using GPT to generate text"
    ]
   },
@@ -253,14 +255,6 @@
     "- The next chapters on finetuning LLMs will also introduce additional ways to measure model quality"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "955f9e1a-7bf7-40d8-b1fa-eacabdee8d8e",
-   "metadata": {},
-   "source": [
-    "<br>"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "0f3d7ea2-637f-4490-bc76-e361fc81ae98",
@@ -268,6 +262,7 @@
     "id": "0f3d7ea2-637f-4490-bc76-e361fc81ae98"
    },
    "source": [
+    "&nbsp;\n",
     "### 5.1.2 Calculating the text generation loss: cross-entropy and perplexity"
    ]
   },
@@ -763,6 +758,7 @@
     "id": "2ec6c217-e429-40c7-ad71-5d0a9da8e487"
    },
    "source": [
+    "&nbsp;\n",
     "### 5.1.3 Calculating the training and validation set losses"
    ]
   },
@@ -1220,6 +1216,7 @@
     "id": "b9339f8d-00cb-4206-af67-58c32bd72055"
    },
    "source": [
+    "&nbsp;\n",
     "## 5.2 Training an LLM"
    ]
   },
@@ -1490,6 +1487,7 @@
     "id": "699f45fc-bf78-42f2-bd24-2355db41b28f"
    },
    "source": [
+    "&nbsp;\n",
     "## 5.3 Decoding strategies to control randomness"
    ]
   },
@@ -1558,6 +1556,7 @@
    "id": "4bb6f380-a798-4fd9-825c-17b7cd29a994",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 5.3.1 Temperature scaling"
    ]
   },
@@ -1837,6 +1836,7 @@
    "id": "c6e4873e-07e4-4abb-85df-bdaedcc1a6f7",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 5.3.2 Top-k sampling"
    ]
   },
@@ -1957,6 +1957,7 @@
    "id": "56056503-a15d-4315-a3ff-46647a4c7c45",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "### 5.3.3 Modifying the text generation function"
    ]
   },
@@ -2054,6 +2055,7 @@
    "id": "4e2002ca-f4c1-48af-9e0a-88bfc163ba0b",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 5.4 Loading and saving model weights in PyTorch"
    ]
   },
@@ -2164,6 +2166,7 @@
    "id": "4194350e-0409-4a63-8ffd-d3a896509032",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## 5.5 Loading pretrained weights from OpenAI"
    ]
   },
@@ -2615,6 +2618,7 @@
    "id": "f2a66474-230d-4180-a8ff-843e04f1f1c4",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## Summary and takeaways"
    ]
   },
diff --git a/ch05/01_main-chapter-code/exercise-solutions.ipynb b/ch05/01_main-chapter-code/exercise-solutions.ipynb
index 8f1f0aa..c831dde 100644
--- a/ch05/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch05/01_main-chapter-code/exercise-solutions.ipynb
@@ -62,7 +62,8 @@
    "id": "5fea8be3-30a1-4623-a6d7-b095c6c1092e",
    "metadata": {},
    "source": [
-    "# Exercise 5.1: Temperature-scaled softmax scores and sampling probabilities"
+    "&nbsp;\n",
+    "## Exercise 5.1: Temperature-scaled softmax scores and sampling probabilities"
    ]
   },
   {
@@ -239,7 +240,8 @@
    "id": "b510ffb0-adca-4d64-8a12-38c4646fd736",
    "metadata": {},
    "source": [
-    "# Exercise 5.2: Different temperature and top-k settings"
+    "&nbsp;\n",
+    "## Exercise 5.2: Different temperature and top-k settings"
    ]
   },
   {
@@ -258,7 +260,8 @@
    "id": "3f35425d-529d-4179-a1c4-63cb8b25b156",
    "metadata": {},
    "source": [
-    "# Exercise 5.3: Deterministic behavior in the decoding functions"
+    "&nbsp;\n",
+    "## Exercise 5.3: Deterministic behavior in the decoding functions"
    ]
   },
   {
@@ -425,7 +428,8 @@
    "id": "6d0480e5-fb4e-41f8-a161-7ac980d71d47",
    "metadata": {},
    "source": [
-    "# Exercise 5.4: Continued pretraining"
+    "&nbsp;\n",
+    "## Exercise 5.4: Continued pretraining"
    ]
   },
   {
@@ -598,7 +602,8 @@
    "id": "3384e788-f5a1-407c-8dd1-87959b75026d",
    "metadata": {},
    "source": [
-    "# Exercise 5.5: Training and validation set losses of the pretrained model"
+    "&nbsp;\n",
+    "## Exercise 5.5: Training and validation set losses of the pretrained model"
    ]
   },
   {
@@ -874,7 +879,8 @@
    "id": "3a76a1e0-9635-480a-9391-3bda7aea402d",
    "metadata": {},
    "source": [
-    "# Exercise 5.6: Trying larger models"
+    "&nbsp;\n",
+    "## Exercise 5.6: Trying larger models"
    ]
   },
   {
@@ -1028,7 +1034,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
index a1fc0ea..d5dd3e9 100644
--- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
+++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
@@ -977,7 +977,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -1001,8 +1001,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -1659,7 +1659,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
index d6bebfd..21ef146 100644
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -1050,7 +1050,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -1074,8 +1074,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -2120,7 +2120,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# Llama 3.1 8B"
+    "# 6. Llama 3.1 8B"
    ]
   },
   {
@@ -2460,7 +2460,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# Llama 3.2 1B"
+    "# 7. Llama 3.2 1B"
    ]
   },
   {
diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
index ab68fff..d10a540 100644
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -492,7 +492,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -516,8 +516,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
diff --git a/ch05/07_gpt_to_llama/tests/test_llama32_nb.py b/ch05/07_gpt_to_llama/tests/test_llama32_nb.py
index 234b84c..c82891e 100644
--- a/ch05/07_gpt_to_llama/tests/test_llama32_nb.py
+++ b/ch05/07_gpt_to_llama/tests/test_llama32_nb.py
@@ -16,7 +16,7 @@ transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "standalone-llama32.ipynb")
     return mod
@@ -51,16 +51,16 @@ def dummy_cfg_base():
 
 
 @torch.inference_mode()
-def test_dummy_llama3_forward(dummy_cfg_base, dummy_input, nb_imports):
+def test_dummy_llama3_forward(dummy_cfg_base, dummy_input, import_notebook_defs):
     torch.manual_seed(123)
-    model = nb_imports.Llama3Model(dummy_cfg_base)
+    model = import_notebook_defs.Llama3Model(dummy_cfg_base)
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"])
 
 
 @torch.inference_mode()
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
-def test_llama3_base_equivalence_with_transformers(nb_imports):
+def test_llama3_base_equivalence_with_transformers(import_notebook_defs):
     from transformers.models.llama import LlamaConfig, LlamaForCausalLM
     cfg = {
         "vocab_size": 257,
@@ -80,7 +80,7 @@ def test_llama3_base_equivalence_with_transformers(nb_imports):
         "dtype": torch.float32,
     }
 
-    ours = nb_imports.Llama3Model(cfg)
+    ours = import_notebook_defs.Llama3Model(cfg)
 
     hf_cfg = LlamaConfig(
         vocab_size=cfg["vocab_size"],
@@ -107,7 +107,7 @@ def test_llama3_base_equivalence_with_transformers(nb_imports):
     theirs = LlamaForCausalLM(hf_cfg)
 
     hf_state = theirs.state_dict()
-    nb_imports.load_weights_into_llama(ours, {"n_layers": cfg["n_layers"], "hidden_dim": cfg["hidden_dim"]}, hf_state)
+    import_notebook_defs.load_weights_into_llama(ours, {"n_layers": cfg["n_layers"], "hidden_dim": cfg["hidden_dim"]}, hf_state)
 
     x = torch.randint(0, cfg["vocab_size"], (2, 8), dtype=torch.long)
     ours_logits = ours(x)
diff --git a/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb b/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb
index c01f5c1..c7cac2c 100644
--- a/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb
@@ -681,7 +681,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -705,8 +705,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -725,7 +725,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# 4. Load pretrained weights"
+    "# 3. Load pretrained weights"
    ]
   },
   {
@@ -1223,7 +1223,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/11_qwen3/standalone-qwen3-moe.ipynb b/ch05/11_qwen3/standalone-qwen3-moe.ipynb
index de10c1a..85f7214 100644
--- a/ch05/11_qwen3/standalone-qwen3-moe.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3-moe.ipynb
@@ -623,7 +623,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -647,8 +647,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -879,7 +879,7 @@
    "metadata": {},
    "source": [
     "&nbsp;\n",
-    "# 4. Load tokenizer"
+    "# 3. Load tokenizer"
    ]
   },
   {
@@ -1016,7 +1016,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# 5. Generate text"
+    "# 4. Generate text"
    ]
   },
   {
diff --git a/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb b/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb
index bbf86ad..a21a0fa 100644
--- a/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb
@@ -734,7 +734,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -758,8 +758,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -977,7 +977,7 @@
    "metadata": {},
    "source": [
     "&nbsp;\n",
-    "# 4. Load tokenizer"
+    "# 3. Load tokenizer"
    ]
   },
   {
@@ -1131,7 +1131,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# 5. Generate text"
+    "# 4. Generate text"
    ]
   },
   {
@@ -1253,7 +1253,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/11_qwen3/standalone-qwen3.ipynb b/ch05/11_qwen3/standalone-qwen3.ipynb
index 1e22787..55d64d2 100644
--- a/ch05/11_qwen3/standalone-qwen3.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3.ipynb
@@ -676,7 +676,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -700,8 +700,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -731,7 +731,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# 4. Load pretrained weights"
+    "# 3. Load pretrained weights"
    ]
   },
   {
@@ -1064,7 +1064,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# 5. Generate text"
+    "# 4. Generate text"
    ]
   },
   {
diff --git a/ch05/11_qwen3/tests/test_qwen3_kvcache_nb.py b/ch05/11_qwen3/tests/test_qwen3_kvcache_nb.py
index 103dcce..73b07d5 100644
--- a/ch05/11_qwen3/tests/test_qwen3_kvcache_nb.py
+++ b/ch05/11_qwen3/tests/test_qwen3_kvcache_nb.py
@@ -16,7 +16,7 @@ transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "standalone-qwen3-plus-kvcache.ipynb")
     return mod
@@ -58,9 +58,9 @@ def dummy_cfg_moe(dummy_cfg_base):
 
 
 @torch.inference_mode()
-def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, nb_imports):
+def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, import_notebook_defs):
     torch.manual_seed(123)
-    model = nb_imports.Qwen3Model(dummy_cfg_base)
+    model = import_notebook_defs.Qwen3Model(dummy_cfg_base)
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"]), \
         f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
@@ -68,7 +68,7 @@ def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, nb_imports):
 
 @torch.inference_mode()
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
-def test_qwen3_base_equivalence_with_transformers(nb_imports):
+def test_qwen3_base_equivalence_with_transformers(import_notebook_defs):
     from transformers import Qwen3Config, Qwen3ForCausalLM
 
     # Tiny config so the test is fast
@@ -89,7 +89,7 @@ def test_qwen3_base_equivalence_with_transformers(nb_imports):
         "dtype": torch.float32,
         "query_pre_attn_scalar": 256,
     }
-    model = nb_imports.Qwen3Model(cfg)
+    model = import_notebook_defs.Qwen3Model(cfg)
 
     hf_cfg = Qwen3Config(
         vocab_size=cfg["vocab_size"],
@@ -114,7 +114,7 @@ def test_qwen3_base_equivalence_with_transformers(nb_imports):
 
     hf_state = hf_model.state_dict()
     param_config = {"n_layers": cfg["n_layers"], "hidden_dim": cfg["hidden_dim"]}
-    nb_imports.load_weights_into_qwen(model, param_config, hf_state)
+    import_notebook_defs.load_weights_into_qwen(model, param_config, hf_state)
 
     x = torch.randint(0, cfg["vocab_size"], (2, cfg["context_length"]), dtype=torch.long)
     ours_logits = model(x)
diff --git a/ch05/11_qwen3/tests/test_qwen3_nb.py b/ch05/11_qwen3/tests/test_qwen3_nb.py
index 2b7ecce..47c5ea3 100644
--- a/ch05/11_qwen3/tests/test_qwen3_nb.py
+++ b/ch05/11_qwen3/tests/test_qwen3_nb.py
@@ -16,7 +16,7 @@ transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "standalone-qwen3.ipynb")
     return mod
@@ -58,9 +58,9 @@ def dummy_cfg_moe(dummy_cfg_base):
 
 
 @torch.inference_mode()
-def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, nb_imports):
+def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, import_notebook_defs):
     torch.manual_seed(123)
-    model = nb_imports.Qwen3Model(dummy_cfg_base)
+    model = import_notebook_defs.Qwen3Model(dummy_cfg_base)
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"]), \
         f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
@@ -68,7 +68,7 @@ def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input, nb_imports):
 
 @torch.inference_mode()
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
-def test_qwen3_base_equivalence_with_transformers(nb_imports):
+def test_qwen3_base_equivalence_with_transformers(import_notebook_defs):
     from transformers import Qwen3Config, Qwen3ForCausalLM
 
     # Tiny config so the test is fast
@@ -89,7 +89,7 @@ def test_qwen3_base_equivalence_with_transformers(nb_imports):
         "dtype": torch.float32,
         "query_pre_attn_scalar": 256,
     }
-    model = nb_imports.Qwen3Model(cfg)
+    model = import_notebook_defs.Qwen3Model(cfg)
 
     hf_cfg = Qwen3Config(
         vocab_size=cfg["vocab_size"],
@@ -114,7 +114,7 @@ def test_qwen3_base_equivalence_with_transformers(nb_imports):
 
     hf_state = hf_model.state_dict()
     param_config = {"n_layers": cfg["n_layers"], "hidden_dim": cfg["hidden_dim"]}
-    nb_imports.load_weights_into_qwen(model, param_config, hf_state)
+    import_notebook_defs.load_weights_into_qwen(model, param_config, hf_state)
 
     x = torch.randint(0, cfg["vocab_size"], (2, cfg["context_length"]), dtype=torch.long)
     ours_logits = model(x)
diff --git a/ch05/12_gemma3/standalone-gemma3-plus-kvcache.ipynb b/ch05/12_gemma3/standalone-gemma3-plus-kvcache.ipynb
index a90783e..583d6ee 100644
--- a/ch05/12_gemma3/standalone-gemma3-plus-kvcache.ipynb
+++ b/ch05/12_gemma3/standalone-gemma3-plus-kvcache.ipynb
@@ -771,7 +771,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -795,8 +795,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -1120,7 +1120,7 @@
    "metadata": {},
    "source": [
     "&nbsp;\n",
-    "# 4. Load tokenizer"
+    "# 3. Load tokenizer"
    ]
   },
   {
@@ -1307,10 +1307,10 @@
     "    )\n",
     "\n",
     "if torch.cuda.is_available():\n",
-    "    def gpu_gb(x):\n",
+    "    def calc_gpu_gb(x):\n",
     "        return f\"{x / 1024 / 1024 / 1024:.2f} GB\"\n",
     "    \n",
-    "    print(f\"\\n\\nGPU memory used: {gpu_gb(torch.cuda.max_memory_allocated())}\")"
+    "    print(f\"\\n\\nGPU memory used: {calc_gpu_gb(torch.cuda.max_memory_allocated())}\")"
    ]
   },
   {
@@ -1358,7 +1358,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/12_gemma3/standalone-gemma3.ipynb b/ch05/12_gemma3/standalone-gemma3.ipynb
index 6e5d870..16a1088 100644
--- a/ch05/12_gemma3/standalone-gemma3.ipynb
+++ b/ch05/12_gemma3/standalone-gemma3.ipynb
@@ -695,7 +695,7 @@
     }
    ],
    "source": [
-    "def model_memory_size(model, input_dtype=torch.float32):\n",
+    "def calc_model_memory_size(model, input_dtype=torch.float32):\n",
     "    total_params = 0\n",
     "    total_grads = 0\n",
     "    for param in model.parameters():\n",
@@ -719,8 +719,8 @@
     "\n",
     "    return total_memory_gb\n",
     "\n",
-    "print(f\"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
-    "print(f\"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
+    "print(f\"float32 (PyTorch default): {calc_model_memory_size(model, input_dtype=torch.float32):.2f} GB\")\n",
+    "print(f\"bfloat16: {calc_model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB\")"
    ]
   },
   {
@@ -1005,7 +1005,7 @@
    "metadata": {},
    "source": [
     "&nbsp;\n",
-    "# 4. Load tokenizer"
+    "# 3. Load tokenizer"
    ]
   },
   {
@@ -1172,10 +1172,10 @@
     "    )\n",
     "\n",
     "if torch.cuda.is_available():\n",
-    "    def gpu_gb(x):\n",
+    "    def calc_gpu_gb(x):\n",
     "        return f\"{x / 1024 / 1024 / 1024:.2f} GB\"\n",
     "    \n",
-    "    print(f\"\\n\\nGPU memory used: {gpu_gb(torch.cuda.max_memory_allocated())}\")"
+    "    print(f\"\\n\\nGPU memory used: {calc_gpu_gb(torch.cuda.max_memory_allocated())}\")"
    ]
   },
   {
@@ -1223,7 +1223,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/12_gemma3/tests/test_gemma3_kv_nb.py b/ch05/12_gemma3/tests/test_gemma3_kv_nb.py
index ca2f857..877c1cc 100644
--- a/ch05/12_gemma3/tests/test_gemma3_kv_nb.py
+++ b/ch05/12_gemma3/tests/test_gemma3_kv_nb.py
@@ -16,7 +16,7 @@ transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "standalone-gemma3-plus-kvcache.ipynb")
     return mod
@@ -50,16 +50,16 @@ def dummy_cfg_base():
 
 
 @torch.inference_mode()
-def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, nb_imports):
+def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, import_notebook_defs):
     torch.manual_seed(123)
-    model = nb_imports.Gemma3Model(dummy_cfg_base)
+    model = import_notebook_defs.Gemma3Model(dummy_cfg_base)
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"])
 
 
 @torch.inference_mode()
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
-def test_gemma3_base_equivalence_with_transformers(nb_imports):
+def test_gemma3_base_equivalence_with_transformers(import_notebook_defs):
     from transformers import Gemma3TextConfig, Gemma3ForCausalLM
 
     # Tiny config so the test is fast
@@ -80,7 +80,7 @@ def test_gemma3_base_equivalence_with_transformers(nb_imports):
         "dtype": torch.float32,
         "query_pre_attn_scalar": 256,
     }
-    model = nb_imports.Gemma3Model(cfg)
+    model = import_notebook_defs.Gemma3Model(cfg)
 
     hf_cfg = Gemma3TextConfig(
         vocab_size=cfg["vocab_size"],
@@ -105,7 +105,7 @@ def test_gemma3_base_equivalence_with_transformers(nb_imports):
 
     hf_state = hf_model.state_dict()
     param_config = {"n_layers": cfg["n_layers"], "hidden_dim": cfg["hidden_dim"]}
-    nb_imports.load_weights_into_gemma(model, param_config, hf_state)
+    import_notebook_defs.load_weights_into_gemma(model, param_config, hf_state)
 
     x = torch.randint(0, cfg["vocab_size"], (2, cfg["context_length"]), dtype=torch.long)
     ours_logits = model(x)
diff --git a/ch05/12_gemma3/tests/test_gemma3_nb.py b/ch05/12_gemma3/tests/test_gemma3_nb.py
index dd928b9..0fc8d84 100644
--- a/ch05/12_gemma3/tests/test_gemma3_nb.py
+++ b/ch05/12_gemma3/tests/test_gemma3_nb.py
@@ -16,7 +16,7 @@ transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "standalone-gemma3.ipynb")
     return mod
@@ -50,16 +50,16 @@ def dummy_cfg_base():
 
 
 @torch.inference_mode()
-def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, nb_imports):
+def test_dummy_gemma3_forward(dummy_cfg_base, dummy_input, import_notebook_defs):
     torch.manual_seed(123)
-    model = nb_imports.Gemma3Model(dummy_cfg_base)
+    model = import_notebook_defs.Gemma3Model(dummy_cfg_base)
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"])
 
 
 @torch.inference_mode()
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
-def test_gemma3_base_equivalence_with_transformers(nb_imports):
+def test_gemma3_base_equivalence_with_transformers(import_notebook_defs):
     from transformers import Gemma3TextConfig, Gemma3ForCausalLM
 
     # Tiny config so the test is fast
@@ -80,7 +80,7 @@ def test_gemma3_base_equivalence_with_transformers(nb_imports):
         "dtype": torch.float32,
         "query_pre_attn_scalar": 256,
     }
-    model = nb_imports.Gemma3Model(cfg)
+    model = import_notebook_defs.Gemma3Model(cfg)
 
     hf_cfg = Gemma3TextConfig(
         vocab_size=cfg["vocab_size"],
@@ -105,7 +105,7 @@ def test_gemma3_base_equivalence_with_transformers(nb_imports):
 
     hf_state = hf_model.state_dict()
     param_config = {"n_layers": cfg["n_layers"], "hidden_dim": cfg["hidden_dim"]}
-    nb_imports.load_weights_into_gemma(model, param_config, hf_state)
+    import_notebook_defs.load_weights_into_gemma(model, param_config, hf_state)
 
     x = torch.randint(0, cfg["vocab_size"], (2, cfg["context_length"]), dtype=torch.long)
     ours_logits = model(x)
diff --git a/ch05/13_olmo3/standalone-olmo3-plus-kv-cache.ipynb b/ch05/13_olmo3/standalone-olmo3-plus-kv-cache.ipynb
index e9cfa07..dde815b 100644
--- a/ch05/13_olmo3/standalone-olmo3-plus-kv-cache.ipynb
+++ b/ch05/13_olmo3/standalone-olmo3-plus-kv-cache.ipynb
@@ -904,7 +904,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# 4. Load pretrained weights"
+    "# 3. Load pretrained weights"
    ]
   },
   {
@@ -1269,10 +1269,10 @@
     "    )\n",
     "\n",
     "if torch.cuda.is_available():\n",
-    "    def gpu_gb(x):\n",
+    "    def calc_gpu_gb(x):\n",
     "        return f\"{x / 1024 / 1024 / 1024:.2f} GB\"\n",
     "    \n",
-    "    print(f\"\\n\\nGPU memory used: {gpu_gb(torch.cuda.max_memory_allocated())}\")"
+    "    print(f\"\\n\\nGPU memory used: {calc_gpu_gb(torch.cuda.max_memory_allocated())}\")"
    ]
   },
   {
@@ -1320,7 +1320,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/13_olmo3/standalone-olmo3.ipynb b/ch05/13_olmo3/standalone-olmo3.ipynb
index ed8885f..ab615a4 100644
--- a/ch05/13_olmo3/standalone-olmo3.ipynb
+++ b/ch05/13_olmo3/standalone-olmo3.ipynb
@@ -801,7 +801,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "# 4. Load pretrained weights"
+    "# 3. Load pretrained weights"
    ]
   },
   {
@@ -1160,10 +1160,10 @@
     "    )\n",
     "\n",
     "if torch.cuda.is_available():\n",
-    "    def gpu_gb(x):\n",
+    "    def calc_gpu_gb(x):\n",
     "        return f\"{x / 1024 / 1024 / 1024:.2f} GB\"\n",
     "    \n",
-    "    print(f\"\\n\\nGPU memory used: {gpu_gb(torch.cuda.max_memory_allocated())}\")"
+    "    print(f\"\\n\\nGPU memory used: {calc_gpu_gb(torch.cuda.max_memory_allocated())}\")"
    ]
   },
   {
@@ -1211,7 +1211,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch05/13_olmo3/tests/olmo3_layer_debugger.py b/ch05/13_olmo3/tests/olmo3_layer_debugger.py
index 7f4664b..920ec5b 100644
--- a/ch05/13_olmo3/tests/olmo3_layer_debugger.py
+++ b/ch05/13_olmo3/tests/olmo3_layer_debugger.py
@@ -116,11 +116,11 @@ def load_notebook_defs(nb_name="standalone-olmo3.ipynb"):
     return import_definitions_from_notebook(nb_dir, nb_name)
 
 
-def build_olmo3_pair(nb_imports, cfg, hf_checkpoint=None):
+def build_olmo3_pair(import_notebook_defs, cfg, hf_checkpoint=None):
     if Olmo3ForCausalLM is None:
         raise ImportError("transformers is required for the Olmo-3 debugger.")
 
-    ours = nb_imports.Olmo3Model(cfg)
+    ours = import_notebook_defs.Olmo3Model(cfg)
     hf_cfg = _hf_config_from_dict(cfg)
 
     if hf_checkpoint:
@@ -133,7 +133,7 @@ def build_olmo3_pair(nb_imports, cfg, hf_checkpoint=None):
         hf_model = Olmo3ForCausalLM(hf_cfg)
 
     param_config = {"n_layers": cfg["n_layers"], "hidden_dim": cfg["hidden_dim"]}
-    nb_imports.load_weights_into_olmo(ours, param_config, hf_model.state_dict())
+    import_notebook_defs.load_weights_into_olmo(ours, param_config, hf_model.state_dict())
 
     ours.eval()
     hf_model.eval()
@@ -271,10 +271,10 @@ if __name__ == "__main__":
     if not transformers_available:
         raise SystemExit("transformers is not installed; install it to run the debugger.")
 
-    nb_imports = load_notebook_defs()
+    import_notebook_defs = load_notebook_defs()
     cfg = yarn_debug_config()
 
-    ours_model, hf_model = build_olmo3_pair(nb_imports, cfg)
+    ours_model, hf_model = build_olmo3_pair(import_notebook_defs, cfg)
     torch.manual_seed(0)
     input_ids = torch.randint(0, cfg["vocab_size"], (1, cfg["context_length"]), dtype=torch.long)
     diffs = layerwise_differences(ours_model, hf_model, input_ids)
diff --git a/ch05/13_olmo3/tests/test_olmo3_kvcache_nb.py b/ch05/13_olmo3/tests/test_olmo3_kvcache_nb.py
index 5675e0e..c7efe78 100644
--- a/ch05/13_olmo3/tests/test_olmo3_kvcache_nb.py
+++ b/ch05/13_olmo3/tests/test_olmo3_kvcache_nb.py
@@ -16,7 +16,7 @@ transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "standalone-olmo3-plus-kv-cache.ipynb")
     return mod
@@ -55,9 +55,9 @@ def dummy_cfg_base():
     }
 
 @torch.inference_mode()
-def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, nb_imports):
+def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, import_notebook_defs):
     torch.manual_seed(123)
-    model = nb_imports.Olmo3Model(dummy_cfg_base)
+    model = import_notebook_defs.Olmo3Model(dummy_cfg_base)
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"]), \
         f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
@@ -65,7 +65,7 @@ def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, nb_imports):
 
 @torch.inference_mode()
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
-def test_olmo3_base_equivalence_with_transformers(nb_imports):
+def test_olmo3_base_equivalence_with_transformers(import_notebook_defs):
     from transformers import Olmo3Config, Olmo3ForCausalLM
 
     # Tiny config so the test is fast
@@ -99,7 +99,7 @@ def test_olmo3_base_equivalence_with_transformers(nb_imports):
         "rope_local_base": 10_000.0,
     }
 
-    model = nb_imports.Olmo3Model(cfg)
+    model = import_notebook_defs.Olmo3Model(cfg)
 
     hf_cfg = Olmo3Config(
         vocab_size=cfg["vocab_size"],
@@ -129,7 +129,7 @@ def test_olmo3_base_equivalence_with_transformers(nb_imports):
         "n_layers": cfg["n_layers"],
         "hidden_dim": cfg["hidden_dim"],
     }
-    nb_imports.load_weights_into_olmo(model, param_config, hf_state)
+    import_notebook_defs.load_weights_into_olmo(model, param_config, hf_state)
 
     x = torch.randint(
         0,
diff --git a/ch05/13_olmo3/tests/test_olmo3_nb.py b/ch05/13_olmo3/tests/test_olmo3_nb.py
index fa528dc..1ad5900 100644
--- a/ch05/13_olmo3/tests/test_olmo3_nb.py
+++ b/ch05/13_olmo3/tests/test_olmo3_nb.py
@@ -16,7 +16,7 @@ transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
 @pytest.fixture
-def nb_imports():
+def import_notebook_defs():
     nb_dir = Path(__file__).resolve().parents[1]
     mod = import_definitions_from_notebook(nb_dir, "standalone-olmo3.ipynb")
     return mod
@@ -55,9 +55,9 @@ def dummy_cfg_base():
     }
 
 @torch.inference_mode()
-def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, nb_imports):
+def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, import_notebook_defs):
     torch.manual_seed(123)
-    model = nb_imports.Olmo3Model(dummy_cfg_base)
+    model = import_notebook_defs.Olmo3Model(dummy_cfg_base)
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"]), \
         f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
@@ -65,7 +65,7 @@ def test_dummy_olmo3_forward(dummy_cfg_base, dummy_input, nb_imports):
 
 @torch.inference_mode()
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
-def test_olmo3_base_equivalence_with_transformers(nb_imports):
+def test_olmo3_base_equivalence_with_transformers(import_notebook_defs):
     from transformers import Olmo3Config, Olmo3ForCausalLM
 
     # Tiny config so the test is fast
@@ -99,7 +99,7 @@ def test_olmo3_base_equivalence_with_transformers(nb_imports):
         "rope_local_base": 10_000.0,
     }
 
-    model = nb_imports.Olmo3Model(cfg)
+    model = import_notebook_defs.Olmo3Model(cfg)
 
     hf_cfg = Olmo3Config(
         vocab_size=cfg["vocab_size"],
@@ -129,7 +129,7 @@ def test_olmo3_base_equivalence_with_transformers(nb_imports):
         "n_layers": cfg["n_layers"],
         "hidden_dim": cfg["hidden_dim"],
     }
-    nb_imports.load_weights_into_olmo(model, param_config, hf_state)
+    import_notebook_defs.load_weights_into_olmo(model, param_config, hf_state)
 
     x = torch.randint(
         0,
diff --git a/ch06/01_main-chapter-code/ch06.ipynb b/ch06/01_main-chapter-code/ch06.ipynb
index 22b3c88..9b38045 100644
--- a/ch06/01_main-chapter-code/ch06.ipynb
+++ b/ch06/01_main-chapter-code/ch06.ipynb
@@ -86,7 +86,8 @@
     "id": "3a84cf35-b37f-4c15-8972-dfafc9fadc1c"
    },
    "source": [
-    "## 6.1 Different categories of finetuning"
+    "&nbsp;\n",
+    "### 6.1 Different categories of finetuning"
    ]
   },
   {
@@ -142,7 +143,8 @@
     "id": "8c7017a2-32aa-4002-a2f3-12aac293ccdf"
    },
    "source": [
-    "## 6.2 Preparing the dataset"
+    "&nbsp;\n",
+    "### 6.2 Preparing the dataset"
    ]
   },
   {
@@ -699,7 +701,8 @@
    "id": "a8d7a0c5-1d5f-458a-b685-3f49520b0094",
    "metadata": {},
    "source": [
-    "## 6.3 Creating data loaders"
+    "&nbsp;\n",
+    "### 6.3 Creating data loaders"
    ]
   },
   {
@@ -1019,7 +1022,8 @@
     "id": "d1c4f61a-5f5d-4b3b-97cf-151b617d1d6c"
    },
    "source": [
-    "## 6.4 Initializing a model with pretrained weights"
+    "&nbsp;\n",
+    "### 6.4 Initializing a model with pretrained weights"
    ]
   },
   {
@@ -1219,7 +1223,8 @@
     "id": "4c9ae440-32f9-412f-96cf-fd52cc3e2522"
    },
    "source": [
-    "## 6.5 Adding a classification head"
+    "&nbsp;\n",
+    "### 6.5 Adding a classification head"
    ]
   },
   {
@@ -1722,7 +1727,8 @@
    "id": "32aa4aef-e1e9-491b-9adf-5aa973e59b8c",
    "metadata": {},
    "source": [
-    "## 6.6 Calculating the classification loss and accuracy"
+    "&nbsp;\n",
+    "### 6.6 Calculating the classification loss and accuracy"
    ]
   },
   {
@@ -2042,7 +2048,8 @@
     "id": "456ae0fd-6261-42b4-ab6a-d24289953083"
    },
    "source": [
-    "## 6.7 Finetuning the model on supervised data"
+    "&nbsp;\n",
+    "### 6.7 Finetuning the model on supervised data"
    ]
   },
   {
@@ -2372,7 +2379,8 @@
    "id": "a74d9ad7-3ec1-450e-8c9f-4fc46d3d5bb0",
    "metadata": {},
    "source": [
-    "## 6.8 Using the LLM as a spam classifier"
+    "&nbsp;\n",
+    "### 6.8 Using the LLM as a spam classifier"
    ]
   },
   {
@@ -2564,6 +2572,7 @@
     "id": "5b70ac71-234f-4eeb-b33d-c62726d50cd4"
    },
    "source": [
+    "&nbsp;\n",
     "## Summary and takeaways"
    ]
   },
diff --git a/ch06/02_bonus_additional-experiments/additional_experiments.py b/ch06/02_bonus_additional-experiments/additional_experiments.py
index 5d518cb..a600234 100644
--- a/ch06/02_bonus_additional-experiments/additional_experiments.py
+++ b/ch06/02_bonus_additional-experiments/additional_experiments.py
@@ -130,20 +130,20 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
     print(f"File downloaded and saved as {new_file_path}")
 
 
-def random_split(df, train_frac, validation_frac):
+def random_split(df, train_frac, val_frac):
     # Shuffle the entire DataFrame
     df = df.sample(frac=1, random_state=123).reset_index(drop=True)
 
     # Calculate split indices
     train_end = int(len(df) * train_frac)
-    validation_end = train_end + int(len(df) * validation_frac)
+    val_end = train_end + int(len(df) * val_frac)
 
     # Split the DataFrame
     train_df = df[:train_end]
-    validation_df = df[train_end:validation_end]
-    test_df = df[validation_end:]
+    val_df = df[train_end:val_end]
+    test_df = df[val_end:]
 
-    return train_df, validation_df, test_df
+    return train_df, val_df, test_df
 
 
 def create_dataset_csvs(new_file_path):
@@ -157,9 +157,9 @@ def create_dataset_csvs(new_file_path):
     balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
 
     # Sample and save csv files
-    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
+    train_df, val_df, test_df = random_split(balanced_df, 0.7, 0.1)
     train_df.to_csv("train.csv", index=None)
-    validation_df.to_csv("validation.csv", index=None)
+    val_df.to_csv("validation.csv", index=None)
     test_df.to_csv("test.csv", index=None)
 
 
@@ -611,7 +611,7 @@ if __name__ == "__main__":
     base_path = Path(".")
     file_names = ["train.csv", "validation.csv", "test.csv"]
     all_exist = all((base_path / file_name).exists() for file_name in file_names)
-    
+
     if not all_exist:
         try:
             download_and_unzip(url, zip_path, extract_to, new_file_path)
diff --git a/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb b/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb
index 4529c81..bcffc83 100644
--- a/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb
+++ b/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb
@@ -144,6 +144,7 @@
    "id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d",
    "metadata": {},
    "source": [
+    "&nbsp;\n",
     "## Scikit-learn baseline"
    ]
   },
@@ -269,7 +270,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
index cf9d56c..89bbfd9 100644
--- a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
+++ b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
@@ -79,20 +79,20 @@ def download_and_unzip(url, zip_path, extract_to, new_file_path):
     print(f"File downloaded and saved as {new_file_path}")
 
 
-def random_split(df, train_frac, validation_frac):
+def random_split(df, train_frac, val_frac):
     # Shuffle the entire DataFrame
     df = df.sample(frac=1, random_state=123).reset_index(drop=True)
 
     # Calculate split indices
     train_end = int(len(df) * train_frac)
-    validation_end = train_end + int(len(df) * validation_frac)
+    val_end = train_end + int(len(df) * val_frac)
 
     # Split the DataFrame
     train_df = df[:train_end]
-    validation_df = df[train_end:validation_end]
-    test_df = df[validation_end:]
+    val_df = df[train_end:val_end]
+    test_df = df[val_end:]
 
-    return train_df, validation_df, test_df
+    return train_df, val_df, test_df
 
 
 def create_dataset_csvs(new_file_path):
@@ -106,9 +106,9 @@ def create_dataset_csvs(new_file_path):
     balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
 
     # Sample and save csv files
-    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
+    train_df, val_df, test_df = random_split(balanced_df, 0.7, 0.1)
     train_df.to_csv("train.csv", index=None)
-    validation_df.to_csv("validation.csv", index=None)
+    val_df.to_csv("validation.csv", index=None)
     test_df.to_csv("test.csv", index=None)
 
 
diff --git a/ch07/01_main-chapter-code/ch07.ipynb b/ch07/01_main-chapter-code/ch07.ipynb
index 9e3ea30..4faafb1 100644
--- a/ch07/01_main-chapter-code/ch07.ipynb
+++ b/ch07/01_main-chapter-code/ch07.ipynb
@@ -89,6 +89,7 @@
     "id": "8bbc68e9-75b3-41f1-ac2c-e071c3cd0813"
    },
    "source": [
+    "&nbsp;\n",
     "## 7.1 Introduction to instruction finetuning"
    ]
   },
@@ -133,6 +134,7 @@
     "id": "5384f0cf-ef3c-4436-a5fa-59bd25649f86"
    },
    "source": [
+    "&nbsp;\n",
     "## 7.2 Preparing a dataset for supervised instruction finetuning"
    ]
   },
@@ -499,6 +501,7 @@
     "id": "fcaaf606-f913-4445-8301-632ae10d387d"
    },
    "source": [
+    "&nbsp;\n",
     "## 7.3 Organizing data into training batches"
    ]
   },
@@ -1492,6 +1495,7 @@
     "id": "d6aad445-8f19-4238-b9bf-db80767fb91a"
    },
    "source": [
+    "&nbsp;\n",
     "## 7.5 Loading a pretrained LLM"
    ]
   },
@@ -1724,6 +1728,7 @@
     "id": "70d27b9d-a942-4cf5-b797-848c5f01e723"
    },
    "source": [
+    "&nbsp;\n",
     "## 7.6 Finetuning the LLM on instruction data"
    ]
   },
@@ -1995,6 +2000,7 @@
     "id": "87b79a47-13f9-4d1f-87b1-3339bafaf2a3"
    },
    "source": [
+    "&nbsp;\n",
     "## 7.7 Extracting and saving responses"
    ]
   },
@@ -2251,6 +2257,7 @@
     "id": "obgoGI89dgPm"
    },
    "source": [
+    "&nbsp;\n",
     "## 7.8 Evaluating the finetuned LLM"
    ]
   },
@@ -2847,6 +2854,7 @@
     "id": "tIbNMluCDjVM"
    },
    "source": [
+    "&nbsp;\n",
     "### 7.9.1 What's next\n",
     "\n",
     "- This marks the final chapter of this book\n",
@@ -2857,12 +2865,26 @@
     "- An optional step that is sometimes followed after instruction finetuning, as described in this chapter, is preference finetuning\n",
     "- Preference finetuning process can be particularly useful for customizing a model to better align with specific user preferences; see the [../04_preference-tuning-with-dpo](../04_preference-tuning-with-dpo) folder if you are interested in this\n",
     "\n",
-    "- This GitHub repository also contains a large selection of additional bonus material you may enjoy; for more information, please see the [Bonus Material](https://github.com/rasbt/LLMs-from-scratch?tab=readme-ov-file#bonus-material) section on this repository's README page\n",
-    "\n",
+    "- This GitHub repository also contains a large selection of additional bonus material you may enjoy; for more information, please see the [Bonus Material](https://github.com/rasbt/LLMs-from-scratch?tab=readme-ov-file#bonus-material) section on this repository's README page"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0e2b7bc2-2e8d-483f-a8f5-e2aa093db189",
+   "metadata": {},
+   "source": [
+    "&nbsp;\n",
     "### 7.9.2 Staying up to date in a fast-moving field\n",
     "\n",
-    "- No code in this section\n",
-    "\n",
+    "- No code in this section"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e3d8327d-afb5-4d24-88af-e253889251cf",
+   "metadata": {},
+   "source": [
+    "&nbsp;\n",
     "### 7.9.3 Final words\n",
     "\n",
     "- I hope you enjoyed this journey of implementing an LLM from the ground up and coding the pretraining and finetuning functions\n",
diff --git a/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb b/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb
index c5029e6..818c805 100644
--- a/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb
+++ b/ch07/02_dataset-utilities/create-passive-voice-entries.ipynb
@@ -88,7 +88,8 @@
    "id": "8bcdcb34-ac75-4f4f-9505-3ce0666c42d5",
    "metadata": {},
    "source": [
-    "## Test OpenAI API"
+    "&nbsp;\n",
+    "## 1. Test OpenAI API"
    ]
   },
   {
@@ -177,7 +178,8 @@
    "id": "162a4739-6f03-4092-a5c2-f57a0b6a4c4d",
    "metadata": {},
    "source": [
-    "## Create JSON Entries"
+    "&nbsp;\n",
+    "## 2. Create JSON Entries"
    ]
   },
   {
@@ -418,7 +420,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,
diff --git a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb
index 7411d34..830f6f8 100644
--- a/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb
+++ b/ch07/04_preference-tuning-with-dpo/dpo-from-scratch.ipynb
@@ -1303,7 +1303,7 @@
    },
    "source": [
     "&nbsp;\n",
-    "## 2.4) Creating training, validation, and test set data loaders"
+    "## 2.4 Creating training, validation, and test set data loaders"
    ]
   },
   {
diff --git a/ch07/05_dataset-generation/reflection-gpt4.ipynb b/ch07/05_dataset-generation/reflection-gpt4.ipynb
index ce24d27..db9e376 100644
--- a/ch07/05_dataset-generation/reflection-gpt4.ipynb
+++ b/ch07/05_dataset-generation/reflection-gpt4.ipynb
@@ -306,7 +306,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def instr_prompt_no_input(ins, outp):\n",
+    "def build_instruction_reflection_prompt_no_input(ins, outp):\n",
     "\n",
     "    sys_prompt = \"You are a helpful, precise but picky assistant for checking the quality of a given instruction.\"\n",
     "    prompt_template = \"[Instruction]\\n{ins}\\n\\n[The Start of Answer]\\n{outp}\\n\\n[The End of Answer]\\n\\n[System]\\n{criteria}\\n\\n\"\n",
@@ -356,7 +356,7 @@
    "id": "9572a1aa-532a-4a76-9fa3-3b59d996ba13",
    "metadata": {},
    "source": [
-    "- We can refine the instruction as follows, using `instr_prompt_no_input` function defined above:"
+    "- We can refine the instruction as follows, using `build_instruction_reflection_prompt_no_input` function defined above:"
    ]
   },
   {
@@ -405,7 +405,7 @@
    "source": [
     "entry = json_data[2]\n",
     "\n",
-    "system_prompt, prompt = instr_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
+    "system_prompt, prompt = build_instruction_reflection_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
     "output = run_chatgpt(prompt=prompt, client=client, system_prompt=system_prompt)\n",
     "\n",
     "print(output)"
@@ -430,7 +430,7 @@
    "source": [
     "import re\n",
     "\n",
-    "def extract_ins(text, no_input=True):\n",
+    "def extract_instruction_segment(text, no_input=True):\n",
     "    if '[New Instruction]' in text:\n",
     "        pattern = r'(\\[New Instruction\\])(.*?)(\\[End\\]|\\[New Answer\\]|New Answer:)'\n",
     "    else:\n",
@@ -445,7 +445,7 @@
     "    return seg_ins\n",
     "\n",
     "\n",
-    "def extract_oup(text, no_input=True):\n",
+    "def extract_output_segment(text, no_input=True):\n",
     "    if '[New Answer]' in text:\n",
     "        pattern = r'(\\[New Answer\\])(.*?)(\\[End\\]|$)'\n",
     "    else:\n",
@@ -462,8 +462,8 @@
     "def extract_instruction(text):\n",
     "    if text == '':\n",
     "        return []\n",
-    "    seg_ins = extract_ins(text, no_input=True)\n",
-    "    seg_oup = extract_oup(text, no_input=True)\n",
+    "    seg_ins = extract_instruction_segment(text, no_input=True)\n",
+    "    seg_oup = extract_output_segment(text, no_input=True)\n",
     "    return [seg_ins, seg_oup]"
    ]
   },
@@ -561,7 +561,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def res_gen_prompt_no_input(ins, outp):\n",
+    "def build_response_reflection_prompt_no_input(ins, outp):\n",
     "\n",
     "    sys_prompt = \"You are a helpful, precise but picky assistant for checking the quality of the answer to a given instruction.\"\n",
     "    prompt_template = \"[Instruction]\\n{ins}\\n\\n[The Start of Answer]\\n{outp}\\n\\n[The End of Answer]\\n\\n[System]\\n{criteria}\\n\\n\"\n",
@@ -574,7 +574,7 @@
     "    return sys_prompt, prompt\n",
     "\n",
     "\n",
-    "def res_gen_prompt_input(ins, inp, outp):\n",
+    "def build_response_reflection_prompt_with_input(ins, inp, outp):\n",
     "\n",
     "    sys_prompt = \"You are a helpful and precise assistant for checking the quality of the answer to a given instruction and its input.\"\n",
     "    prompt_template = \"[Instruction]\\n{ins}\\n\\n[The Start of Input]\\n{inp}\\n\\n[The End of Input]\\n\\n[The Start of Answer]\\n{outp}\\n\\n[The End of Answer]\\n\\n[System]\\n{criteria}\\n\\n\"\n",
@@ -626,7 +626,7 @@
    "source": [
     "entry = json_data[2]\n",
     "\n",
-    "system_prompt, prompt = res_gen_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
+    "system_prompt, prompt = build_response_reflection_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
     "output = run_chatgpt(prompt=prompt, client=client, system_prompt=system_prompt)\n",
     "\n",
     "print(output)"
@@ -750,7 +750,7 @@
     "    for entry in tqdm(json_data):\n",
     "        \n",
     "        if not entry[\"input\"]:\n",
-    "            system_prompt, prompt = instr_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
+    "            system_prompt, prompt = build_instruction_reflection_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
     "            output = run_chatgpt(prompt=prompt, client=client, system_prompt=system_prompt)\n",
     "            new_instr, new_outp = extract_instruction(output)\n",
     "            new_entry = {\"instruction\": new_instr, \"input\": \"\", \"output\": new_outp}\n",
@@ -906,7 +906,7 @@
     "    for entry in tqdm(json_data):\n",
     "        \n",
     "        if not entry[\"input\"]:\n",
-    "            system_prompt, prompt = res_gen_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
+    "            system_prompt, prompt = build_response_reflection_prompt_no_input(ins=entry[\"instruction\"], outp=entry[\"output\"])\n",
     "            output = run_chatgpt(prompt=prompt, client=client, system_prompt=system_prompt)\n",
     "            new_response = extract_response(output)\n",
     "\n",
@@ -917,7 +917,7 @@
     "            new_json_data.append(new_entry)\n",
     "\n",
     "        else:\n",
-    "            system_prompt, prompt = res_gen_prompt_input(ins=entry[\"instruction\"], inp=entry[\"input\"], outp=entry[\"output\"])\n",
+    "            system_prompt, prompt = build_response_reflection_prompt_with_input(ins=entry[\"instruction\"], inp=entry[\"input\"], outp=entry[\"output\"])\n",
     "            output = run_chatgpt(prompt=prompt, client=client, system_prompt=system_prompt)\n",
     "            new_response = extract_response(output)\n",
     "\n",