Numerically stable generate on mps (#849)

* Numerically stable generate on mps * add file
2026-04-10 12:33:42 +00:00 · 2025-09-26 22:42:44 -05:00
parent f492c949d3
commit 9bc827ea7e
8 changed files with 35 additions and 2 deletions
--- a/ch05/01_main-chapter-code/ch05.ipynb
+++ b/ch05/01_main-chapter-code/ch05.ipynb
@@ -1933,7 +1933,7 @@
   "metadata": {},
   "source": [
    "- The previous two subsections introduced temperature sampling and top-k sampling\n",
-    "- Let's use these two concepts to modify the `generate_simple` function we used to generate text via the LLM earlier, creating a new `generate` function:"
+    "- Let's use these two concepts to modify the `generate_text_simple` function from chapter 4, creating a new `generate` function:"
   ]
  },
  {
@@ -1963,6 +1963,10 @@
    "        if temperature > 0.0:\n",
    "            logits = logits / temperature\n",
    "\n",
    "            # New (not in book): numerical stability tip to get equivalent results on mps device\n",
    "            # subtract rowwise max before softmax\n",
    "            logits = logits - logits.max(dim=-1, keepdim=True).values\n",
    "            \n",
    "            # Apply softmax to get probabilities\n",
    "            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)\n",
    "\n",
--- a/ch05/01_main-chapter-code/gpt_generate.py
+++ b/ch05/01_main-chapter-code/gpt_generate.py
@@ -235,6 +235,10 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
        if temperature > 0.0:
            logits = logits / temperature
            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            logits = logits - logits.max(dim=-1, keepdim=True).values
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
--- a/ch05/07_gpt_to_llama/previous_chapters.py
+++ b/ch05/07_gpt_to_llama/previous_chapters.py
@@ -44,6 +44,10 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
        if temperature > 0.0:
            logits = logits / temperature
            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            logits = logits - logits.max(dim=-1, keepdim=True).values
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -1047,6 +1047,10 @@
    "        if temperature > 0.0:\n",
    "            logits = logits / temperature\n",
    "\n",
    "            # New (not in book): numerical stability tip to get equivalent results on mps device\n",
    "            # subtract rowwise max before softmax\n",
    "            logits = logits - logits.max(dim=-1, keepdim=True).values\n",
    "            \n",
    "            # Apply softmax to get probabilities\n",
    "            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)\n",
    "\n",
--- a/ch06/02_bonus_additional-experiments/previous_chapters.py
+++ b/ch06/02_bonus_additional-experiments/previous_chapters.py
@@ -334,6 +334,10 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
        if temperature > 0.0:
            logits = logits / temperature
            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            logits = logits - logits.max(dim=-1, keepdim=True).values
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
--- a/ch07/01_main-chapter-code/previous_chapters.py
+++ b/ch07/01_main-chapter-code/previous_chapters.py
@@ -267,6 +267,10 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
        if temperature > 0.0:
            logits = logits / temperature
            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            logits = logits - logits.max(dim=-1, keepdim=True).values
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
--- a/ch07/04_preference-tuning-with-dpo/previous_chapters.py
+++ b/ch07/04_preference-tuning-with-dpo/previous_chapters.py
@@ -267,8 +267,13 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
        if temperature > 0.0:
            logits = logits / temperature
            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            #logits = logits - logits.max(dim=-1, keepdim=True).values
            # Apply softmax to get probabilities
-            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
+            #probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
            probs = torch.log_softmax(logits, dim=-1)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
--- a/pkg/llms_from_scratch/ch05.py
+++ b/pkg/llms_from_scratch/ch05.py
@@ -36,6 +36,10 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
        if temperature > 0.0:
            logits = logits / temperature
            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            logits = logits - logits.max(dim=-1, keepdim=True).values
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)