From 42c130623b28ec260b012f5accc61264e72dbb89 Mon Sep 17 00:00:00 2001 From: casinca <47400729+casinca@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:14:11 +0200 Subject: [PATCH] `Qwen3Tokenizer` fix for Qwen3 Base models and generation mismatch with HF (#828) * prevent `self.apply_chat_template` being applied for base Qwen models * - added no chat template comparison in `test_chat_wrap_and_equivalence` - removed duplicate comparison * Revert "- added no chat template comparison in `test_chat_wrap_and_equivalence`" This reverts commit 3a5ee8cfa19aa7e4874cd5f35171098be760b05f. * Revert "prevent `self.apply_chat_template` being applied for base Qwen models" This reverts commit df504397a8957886c6d6d808615545e37ceffcad. * copied `download_file` in `utils` from https://github.com/rasbt/reasoning-from-scratch/blob/main/reasoning_from_scratch/utils.py * added copy of test `def test_tokenizer_equivalence()` from `reasoning-from-scratch` in `test_qwen3.py` * removed duplicate code fragment in`test_chat_wrap_and_equivalence` * use apply_chat_template * add toggle for instruct model * Update tokenizer usage --------- Co-authored-by: rasbt --- ch05/11_qwen3/README.md | 15 +++- .../standalone-qwen3-moe-plus-kvcache.ipynb | 1 + ch05/11_qwen3/standalone-qwen3-moe.ipynb | 1 + .../standalone-qwen3-plus-kvcache.ipynb | 12 ++- ch05/11_qwen3/standalone-qwen3.ipynb | 12 ++- pkg/llms_from_scratch/tests/test_qwen3.py | 78 +++++++++++++++++-- pkg/llms_from_scratch/utils.py | 21 +++++ 7 files changed, 125 insertions(+), 15 deletions(-) diff --git a/ch05/11_qwen3/README.md b/ch05/11_qwen3/README.md index 97dfe28..08da9b7 100644 --- a/ch05/11_qwen3/README.md +++ b/ch05/11_qwen3/README.md @@ -45,8 +45,14 @@ pip install llms_from_scratch tokenizers Specify which model to use: ```python -USE_REASONING_MODEL = False # The base model -USE_REASONING_MODEL = True # The "thinking" model +USE_REASONING_MODEL = True +# Uses the base model if USE_REASONING_MODEL = False + +USE_INSTRUCT_MODEL = 
False +# Uses the instruct mode (without reasoning) if +# USE_REASONING_MODEL = True +# USE_INSTRUCT_MODEL = False +# This setting has no effect if USE_REASONING_MODEL = False # Use @@ -187,10 +193,11 @@ else: tok_filename = "tokenizer-base.json" tokenizer = Qwen3Tokenizer( - tokenizer_file_path=tok_filename, + tokenizer_file_path=tokenizer_file_path, repo_id=repo_id, + apply_chat_template=USE_REASONING_MODEL, add_generation_prompt=USE_REASONING_MODEL, - add_thinking=USE_REASONING_MODEL + add_thinking=not USE_INSTRUCT_MODEL ) ``` diff --git a/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb b/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb index 36f8f9d..42c5f63 100644 --- a/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb +++ b/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb @@ -1064,6 +1064,7 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=True,\n", " add_generation_prompt=True,\n", " add_thinking=True\n", ")" diff --git a/ch05/11_qwen3/standalone-qwen3-moe.ipynb b/ch05/11_qwen3/standalone-qwen3-moe.ipynb index 5c1a402..6e845fb 100644 --- a/ch05/11_qwen3/standalone-qwen3-moe.ipynb +++ b/ch05/11_qwen3/standalone-qwen3-moe.ipynb @@ -1006,6 +1006,7 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=True,\n", " add_generation_prompt=True,\n", " add_thinking=True\n", ")" diff --git a/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb b/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb index ca9e15e..df216ff 100644 --- a/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb +++ b/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb @@ -115,7 +115,14 @@ "metadata": {}, "outputs": [], "source": [ - "USE_REASONING_MODEL = True" + "USE_REASONING_MODEL = True\n", + "# Uses the base model if USE_REASONING_MODEL = False\n", + "\n", + "USE_INSTRUCT_MODEL = False\n", + "# Uses the instruct mode 
(without reasoning) if \n", + "# USE_REASONING_MODEL = True\n", + "# USE_INSTRUCT_MODEL = False\n", + "# This setting has no effect if USE_REASONING_MODEL = False" ] }, { @@ -1060,8 +1067,9 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=USE_REASONING_MODEL,\n", " add_generation_prompt=USE_REASONING_MODEL,\n", - " add_thinking=USE_REASONING_MODEL\n", + " add_thinking=not USE_INSTRUCT_MODEL\n", ")" ] }, diff --git a/ch05/11_qwen3/standalone-qwen3.ipynb b/ch05/11_qwen3/standalone-qwen3.ipynb index 0302990..fd9f588 100644 --- a/ch05/11_qwen3/standalone-qwen3.ipynb +++ b/ch05/11_qwen3/standalone-qwen3.ipynb @@ -113,7 +113,14 @@ "metadata": {}, "outputs": [], "source": [ - "USE_REASONING_MODEL = True" + "USE_REASONING_MODEL = True\n", + "# Uses the base model if USE_REASONING_MODEL = False\n", + "\n", + "USE_INSTRUCT_MODEL = False\n", + "# Uses the instruct mode (without reasoning) if \n", + "# USE_REASONING_MODEL = True\n", + "# USE_INSTRUCT_MODEL = False\n", + "# This setting has no effect if USE_REASONING_MODEL = False" ] }, { @@ -1002,8 +1009,9 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=USE_REASONING_MODEL,\n", " add_generation_prompt=USE_REASONING_MODEL,\n", - " add_thinking=USE_REASONING_MODEL\n", + " add_thinking=not USE_INSTRUCT_MODEL\n", ")" ] }, diff --git a/pkg/llms_from_scratch/tests/test_qwen3.py b/pkg/llms_from_scratch/tests/test_qwen3.py index 78555c0..5f72208 100644 --- a/pkg/llms_from_scratch/tests/test_qwen3.py +++ b/pkg/llms_from_scratch/tests/test_qwen3.py @@ -20,7 +20,12 @@ from llms_from_scratch.kv_cache.generate import generate_text_simple as generate from llms_from_scratch.kv_cache_batched.qwen3 import Qwen3Model as Qwen3ModelKVBatched from llms_from_scratch.kv_cache_batched.generate import generate_text_simple as generate_text_simple_batched +from 
llms_from_scratch.utils import download_file + import importlib +import os +import shutil +import tempfile import platform import pytest import torch @@ -465,13 +470,6 @@ def test_chat_wrap_and_equivalence(add_gen, add_think): add_generation_prompt=add_gen, enable_thinking=add_think, ) - ours = qt.encode(prompt) - ref = hf_tok.apply_chat_template( - messages, - tokenize=True, - add_generation_prompt=add_gen, - enable_thinking=add_think, - ) if add_gen and not add_think: pass # skip edge case as this is not something we use in practice @@ -534,6 +532,72 @@ def test_multiturn_equivalence(repo_id, tok_file, add_gen, add_think): assert ours_dec == ref_dec +@pytest.mark.skipif(not transformers_installed, reason="transformers not installed") +def test_tokenizer_equivalence(): + from transformers import AutoTokenizer + + prompt = "Give me a short introduction to large language models." + messages = [ + {"role": "user", "content": prompt}, + ] + + for apply_chat_template in (True, False): + for s in ("-Base", ""): + repo_id = f"Qwen/Qwen3-0.6B{s}" + tokenizer_ref = AutoTokenizer.from_pretrained(repo_id) + tokenizer_url = f"https://huggingface.co/Qwen/Qwen3-0.6B{s}/resolve/main/tokenizer.json" + download_file(tokenizer_url, out_dir=".") + + old_name = "tokenizer.json" + + if not s: + new_name = "tokenizer-reasoning.json" + else: + new_name = "tokenizer-base.json" + + try: + shutil.move(old_name, new_name) + except Exception: + with tempfile.NamedTemporaryFile(delete=False, dir=".") as tmp_file: + shutil.copyfile(old_name, tmp_file.name) + os.replace(tmp_file.name, new_name) + os.remove(old_name) + + for states in ((True, True), (False, False)): + tokenizer = Qwen3Tokenizer( + tokenizer_file_path=new_name, + repo_id=repo_id, + apply_chat_template=apply_chat_template, + add_generation_prompt=states[0], + add_thinking=states[1] + ) + input_token_ids = tokenizer.encode(prompt) + + if apply_chat_template: + input_token_ids_ref = tokenizer_ref.apply_chat_template( + messages, + 
tokenize=True, + add_generation_prompt=states[0], + enable_thinking=states[1], + ) + else: + input_token_ids_ref = input_token_ids + + assert input_token_ids == input_token_ids_ref, states + + output_text = tokenizer.decode(input_token_ids) + out_text_ref = tokenizer_ref.decode(input_token_ids_ref) + assert output_text == out_text_ref, states + + assert tokenizer.encode("<|endoftext|>") == [tokenizer._special_to_id["<|endoftext|>"]] + assert tokenizer.encode("<|im_end|>") == [tokenizer._special_to_id["<|im_end|>"]] + + expected_eos_token = "<|im_end|>" if "base" not in new_name else "<|endoftext|>" + expected_pad_token = "<|endoftext|>" + assert tokenizer.decode([tokenizer.eos_token_id]) == expected_eos_token + assert tokenizer.decode([tokenizer.pad_token_id]) == expected_pad_token + + @pytest.mark.skipif(not transformers_installed, reason="transformers not installed") @pytest.mark.parametrize("repo_id, tok_file", [ ("Qwen/Qwen3-0.6B", "Qwen3-0.6B/tokenizer.json"), diff --git a/pkg/llms_from_scratch/utils.py b/pkg/llms_from_scratch/utils.py index 466ca4c..174f83a 100644 --- a/pkg/llms_from_scratch/utils.py +++ b/pkg/llms_from_scratch/utils.py @@ -9,6 +9,8 @@ import ast import re import types from pathlib import Path +import urllib.request +import urllib.parse import nbformat @@ -122,3 +124,22 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr exec(src, mod.__dict__) return mod + +def download_file(url, out_dir="."): + """Simple file download utility for tests.""" + from pathlib import Path + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + filename = Path(urllib.parse.urlparse(url).path).name + dest = out_dir / filename + + if dest.exists(): + return dest + + try: + with urllib.request.urlopen(url) as response: + with open(dest, 'wb') as f: + f.write(response.read()) + return dest + except Exception as e: + raise RuntimeError(f"Failed to download {url}: {e}")