From 42c130623b28ec260b012f5accc61264e72dbb89 Mon Sep 17 00:00:00 2001 From: casinca <47400729+casinca@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:14:11 +0200 Subject: [PATCH] `Qwen3Tokenizer` fix for Qwen3 Base models and generation mismatch with HF (#828) * prevent `self.apply_chat_template` being applied for base Qwen models * - added no chat template comparison in `test_chat_wrap_and_equivalence` - removed duplicate comparison * Revert "- added no chat template comparison in `test_chat_wrap_and_equivalence`" This reverts commit 3a5ee8cfa19aa7e4874cd5f35171098be760b05f. * Revert "prevent `self.apply_chat_template` being applied for base Qwen models" This reverts commit df504397a8957886c6d6d808615545e37ceffcad. * copied `download_file` in `utils` from https://github.com/rasbt/reasoning-from-scratch/blob/main/reasoning_from_scratch/utils.py * added copy of test `def test_tokenizer_equivalence()` from `reasoning-from-scratch` in `test_qwen3.py` * removed duplicate code fragment in`test_chat_wrap_and_equivalence` * use apply_chat_template * add toggle for instruct model * Update tokenizer usage --------- Co-authored-by: rasbt --- ch05/11_qwen3/README.md | 15 +++- .../standalone-qwen3-moe-plus-kvcache.ipynb | 1 + ch05/11_qwen3/standalone-qwen3-moe.ipynb | 1 + .../standalone-qwen3-plus-kvcache.ipynb | 12 ++- ch05/11_qwen3/standalone-qwen3.ipynb | 12 ++- pkg/llms_from_scratch/tests/test_qwen3.py | 78 +++++++++++++++++-- pkg/llms_from_scratch/utils.py | 21 +++++ 7 files changed, 125 insertions(+), 15 deletions(-) diff --git a/ch05/11_qwen3/README.md b/ch05/11_qwen3/README.md index 97dfe28..08da9b7 100644 --- a/ch05/11_qwen3/README.md +++ b/ch05/11_qwen3/README.md @@ -45,8 +45,14 @@ pip install llms_from_scratch tokenizers Specify which model to use: ```python -USE_REASONING_MODEL = False # The base model -USE_REASONING_MODEL = True # The "thinking" model +USE_REASONING_MODEL = True +# Uses the base model if USE_REASONING_MODEL = False + +USE_INSTRUCT_MODEL = 
False +# Uses the instruct mode (without reasoning) if +# USE_REASONING_MODEL = True +# USE_INSTRUCT_MODEL = False +# This setting has no effect if USE_REASONING_MODEL = False # Use @@ -187,10 +193,11 @@ else: tok_filename = "tokenizer-base.json" tokenizer = Qwen3Tokenizer( - tokenizer_file_path=tok_filename, + tokenizer_file_path=tokenizer_file_path, repo_id=repo_id, + apply_chat_template=USE_REASONING_MODEL, add_generation_prompt=USE_REASONING_MODEL, - add_thinking=USE_REASONING_MODEL + add_thinking=not USE_INSTRUCT_MODEL ) ``` diff --git a/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb b/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb index 36f8f9d..42c5f63 100644 --- a/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb +++ b/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb @@ -1064,6 +1064,7 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=True,\n", " add_generation_prompt=True,\n", " add_thinking=True\n", ")" diff --git a/ch05/11_qwen3/standalone-qwen3-moe.ipynb b/ch05/11_qwen3/standalone-qwen3-moe.ipynb index 5c1a402..6e845fb 100644 --- a/ch05/11_qwen3/standalone-qwen3-moe.ipynb +++ b/ch05/11_qwen3/standalone-qwen3-moe.ipynb @@ -1006,6 +1006,7 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=True,\n", " add_generation_prompt=True,\n", " add_thinking=True\n", ")" diff --git a/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb b/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb index ca9e15e..df216ff 100644 --- a/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb +++ b/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb @@ -115,7 +115,14 @@ "metadata": {}, "outputs": [], "source": [ - "USE_REASONING_MODEL = True" + "USE_REASONING_MODEL = True\n", + "# Uses the base model if USE_REASONING_MODEL = False\n", + "\n", + "USE_INSTRUCT_MODEL = False\n", + "# Uses the instruct mode 
(without reasoning) if \n", + "# USE_REASONING_MODEL = True\n", + "# USE_INSTRUCT_MODEL = False\n", + "# This setting has no effect if USE_REASONING_MODEL = False" ] }, { @@ -1060,8 +1067,9 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=USE_REASONING_MODEL,\n", " add_generation_prompt=USE_REASONING_MODEL,\n", - " add_thinking=USE_REASONING_MODEL\n", + " add_thinking=not USE_INSTRUCT_MODEL\n", ")" ] }, diff --git a/ch05/11_qwen3/standalone-qwen3.ipynb b/ch05/11_qwen3/standalone-qwen3.ipynb index 0302990..fd9f588 100644 --- a/ch05/11_qwen3/standalone-qwen3.ipynb +++ b/ch05/11_qwen3/standalone-qwen3.ipynb @@ -113,7 +113,14 @@ "metadata": {}, "outputs": [], "source": [ - "USE_REASONING_MODEL = True" + "USE_REASONING_MODEL = True\n", + "# Uses the base model if USE_REASONING_MODEL = False\n", + "\n", + "USE_INSTRUCT_MODEL = False\n", + "# Uses the instruct mode (without reasoning) if \n", + "# USE_REASONING_MODEL = True\n", + "# USE_INSTRUCT_MODEL = False\n", + "# This setting has no effect if USE_REASONING_MODEL = False" ] }, { @@ -1002,8 +1009,9 @@ "tokenizer = Qwen3Tokenizer(\n", " tokenizer_file_path=tokenizer_file_path,\n", " repo_id=repo_id,\n", + " apply_chat_template=USE_REASONING_MODEL,\n", " add_generation_prompt=USE_REASONING_MODEL,\n", - " add_thinking=USE_REASONING_MODEL\n", + " add_thinking=not USE_INSTRUCT_MODEL\n", ")" ] }, diff --git a/pkg/llms_from_scratch/tests/test_qwen3.py b/pkg/llms_from_scratch/tests/test_qwen3.py index 78555c0..5f72208 100644 --- a/pkg/llms_from_scratch/tests/test_qwen3.py +++ b/pkg/llms_from_scratch/tests/test_qwen3.py @@ -20,7 +20,12 @@ from llms_from_scratch.kv_cache.generate import generate_text_simple as generate from llms_from_scratch.kv_cache_batched.qwen3 import Qwen3Model as Qwen3ModelKVBatched from llms_from_scratch.kv_cache_batched.generate import generate_text_simple as generate_text_simple_batched +from 
llms_from_scratch.utils import download_file + import importlib +import os +import shutil +import tempfile import platform import pytest import torch @@ -465,13 +470,6 @@ def test_chat_wrap_and_equivalence(add_gen, add_think): add_generation_prompt=add_gen, enable_thinking=add_think, ) - ours = qt.encode(prompt) - ref = hf_tok.apply_chat_template( - messages, - tokenize=True, - add_generation_prompt=add_gen, - enable_thinking=add_think, - ) if add_gen and not add_think: pass # skip edge case as this is not something we use in practice @@ -534,6 +532,72 @@ def test_multiturn_equivalence(repo_id, tok_file, add_gen, add_think): assert ours_dec == ref_dec +@pytest.mark.skipif(not transformers_installed, reason="transformers not installed") +def test_tokenizer_equivalence(): + from transformers import AutoTokenizer + + prompt = "Give me a short introduction to large language models." + messages = [ + {"role": "user", "content": prompt}, + ] + + for apply_chat_template in (True, False): + for s in ("-Base", ""): + repo_id = f"Qwen/Qwen3-0.6B{s}" + tokenizer_ref = AutoTokenizer.from_pretrained(repo_id) + tokenizer_url = f"https://huggingface.co/Qwen/Qwen3-0.6B{s}/resolve/main/tokenizer.json" + download_file(tokenizer_url, out_dir=".") + + old_name = "tokenizer.json" + + if not s: + new_name = "tokenizer-reasoning.json" + else: + new_name = "tokenizer-base.json" + + try: + shutil.move(old_name, new_name) + except Exception: + with tempfile.NamedTemporaryFile(delete=False, dir=".") as tmp_file: + shutil.copyfile(old_name, tmp_file.name) + os.replace(tmp_file.name, new_name) + os.remove(old_name) + + for states in ((True, True), (False, False)): + tokenizer = Qwen3Tokenizer( + tokenizer_file_path=new_name, + repo_id=repo_id, + apply_chat_template=apply_chat_template, + add_generation_prompt=states[0], + add_thinking=states[1] + ) + input_token_ids = tokenizer.encode(prompt) + + if apply_chat_template: + input_token_ids_ref = tokenizer_ref.apply_chat_template( + messages, + 
tokenize=True, + add_generation_prompt=states[0], + enable_thinking=states[1], + ) + else: + input_token_ids_ref = input_token_ids + + assert input_token_ids == input_token_ids_ref, states + + output_text = tokenizer.decode(input_token_ids) + out_text_ref = tokenizer_ref.decode(input_token_ids_ref) + assert output_text == out_text_ref, states + + assert tokenizer.encode("<|endoftext|>") == [tokenizer._special_to_id["<|endoftext|>"]] + assert tokenizer.encode("<|im_end|>") == [tokenizer._special_to_id["<|im_end|>"]] + + expected_eos_token = "<|im_end|>" if "base" not in new_name else "<|endoftext|>" + expected_pad_token = "<|endoftext|>" + assert tokenizer.decode([tokenizer.eos_token_id]) == expected_eos_token + assert tokenizer.decode([tokenizer.pad_token_id]) == expected_pad_token + + @pytest.mark.skipif(not transformers_installed, reason="transformers not installed") @pytest.mark.parametrize("repo_id, tok_file", [ ("Qwen/Qwen3-0.6B", "Qwen3-0.6B/tokenizer.json"), diff --git a/pkg/llms_from_scratch/utils.py b/pkg/llms_from_scratch/utils.py index 466ca4c..174f83a 100644 --- a/pkg/llms_from_scratch/utils.py +++ b/pkg/llms_from_scratch/utils.py @@ -9,6 +9,8 @@ import ast import re import types from pathlib import Path +import urllib.request +import urllib.parse import nbformat @@ -122,3 +124,22 @@ def import_definitions_from_notebook(nb_dir_or_path, notebook_name=None, *, extr exec(src, mod.__dict__) return mod + +def download_file(url, out_dir="."): + """Simple file download utility for tests.""" + from pathlib import Path + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + filename = Path(urllib.parse.urlparse(url).path).name + dest = out_dir / filename + + if dest.exists(): + return dest + + try: + with urllib.request.urlopen(url) as response: + with open(dest, 'wb') as f: + f.write(response.read()) + return dest + except Exception as e: + raise RuntimeError(f"Failed to download {url}: {e}")