Fix encoding of multiple preceding spaces in BPE tokenizer. (#945)

* Fix encoding of multiple preceding spaces in BPE tokenizer.

* Add test

---------

Co-authored-by: rasbt <mail@sebastianraschka.com>
Author: Maxwell De Jong
Date: 2026-01-10 11:27:23 -05:00
Committed by: GitHub
Parent: 90e0f3cc15
Commit: e0dbec3331
2 changed files with 13 additions and 2 deletions


@@ -609,9 +609,9 @@
" else:\n",
" word = m.group(2)\n",
" if pending_spaces > 0:\n",
" tokens.append(\"Ġ\" + word) # one leading space\n",
" for _ in range(pending_spaces - 1):\n",
" tokens.append(\"Ġ\") # remaining spaces as standalone\n",
" tokens.append(\"Ġ\" + word) # one leading space\n",
" pending_spaces = 0\n",
" else:\n",
" tokens.append(word)\n",


@@ -238,4 +238,15 @@ def test_space_newline_space_patterns(imported_module, gpt2_files):
"Hello\n world",
]
for s in samples:
assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}"
assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}"
def test_multiple_leading_spaces_roundtrip(imported_module, gpt2_files):
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
tok = BPETokenizerSimple()
tok.load_vocab_and_merges_from_openai(
vocab_path=gpt2_files["encoder.json"], bpe_merges_path=gpt2_files["vocab.bpe"]
)
text = " Hello World."
assert tok.decode(tok.encode(text)) == text
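
The new test only asserts an encode/decode round trip; the same string can also be sanity-checked against tiktoken's reference GPT-2 encoding, mirroring the comparison pattern of test_space_newline_space_patterns above (assumes the tiktoken package is installed):

    import tiktoken

    tik = tiktoken.get_encoding("gpt2")
    text = "  Hello World."  # two leading spaces
    ids = tik.encode(text)
    # GPT-2's pre-tokenizer keeps the first space as a standalone token and
    # fuses the second onto "Hello", so decoding reproduces the input exactly.
    assert tik.decode(ids) == text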