diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb index cad47a8..9ff3acc 100644 --- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb +++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb @@ -609,9 +609,9 @@ " else:\n", " word = m.group(2)\n", " if pending_spaces > 0:\n", - " tokens.append(\"Ġ\" + word) # one leading space\n", " for _ in range(pending_spaces - 1):\n", " tokens.append(\"Ġ\") # remaining spaces as standalone\n", + " tokens.append(\"Ġ\" + word) # one leading space\n", " pending_spaces = 0\n", " else:\n", " tokens.append(word)\n", diff --git a/ch02/05_bpe-from-scratch/tests.py b/ch02/05_bpe-from-scratch/tests.py index 1b570bf..8383842 100644 --- a/ch02/05_bpe-from-scratch/tests.py +++ b/ch02/05_bpe-from-scratch/tests.py @@ -238,4 +238,15 @@ def test_space_newline_space_patterns(imported_module, gpt2_files): "Hello\n world", ] for s in samples: - assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}" \ No newline at end of file + assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}" + + +def test_multiple_leading_spaces_roundtrip(imported_module, gpt2_files): + BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None) + tok = BPETokenizerSimple() + tok.load_vocab_and_merges_from_openai( + vocab_path=gpt2_files["encoder.json"], bpe_merges_path=gpt2_files["vocab.bpe"] + ) + + text = " Hello World." + assert tok.decode(tok.encode(text)) == text