mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Fix encoding of multiple preceding spaces in BPE tokenizer. (#945)
* Fix encoding of multiple preceding spaces in BPE tokenizer.
* Add test
---------
Co-authored-by: rasbt <mail@sebastianraschka.com>
This commit is contained in:
@@ -609,9 +609,9 @@
|
|||||||
" else:\n",
|
" else:\n",
|
||||||
" word = m.group(2)\n",
|
" word = m.group(2)\n",
|
||||||
" if pending_spaces > 0:\n",
|
" if pending_spaces > 0:\n",
|
||||||
" tokens.append(\"Ġ\" + word) # one leading space\n",
|
|
||||||
" for _ in range(pending_spaces - 1):\n",
|
" for _ in range(pending_spaces - 1):\n",
|
||||||
" tokens.append(\"Ġ\") # remaining spaces as standalone\n",
|
" tokens.append(\"Ġ\") # remaining spaces as standalone\n",
|
||||||
|
" tokens.append(\"Ġ\" + word) # one leading space\n",
|
||||||
" pending_spaces = 0\n",
|
" pending_spaces = 0\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" tokens.append(word)\n",
|
" tokens.append(word)\n",
|
||||||
|
|||||||
@@ -239,3 +239,14 @@ def test_space_newline_space_patterns(imported_module, gpt2_files):
|
|||||||
]
|
]
|
||||||
for s in samples:
|
for s in samples:
|
||||||
assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}"
|
assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_leading_spaces_roundtrip(imported_module, gpt2_files):
    """Check that text starting with several spaces survives encode -> decode.

    Args:
        imported_module: Module object expected to expose ``BPETokenizerSimple``.
        gpt2_files: Mapping with the keys ``"encoder.json"`` and ``"vocab.bpe"``
            whose values are passed as the vocabulary and merges paths.
    """
    BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
    # Fail with a readable message instead of "'NoneType' object is not callable".
    assert BPETokenizerSimple is not None, "BPETokenizerSimple not found in imported module"

    tok = BPETokenizerSimple()
    tok.load_vocab_and_merges_from_openai(
        vocab_path=gpt2_files["encoder.json"], bpe_merges_path=gpt2_files["vocab.bpe"]
    )

    # The input must start with MORE than one space: a single leading space is
    # folded into the following word and would not exercise the
    # multiple-preceding-spaces case this test is named for.
    text = "  Hello World."
    assert tok.decode(tok.encode(text)) == text
|
||||||
|
|||||||
Reference in New Issue
Block a user