Mirror of https://github.com/rasbt/LLMs-from-scratch.git, synced 2026-04-10 12:33:42 +00:00
Fix encoding of multiple preceding spaces in BPE tokenizer. (#945)
* Fix encoding of multiple preceding spaces in BPE tokenizer.
* Add test

Co-authored-by: rasbt <mail@sebastianraschka.com>
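For context: per the comments in the diff below, the notebook encodes a run of leading spaces as standalone "Ġ" (space-marker) tokens followed by one space fused onto the word. A quick probe against tiktoken, the reference tokenizer the repo's tests compare against (not part of this commit), shows the behavior in question:

import tiktoken

# Probe how the reference GPT-2 tokenizer handles runs of leading spaces
# by decoding each resulting id back to its text piece.
tik = tiktoken.get_encoding("gpt2")
for s in [" Hello", "  Hello", "   Hello"]:
    ids = tik.encode(s)
    print(repr(s), "->", [tik.decode([i]) for i in ids])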
@@ -609,9 +609,9 @@
 "        else:\n",
 "            word = m.group(2)\n",
 "            if pending_spaces > 0:\n",
-"                tokens.append(\"Ġ\" + word)  # one leading space\n",
 "                for _ in range(pending_spaces - 1):\n",
 "                    tokens.append(\"Ġ\")  # remaining spaces as standalone\n",
+"                tokens.append(\"Ġ\" + word)  # one leading space\n",
 "                pending_spaces = 0\n",
 "            else:\n",
 "                tokens.append(word)\n",
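Pulled out of the notebook JSON, the patched logic reads as follows. This is a minimal self-contained sketch: the regex and match groups here are simplified stand-ins for the notebook's actual pre-tokenization pattern.

import re

def pretokenize(text):
    # Simplified version of the loop the diff patches: walk space runs and
    # words, buffering spaces in pending_spaces until the next word arrives.
    tokens = []
    pending_spaces = 0
    for m in re.finditer(r"( +)|(\S+)", text):
        if m.group(1):  # a run of spaces
            pending_spaces += len(m.group(1))
        else:
            word = m.group(2)
            if pending_spaces > 0:
                for _ in range(pending_spaces - 1):
                    tokens.append("Ġ")  # remaining spaces as standalone
                tokens.append("Ġ" + word)  # one leading space
                pending_spaces = 0
            else:
                tokens.append(word)
    return tokens

print(pretokenize("   Hello World."))  # ['Ġ', 'Ġ', 'ĠHello', 'ĠWorld.']

The fix is purely about ordering: the standalone "Ġ" tokens for the extra spaces must be emitted before the word token, not after it.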
@@ -238,4 +238,15 @@ def test_space_newline_space_patterns(imported_module, gpt2_files):
         "Hello\n world",
     ]
     for s in samples:
         assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}"
+
+
+def test_multiple_leading_spaces_roundtrip(imported_module, gpt2_files):
+    BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
+    tok = BPETokenizerSimple()
+    tok.load_vocab_and_merges_from_openai(
+        vocab_path=gpt2_files["encoder.json"], bpe_merges_path=gpt2_files["vocab.bpe"]
+    )
+
+    text = " Hello World."
+    assert tok.decode(tok.encode(text)) == text
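Why a decode/encode roundtrip catches the old ordering: with the word emitted before the buffered spaces, decoding moved the extra spaces past the word. A toy illustration (not the notebook's actual decoder), assuming the pre- and post-fix token orderings implied by the diff above:

def to_text(pretokens):
    # Undo the byte-level space marker to recover plain text.
    return "".join(pretokens).replace("Ġ", " ")

old_order = ["ĠHello", "Ġ", "Ġ"]  # pre-fix ordering for "   Hello"
new_order = ["Ġ", "Ġ", "ĠHello"]  # post-fix ordering
print(repr(to_text(old_order)))   # ' Hello  ' -- spaces migrated, roundtrip fails
print(repr(to_text(new_order)))   # '   Hello' -- roundtrip preserved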