add endoftext token

2026-04-10 12:33:42 +00:00 · 2024-03-26 06:47:05 -05:00
parent de576296de
commit 12fff1ddcb
2 changed files with 7 additions and 7 deletions
--- a/ch05/01_main-chapter-code/ch05.ipynb
+++ b/ch05/01_main-chapter-code/ch05.ipynb
@@ -198,7 +198,7 @@
    "from previous_chapters import generate_text_simple\n",
    "\n",
    "def text_to_token_ids(text, tokenizer):\n",
-    "    encoded = tokenizer.encode(text)\n",
+    "    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})\n",
    "    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension\n",
    "    return encoded_tensor\n",
    "\n",
@@ -430,7 +430,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 8,
   "id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a",
   "metadata": {
    "colab": {
@@ -470,7 +470,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 9,
   "id": "31402a67-a16e-4aeb-977e-70abb9c9949b",
   "metadata": {
    "colab": {
@@ -504,7 +504,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 10,
   "id": "9b003797-161b-4d98-81dc-e68320e09fec",
   "metadata": {
    "colab": {
--- a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
@@ -305,11 +305,11 @@ def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir):


 def text_to_token_ids(text, tokenizer):
-    encoded = tokenizer.encode(text)
-    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
+    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
+    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # Add batch dimension
    return encoded_tensor


 def token_ids_to_text(token_ids, tokenizer):
-    flat = token_ids.squeeze(0)  # remove batch dimension
+    flat = token_ids.squeeze(0)  # Remove batch dimension
    return tokenizer.decode(flat.tolist())