From 8860e16e051b121e810d554130b925491d8a4135 Mon Sep 17 00:00:00 2001
From: rasbt <mail@sebastianraschka.com>
Date: Sun, 21 Jan 2024 12:03:04 -0600
Subject: [PATCH] Allow <|endoftext|> special token when tokenizing text in dataset v1

---
 ch02/01_main-chapter-code/ch02.ipynb                | 2 +-
 ch02/01_main-chapter-code/dataloader.ipynb          | 2 +-
 ch03/01_main-chapter-code/multihead-attention.ipynb | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index 383d6a5..c2a465c 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -1136,7 +1136,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
diff --git a/ch02/01_main-chapter-code/dataloader.ipynb b/ch02/01_main-chapter-code/dataloader.ipynb
index 6225c9a..f5c53f4 100644
--- a/ch02/01_main-chapter-code/dataloader.ipynb
+++ b/ch02/01_main-chapter-code/dataloader.ipynb
@@ -62,7 +62,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb
index 6c2bfc6..37f7c50 100644
--- a/ch03/01_main-chapter-code/multihead-attention.ipynb
+++ b/ch03/01_main-chapter-code/multihead-attention.ipynb
@@ -46,7 +46,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",