Remove leftover instances of self.tokenizer (#201)

* Remove leftover instances of self.tokenizer
* add endoftext token
2026-04-10 12:33:42 +00:00 · 2024-06-08 14:57:34 -05:00
parent c303a7f36d
commit 72a073bbbf
13 changed files with 18 additions and 23 deletions
--- a/ch02/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch02/01_main-chapter-code/exercise-solutions.ipynb
@@ -248,7 +248,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
   "id": "4d50af16-937b-49e0-8ffd-42d30cbb41c9",
   "metadata": {},
   "outputs": [],
@@ -260,12 +260,11 @@
    "\n",
    "class GPTDatasetV1(Dataset):\n",
    "    def __init__(self, txt, tokenizer, max_length, stride):\n",
-    "        self.tokenizer = tokenizer\n",
    "        self.input_ids = []\n",
    "        self.target_ids = []\n",
    "\n",
    "        # Tokenize the entire text\n",
-    "        token_ids = self.tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt)\n",
    "\n",
    "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
    "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -311,7 +310,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
   "id": "0128eefa-d7c8-4f76-9851-566dfa7c3745",
   "metadata": {},
   "outputs": [
@@ -324,7 +323,7 @@
       "        [ 402,  271]])"
      ]
     },
-     "execution_count": 11,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -341,7 +340,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
   "id": "ff5c1e90-c6de-4a87-adf6-7e19f603291c",
   "metadata": {},
   "outputs": [
@@ -354,7 +353,7 @@
       "        [  402,   271, 10899,  2138,   257,  7026, 15632,   438]])"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }