From 8860e16e051b121e810d554130b925491d8a4135 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sun, 21 Jan 2024 12:03:04 -0600 Subject: [PATCH] <|endoftext|> token in dataset v1 --- ch02/01_main-chapter-code/ch02.ipynb | 2 +- ch02/01_main-chapter-code/dataloader.ipynb | 2 +- ch03/01_main-chapter-code/multihead-attention.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index 383d6a5..c2a465c 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -1136,7 +1136,7 @@ " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt)\n", + " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n", diff --git a/ch02/01_main-chapter-code/dataloader.ipynb b/ch02/01_main-chapter-code/dataloader.ipynb index 6225c9a..f5c53f4 100644 --- a/ch02/01_main-chapter-code/dataloader.ipynb +++ b/ch02/01_main-chapter-code/dataloader.ipynb @@ -62,7 +62,7 @@ " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt)\n", + " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n", diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb index 6c2bfc6..37f7c50 100644 --- a/ch03/01_main-chapter-code/multihead-attention.ipynb +++ b/ch03/01_main-chapter-code/multihead-attention.ipynb @@ -46,7 +46,7 @@ " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt)\n", + " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n",