diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index 383d6a5..c2a465c 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -1136,7 +1136,7 @@ " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt)\n", + " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n", diff --git a/ch02/01_main-chapter-code/dataloader.ipynb b/ch02/01_main-chapter-code/dataloader.ipynb index 6225c9a..f5c53f4 100644 --- a/ch02/01_main-chapter-code/dataloader.ipynb +++ b/ch02/01_main-chapter-code/dataloader.ipynb @@ -62,7 +62,7 @@ " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt)\n", + " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n", diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb index 6c2bfc6..37f7c50 100644 --- a/ch03/01_main-chapter-code/multihead-attention.ipynb +++ b/ch03/01_main-chapter-code/multihead-attention.ipynb @@ -46,7 +46,7 @@ " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", - " token_ids = tokenizer.encode(txt)\n", + " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n",