<|endoftext|> token in dataset v1

This commit is contained in:
rasbt
2024-01-21 12:03:04 -06:00
parent b1923a3075
commit 8860e16e05
3 changed files with 3 additions and 3 deletions

View File

@@ -46,7 +46,7 @@
" self.target_ids = []\n",
"\n",
" # Tokenize the entire text\n",
-" token_ids = tokenizer.encode(txt)\n",
+" token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
"\n",
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
" for i in range(0, len(token_ids) - max_length, stride):\n",