add allowed_special={"<|endoftext|>"}

This commit is contained in:
rasbt
2024-06-09 06:04:02 -05:00
parent 40ba3a4068
commit e1adeb14f3
2 changed files with 3 additions and 3 deletions

View File

@@ -18,7 +18,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):