Use instance tokenizer (#116)

* Use instance tokenizer

* consistency updates

---------

Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>
This commit is contained in:
James Holcombe
2024-04-10 21:16:19 -04:00
committed by GitHub
parent 94f6582cff
commit 0b866c133f
11 changed files with 14 additions and 14 deletions

View File

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):