Remove leftover instances of self.tokenizer (#201)

* Remove leftover instances of self.tokenizer

* add endoftext token
Sebastian Raschka
2024-06-08 14:57:34 -05:00
committed by GitHub
parent 98d23751f7
commit 40ba3a4068
13 changed files with 18 additions and 23 deletions
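
A note on the second commit bullet: tiktoken's encode() treats special tokens found in the raw input text as disallowed by default and raises an error, so passing allowed_special={"<|endoftext|>"} is needed when the training text contains the <|endoftext|> marker. Below is a minimal sketch of that behavior; it assumes the tokenizer passed to GPTDatasetV1 is tiktoken's "gpt2" encoding, which is an assumption not shown in this diff.

import tiktoken

# Assumption: the tokenizer handed to GPTDatasetV1 is tiktoken's GPT-2 BPE encoding.
tokenizer = tiktoken.get_encoding("gpt2")

text = "Hello, world. <|endoftext|> In the sunlit terraces of the palace."

# By default, encode() flags special tokens in the raw text as disallowed
# and raises an error, so the call below would fail:
# tokenizer.encode(text)

# Explicitly allowing <|endoftext|> encodes it as its reserved id (50256 for GPT-2):
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(token_ids)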


@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):


@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):


@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
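
Taken together, the hunks leave the dataset's __init__ looking roughly like the sketch below. The body of the sliding-window loop (chunking into input/target tensors) is not part of this diff and is filled in here only as an illustration of the usual pattern.

import torch
from torch.utils.data import Dataset


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            # Illustrative loop body; these lines are not shown in the diff above.
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))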