mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Remove leftover instances of self.tokenizer (#201)
* Remove leftover instances of self.tokenizer * add endoftext token
This commit is contained in:
committed by
GitHub
parent
98d23751f7
commit
40ba3a4068
@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
class GPTDatasetV1(Dataset):
|
||||
def __init__(self, txt, tokenizer, max_length, stride):
|
||||
self.tokenizer = tokenizer
|
||||
self.input_ids = []
|
||||
self.target_ids = []
|
||||
|
||||
# Tokenize the entire text
|
||||
token_ids = tokenizer.encode(txt)
|
||||
token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
|
||||
|
||||
# Use a sliding window to chunk the book into overlapping sequences of max_length
|
||||
for i in range(0, len(token_ids) - max_length, stride):
|
||||
|
||||
@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
class GPTDatasetV1(Dataset):
|
||||
def __init__(self, txt, tokenizer, max_length, stride):
|
||||
self.tokenizer = tokenizer
|
||||
self.input_ids = []
|
||||
self.target_ids = []
|
||||
|
||||
# Tokenize the entire text
|
||||
token_ids = tokenizer.encode(txt)
|
||||
token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
|
||||
|
||||
# Use a sliding window to chunk the book into overlapping sequences of max_length
|
||||
for i in range(0, len(token_ids) - max_length, stride):
|
||||
|
||||
@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
|
||||
self.target_ids = []
|
||||
|
||||
# Tokenize the entire text
|
||||
token_ids = tokenizer.encode(txt)
|
||||
token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
|
||||
|
||||
# Use a sliding window to chunk the book into overlapping sequences of max_length
|
||||
for i in range(0, len(token_ids) - max_length, stride):
|
||||
|
||||
Reference in New Issue
Block a user