Make datasets and loaders compatible with multiprocessing (#118)

2026-04-10 12:33:42 +00:00 · 2024-04-13 14:57:56 -04:00
parent 9f3f231ac7
commit dd51d4ad83
17 changed files with 140 additions and 116 deletions
--- a/ch05/05_bonus_hparam_tuning/hparam_search.py
+++ b/ch05/05_bonus_hparam_tuning/hparam_search.py
@@ -6,6 +6,7 @@
 import itertools
 import math
 import os
+import tiktoken
 import torch
 from previous_chapters import GPTModel, create_dataloader_v1

@@ -58,7 +59,7 @@ def evaluate_model(model, train_loader, val_loader, device, eval_iter):

 def train_model(model, train_loader, val_loader, optimizer, device,
                n_epochs, eval_freq, eval_iter,
-                encoded_start_context, warmup_iters=10,
+                encoded_start_context, tokenizer, warmup_iters=10,
                initial_lr=3e-05, min_lr=1e-6):
    global_step = 0

@@ -120,6 +121,7 @@ if __name__ == "__main__":
    with open(os.path.join(script_dir, "the-verdict.txt"), "r", encoding="utf-8") as file:
        text_data = file.read()

+    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_ratio = 0.95
@@ -155,7 +157,8 @@ if __name__ == "__main__":
                max_length=GPT_CONFIG_124M["context_length"],
                stride=GPT_CONFIG_124M["context_length"],
                drop_last=True,
-                shuffle=True
+                shuffle=True,
+                num_workers=0
            )

            val_loader = create_dataloader_v1(
@@ -164,7 +167,8 @@ if __name__ == "__main__":
                max_length=GPT_CONFIG_124M["context_length"],
                stride=GPT_CONFIG_124M["context_length"],
                drop_last=False,
-                shuffle=False
+                shuffle=False,
+                num_workers=0
            )

            model = GPTModel(GPT_CONFIG_124M)
@@ -176,7 +180,7 @@ if __name__ == "__main__":
                weight_decay=HPARAM_CONFIG["weight_decay"]
            )

-            encoded_start_context = train_loader.dataset.tokenizer.encode("Nevertheless")
+            encoded_start_context = tokenizer.encode("Nevertheless")
            encoded_tensor = torch.tensor(encoded_start_context).unsqueeze(0)

            train_loss, val_loss = train_model(
@@ -184,6 +188,7 @@ if __name__ == "__main__":
                n_epochs=HPARAM_CONFIG["n_epochs"],
                eval_freq=5, eval_iter=1,
                encoded_start_context=encoded_tensor,
+                tokenizer=tokenizer,
                warmup_iters=HPARAM_CONFIG["warmup_iters"],
                initial_lr=HPARAM_CONFIG["initial_lr"],
                min_lr=HPARAM_CONFIG["min_lr"]
--- a/ch05/05_bonus_hparam_tuning/previous_chapters.py
+++ b/ch05/05_bonus_hparam_tuning/previous_chapters.py
@@ -19,12 +19,11 @@ from torch.utils.data import Dataset, DataLoader

 class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
-        token_ids = self.tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
@@ -46,11 +45,11 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256,
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
-    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
+    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride, num_workers=0)

    # Create dataloader
    dataloader = DataLoader(
-        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
+        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=0)

    return dataloader