diff --git a/README.md b/README.md
index 3d0e529..b0803ec 100644
--- a/README.md
+++ b/README.md
@@ -183,7 +183,7 @@ Several folders contain optional materials as a bonus for interested readers:
   - [PyTorch Performance Tips for Faster LLM Training](ch05/10_llm-training-speed)
 - **Chapter 6: Finetuning for classification**
   - [Additional experiments finetuning different layers and using larger models](ch06/02_bonus_additional-experiments)
-  - [Finetuning different models on 50k IMDB movie review dataset](ch06/03_bonus_imdb-classification)
+  - [Finetuning different models on 50k IMDb movie review dataset](ch06/03_bonus_imdb-classification)
   - [Building a User Interface to Interact With the GPT-based Spam Classifier](ch06/04_user_interface)
 - **Chapter 7: Finetuning to follow instructions**
   - [Dataset Utilities for Finding Near Duplicates and Creating Passive Voice Entries](ch07/02_dataset-utilities)
diff --git a/ch06/03_bonus_imdb-classification/README.md b/ch06/03_bonus_imdb-classification/README.md
index 08bd939..6cfdcd5 100644
--- a/ch06/03_bonus_imdb-classification/README.md
+++ b/ch06/03_bonus_imdb-classification/README.md
@@ -1,4 +1,4 @@
-# Additional Experiments Classifying the Sentiment of 50k IMDB Movie Reviews
+# Additional Experiments Classifying the Sentiment of 50k IMDb Movie Reviews
 
 ## Overview
 
diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf.py b/ch06/03_bonus_imdb-classification/train_bert_hf.py
index df359c0..037cfb7 100644
--- a/ch06/03_bonus_imdb-classification/train_bert_hf.py
+++ b/ch06/03_bonus_imdb-classification/train_bert_hf.py
@@ -15,7 +15,7 @@ from torch.utils.data import Dataset
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 
-class IMDBDataset(Dataset):
+class IMDbDataset(Dataset):
     def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256, use_attention_mask=False):
         self.data = pd.read_csv(csv_file)
         self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer)
@@ -375,21 +375,21 @@ if __name__ == "__main__":
     else:
         raise ValueError("Invalid argument for `use_attention_mask`.")
 
-    train_dataset = IMDBDataset(
+    train_dataset = IMDbDataset(
         base_path / "train.csv",
         max_length=256,
         tokenizer=tokenizer,
         pad_token_id=tokenizer.pad_token_id,
         use_attention_mask=use_attention_mask
     )
-    val_dataset = IMDBDataset(
+    val_dataset = IMDbDataset(
         base_path / "validation.csv",
         max_length=256,
         tokenizer=tokenizer,
         pad_token_id=tokenizer.pad_token_id,
         use_attention_mask=use_attention_mask
     )
-    test_dataset = IMDBDataset(
+    test_dataset = IMDbDataset(
         base_path / "test.csv",
         max_length=256,
         tokenizer=tokenizer,
diff --git a/ch06/03_bonus_imdb-classification/train_gpt.py b/ch06/03_bonus_imdb-classification/train_gpt.py
index 99091d3..b650634 100644
--- a/ch06/03_bonus_imdb-classification/train_gpt.py
+++ b/ch06/03_bonus_imdb-classification/train_gpt.py
@@ -17,7 +17,7 @@ from gpt_download import download_and_load_gpt2
 from previous_chapters import GPTModel, load_weights_into_gpt
 
 
-class IMDBDataset(Dataset):
+class IMDbDataset(Dataset):
     def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
         self.data = pd.read_csv(csv_file)
         self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer)
@@ -368,7 +368,7 @@ if __name__ == "__main__":
     if args.context_length == "model_context_length":
         max_length = model.pos_emb.weight.shape[0]
     elif args.context_length == "longest_training_example":
-        train_dataset = IMDBDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer)
+        train_dataset = IMDbDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer)
         max_length = train_dataset.max_length
     else:
         try:
@@ -377,9 +377,9 @@ if __name__ == "__main__":
             raise ValueError("Invalid --context_length argument")
 
     if train_dataset is None:
-        train_dataset = IMDBDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
-    val_dataset = IMDBDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer)
-    test_dataset = IMDBDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer)
+        train_dataset = IMDbDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
+    val_dataset = IMDbDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer)
+    test_dataset = IMDbDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer)
 
     num_workers = 0
     batch_size = 8
diff --git a/ch06/README.md b/ch06/README.md
index 65be918..26452f3 100644
--- a/ch06/README.md
+++ b/ch06/README.md
@@ -9,7 +9,7 @@
 
 ## Bonus Materials
 
 - [02_bonus_additional-experiments](02_bonus_additional-experiments) includes additional experiments (e.g., training the last vs first token, extending the input length, etc.)
-- [03_bonus_imdb-classification](03_bonus_imdb-classification) compares the LLM from chapter 6 with other models on a 50k IMDB movie review sentiment classification dataset
+- [03_bonus_imdb-classification](03_bonus_imdb-classification) compares the LLM from chapter 6 with other models on a 50k IMDb movie review sentiment classification dataset
 - [04_user_interface](04_user_interface) implements an interactive user interface to interact with the pretrained LLM
 