diff --git a/ch04/04_gqa/gpt_with_kv_gqa.py b/ch04/04_gqa/gpt_with_kv_gqa.py index 6a38a62..c0de3cf 100644 --- a/ch04/04_gqa/gpt_with_kv_gqa.py +++ b/ch04/04_gqa/gpt_with_kv_gqa.py @@ -290,7 +290,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with grouped-query attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with grouped-query attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/04_gqa/gpt_with_kv_mha.py b/ch04/04_gqa/gpt_with_kv_mha.py index f906d71..c3c5817 100644 --- a/ch04/04_gqa/gpt_with_kv_mha.py +++ b/ch04/04_gqa/gpt_with_kv_mha.py @@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/04_gqa/memory_estimator_gqa.py b/ch04/04_gqa/memory_estimator_gqa.py index 380b4a2..c3cc592 100644 --- a/ch04/04_gqa/memory_estimator_gqa.py +++ b/ch04/04_gqa/memory_estimator_gqa.py @@ -31,7 +31,7 @@ def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads, def main(): - p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA vs GQA") + p = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA vs GQA") p.add_argument("--context_length", default=1024, type=int) p.add_argument("--emb_dim", required=True, type=int) p.add_argument("--n_heads", required=True, type=int) diff --git a/ch04/05_mla/gpt_with_kv_mha.py b/ch04/05_mla/gpt_with_kv_mha.py index 92e06f1..5c5abf8 100644 --- a/ch04/05_mla/gpt_with_kv_mha.py +++ b/ch04/05_mla/gpt_with_kv_mha.py @@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/05_mla/gpt_with_kv_mla.py b/ch04/05_mla/gpt_with_kv_mla.py index 6e9c388..d162308 100644 --- a/ch04/05_mla/gpt_with_kv_mla.py +++ b/ch04/05_mla/gpt_with_kv_mla.py @@ -286,13 +286,13 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with multi-head latent attention (MLA).") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") parser.add_argument("--max_new_tokens", type=int, default=200, help="Number of tokens to generate.")
parser.add_argument("--latent_dim", type=int, default=None, - help="Latent dim for MLA (default: d_out//8)") + help="Latent dim for MLA; if unset, d_out//8 is used") args = parser.parse_args() diff --git a/ch04/05_mla/memory_estimator_mla.py b/ch04/05_mla/memory_estimator_mla.py index bf903c8..61ba99b 100644 --- a/ch04/05_mla/memory_estimator_mla.py +++ b/ch04/05_mla/memory_estimator_mla.py @@ -37,7 +37,7 @@ def calc_mla_bytes_total(batch, context_length, n_layers, latent_dim, bytes_per_ def main(): - p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA vs GQA vs MLA") + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA vs GQA vs MLA") p.add_argument("--context_length", default=1024, type=int) p.add_argument("--emb_dim", required=True, type=int) p.add_argument("--n_heads", required=True, type=int) diff --git a/ch04/06_swa/gpt_with_kv_mha.py b/ch04/06_swa/gpt_with_kv_mha.py index f906d71..c3c5817 100644 --- a/ch04/06_swa/gpt_with_kv_mha.py +++ b/ch04/06_swa/gpt_with_kv_mha.py @@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/06_swa/gpt_with_kv_swa.py b/ch04/06_swa/gpt_with_kv_swa.py index bd4cda7..a9cce3a 100644 --- a/ch04/06_swa/gpt_with_kv_swa.py +++ b/ch04/06_swa/gpt_with_kv_swa.py @@ -311,7 +311,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run 
GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with sliding-window attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/06_swa/memory_estimator_swa.py b/ch04/06_swa/memory_estimator_swa.py index 2401433..20b9134 100644 --- a/ch04/06_swa/memory_estimator_swa.py +++ b/ch04/06_swa/memory_estimator_swa.py @@ -90,7 +90,7 @@ def estimate_totals(context_length, sliding_window_size, emb_dim, n_heads, n_lay def main(): - p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA/GQA with SWA layer ratio") + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA/GQA with SWA layer ratio") p.add_argument("--context_length", default=1024, type=int) p.add_argument("--sliding_window_size", required=True, type=int, help="SWA window size W per SWA layer.") diff --git a/ch04/06_swa/plot_memory_estimates_swa.py b/ch04/06_swa/plot_memory_estimates_swa.py index 9636bf6..ed66c36 100644 --- a/ch04/06_swa/plot_memory_estimates_swa.py +++ b/ch04/06_swa/plot_memory_estimates_swa.py @@ -102,7 +102,7 @@ def calc_kv_bytes_total_gqa_swa( def main(): - p = argparse.ArgumentParser( + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="KV-cache vs Context Length — MHA vs GQA with SWA overlays" ) p.add_argument("--emb_dim", type=int, required=True) diff --git a/ch04/07_moe/gpt_with_kv_ffn.py b/ch04/07_moe/gpt_with_kv_ffn.py index a6035e0..1e646ef 100644 --- a/ch04/07_moe/gpt_with_kv_ffn.py +++ b/ch04/07_moe/gpt_with_kv_ffn.py @@ -341,7 +341,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, 
def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--hidden_dim", type=int, default=768*4, help="Intermediate FFN size.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") diff --git a/ch04/07_moe/gpt_with_kv_moe.py b/ch04/07_moe/gpt_with_kv_moe.py index 498c987..9c473ce 100644 --- a/ch04/07_moe/gpt_with_kv_moe.py +++ b/ch04/07_moe/gpt_with_kv_moe.py @@ -401,7 +401,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--hidden_dim", type=int, default=768*4, help="Intermediate FFN or MoE size.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") diff --git a/ch04/07_moe/memory_estimator_moe.py b/ch04/07_moe/memory_estimator_moe.py index 7c3bb2b..a5522a5 100644 --- a/ch04/07_moe/memory_estimator_moe.py +++ b/ch04/07_moe/memory_estimator_moe.py @@ -65,7 +65,7 @@ def estimate_params_and_hidden( def main(): - p = argparse.ArgumentParser( + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate FFN vs MoE parameter memory" ) p.add_argument("--emb_dim", type=int, required=True, diff --git a/ch04/07_moe/plot_memory_estimates_moe.py b/ch04/07_moe/plot_memory_estimates_moe.py index 10346bb..4c8383d 100644 --- a/ch04/07_moe/plot_memory_estimates_moe.py +++ b/ch04/07_moe/plot_memory_estimates_moe.py @@ -91,7 +91,7 @@ def plot_active_params_vs_experts( def main(): - p = argparse.ArgumentParser(description="Plot Dense vs MoE active parameters.") + p = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Plot Dense vs MoE active parameters.") p.add_argument("--emb_dim", type=int, required=True, help="Embedding dimension") p.add_argument("--hidden_dim", type=int, required=True, help="Dense FFN hidden size") p.add_argument("--ffn_type", choices=["gelu", "swiglu"], default="swiglu") diff --git a/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py b/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py index 86d7a1e..8e340de 100644 --- a/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py +++ b/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py @@ -36,7 +36,7 @@ def convert_to_gb(x): def main(): - p = argparse.ArgumentParser(description="Memory vs. Context Length: MHA vs. DeltaNet (3:1 mix)") + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Memory vs. Context Length: MHA vs. DeltaNet (3:1 mix)") p.add_argument("--batch", type=int, default=1) p.add_argument("--emb_dim", type=int, default=2048) p.add_argument("--n_heads", type=int, default=16) diff --git a/ch05/01_main-chapter-code/gpt_generate.py b/ch05/01_main-chapter-code/gpt_generate.py index 3fdfd51..af380a7 100644 --- a/ch05/01_main-chapter-code/gpt_generate.py +++ b/ch05/01_main-chapter-code/gpt_generate.py @@ -253,16 +253,16 @@ def main(gpt_config, input_prompt, model_size, device): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate text with a pretrained GPT-2 model.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Generate text with a pretrained GPT-2 model.") parser.add_argument( "--prompt", default="Every effort moves you", - help="Prompt text used to seed the generation (default matches the script's built-in prompt)." + help="Prompt text used to seed the generation." 
) parser.add_argument( "--device", default="cpu", - help="Device for running inference, e.g., cpu, cuda, mps, or auto. Defaults to cpu." + help="Device for running inference, e.g., cpu, cuda, mps, or auto." ) args = parser.parse_args() diff --git a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py index 0d17d65..5920360 100644 --- a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py +++ b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py @@ -66,7 +66,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Preprocess and combine text files for pretraining") parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw", help="Directory containing the downloaded raw training data") diff --git a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py index 5bb7728..287090b 100644 --- a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py +++ b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py @@ -148,7 +148,7 @@ def train_model_simple(model, optimizer, device, n_epochs, if __name__ == "__main__": - parser = argparse.ArgumentParser(description="GPT Model Training Configuration") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="GPT Model Training Configuration") parser.add_argument("--data_dir", type=str, default="gutenberg/data", help="Directory containing the training data") diff --git a/ch06/01_main-chapter-code/gpt_class_finetune.py b/ch06/01_main-chapter-code/gpt_class_finetune.py index 523b85d..47fd22d 100644 --- a/ch06/01_main-chapter-code/gpt_class_finetune.py +++ 
b/ch06/01_main-chapter-code/gpt_class_finetune.py @@ -239,7 +239,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Finetune a GPT model for classification" ) parser.add_argument( diff --git a/ch06/02_bonus_additional-experiments/additional_experiments.py b/ch06/02_bonus_additional-experiments/additional_experiments.py index a600234..006bf13 100644 --- a/ch06/02_bonus_additional-experiments/additional_experiments.py +++ b/ch06/02_bonus_additional-experiments/additional_experiments.py @@ -410,7 +410,7 @@ def replace_linear_with_lora(model, rank, alpha, alternative=False): if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--model_size", type=str, diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf.py b/ch06/03_bonus_imdb-classification/train_bert_hf.py index 037cfb7..e83d174 100644 --- a/ch06/03_bonus_imdb-classification/train_bert_hf.py +++ b/ch06/03_bonus_imdb-classification/train_bert_hf.py @@ -175,7 +175,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--trainable_layers", type=str, diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py index 89bbfd9..ed7ab0e 100644 --- a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py +++ b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py @@ -272,7 +272,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--trainable_layers", type=str, diff --git a/ch06/03_bonus_imdb-classification/train_gpt.py b/ch06/03_bonus_imdb-classification/train_gpt.py index 4eaca10..8d6b3f2 100644 --- a/ch06/03_bonus_imdb-classification/train_gpt.py +++ b/ch06/03_bonus_imdb-classification/train_gpt.py @@ -225,7 +225,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--model_size", type=str, diff --git a/ch07/01_main-chapter-code/exercise_experiments.py b/ch07/01_main-chapter-code/exercise_experiments.py index 773e1e2..a014a02 100644 --- a/ch07/01_main-chapter-code/exercise_experiments.py +++ b/ch07/01_main-chapter-code/exercise_experiments.py @@ -534,7 +534,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Instruction finetune a GPT model" ) options = {"baseline", "mask_instructions", "alpaca_52k", "phi3_prompt", "lora"} diff --git a/ch07/01_main-chapter-code/gpt_instruction_finetuning.py b/ch07/01_main-chapter-code/gpt_instruction_finetuning.py index 248bf6a..79cb1f3 100644 --- a/ch07/01_main-chapter-code/gpt_instruction_finetuning.py +++ b/ch07/01_main-chapter-code/gpt_instruction_finetuning.py @@ -333,7 +333,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Finetune a GPT model for classification" ) parser.add_argument( diff --git a/ch07/01_main-chapter-code/ollama_evaluate.py b/ch07/01_main-chapter-code/ollama_evaluate.py index a75f592..84c4a76 100644 --- a/ch07/01_main-chapter-code/ollama_evaluate.py +++ 
b/ch07/01_main-chapter-code/ollama_evaluate.py @@ -103,7 +103,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Evaluate model responses with ollama" ) parser.add_argument( diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py index 6b62c26..fdf3bce 100644 --- a/ch07/02_dataset-utilities/find-near-duplicates.py +++ b/ch07/02_dataset-utilities/find-near-duplicates.py @@ -100,7 +100,7 @@ def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, th if __name__ == "__main__": print("scikit-learn version:", sklearn_version) - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--json_file", type=str,