diff --git a/ch04/04_gqa/gpt_with_kv_gqa.py b/ch04/04_gqa/gpt_with_kv_gqa.py index 6a38a62..c0de3cf 100644 --- a/ch04/04_gqa/gpt_with_kv_gqa.py +++ b/ch04/04_gqa/gpt_with_kv_gqa.py @@ -290,7 +290,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with grouped-query attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with grouped-query attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/04_gqa/gpt_with_kv_mha.py b/ch04/04_gqa/gpt_with_kv_mha.py index f906d71..c3c5817 100644 --- a/ch04/04_gqa/gpt_with_kv_mha.py +++ b/ch04/04_gqa/gpt_with_kv_mha.py @@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/04_gqa/memory_estimator_gqa.py b/ch04/04_gqa/memory_estimator_gqa.py index 380b4a2..c3cc592 100644 --- a/ch04/04_gqa/memory_estimator_gqa.py +++ b/ch04/04_gqa/memory_estimator_gqa.py @@ -31,7 +31,7 @@ def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads, def main(): - p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA vs GQA") + p = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA vs GQA") p.add_argument("--context_length", default=1024, type=int) p.add_argument("--emb_dim", required=True, type=int) p.add_argument("--n_heads", required=True, type=int) diff --git a/ch04/05_mla/gpt_with_kv_mha.py b/ch04/05_mla/gpt_with_kv_mha.py index 92e06f1..5c5abf8 100644 --- a/ch04/05_mla/gpt_with_kv_mha.py +++ b/ch04/05_mla/gpt_with_kv_mha.py @@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/05_mla/gpt_with_kv_mla.py b/ch04/05_mla/gpt_with_kv_mla.py index 6e9c388..d162308 100644 --- a/ch04/05_mla/gpt_with_kv_mla.py +++ b/ch04/05_mla/gpt_with_kv_mla.py @@ -286,13 +286,13 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with multi-head latent attention (MLA).") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") parser.add_argument("--max_new_tokens", type=int, default=200, help="Number of tokens to generate.")
parser.add_argument("--latent_dim", type=int, default=None, - help="Latent dim for MLA (default: d_out//8)") + help="Latent dim for MLA; if unset, d_out//8 is used") args = parser.parse_args() diff --git a/ch04/05_mla/memory_estimator_mla.py b/ch04/05_mla/memory_estimator_mla.py index bf903c8..61ba99b 100644 --- a/ch04/05_mla/memory_estimator_mla.py +++ b/ch04/05_mla/memory_estimator_mla.py @@ -37,7 +37,7 @@ def calc_mla_bytes_total(batch, context_length, n_layers, latent_dim, bytes_per_ def main(): - p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA vs GQA vs MLA") + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA vs GQA vs MLA") p.add_argument("--context_length", default=1024, type=int) p.add_argument("--emb_dim", required=True, type=int) p.add_argument("--n_heads", required=True, type=int) diff --git a/ch04/06_swa/gpt_with_kv_mha.py b/ch04/06_swa/gpt_with_kv_mha.py index f906d71..c3c5817 100644 --- a/ch04/06_swa/gpt_with_kv_mha.py +++ b/ch04/06_swa/gpt_with_kv_mha.py @@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/06_swa/gpt_with_kv_swa.py b/ch04/06_swa/gpt_with_kv_swa.py index bd4cda7..a9cce3a 100644 --- a/ch04/06_swa/gpt_with_kv_swa.py +++ b/ch04/06_swa/gpt_with_kv_swa.py @@ -311,7 +311,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser(description="Run 
GPT with standard multi-head attention.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with sliding-window attention.") parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.") diff --git a/ch04/06_swa/memory_estimator_swa.py b/ch04/06_swa/memory_estimator_swa.py index 2401433..20b9134 100644 --- a/ch04/06_swa/memory_estimator_swa.py +++ b/ch04/06_swa/memory_estimator_swa.py @@ -90,7 +90,7 @@ def estimate_totals(context_length, sliding_window_size, emb_dim, n_heads, n_lay def main(): - p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA/GQA with SWA layer ratio") + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA/GQA with SWA layer ratio") p.add_argument("--context_length", default=1024, type=int) p.add_argument("--sliding_window_size", required=True, type=int, help="SWA window size W per SWA layer.") diff --git a/ch04/06_swa/plot_memory_estimates_swa.py b/ch04/06_swa/plot_memory_estimates_swa.py index 9636bf6..ed66c36 100644 --- a/ch04/06_swa/plot_memory_estimates_swa.py +++ b/ch04/06_swa/plot_memory_estimates_swa.py @@ -102,7 +102,7 @@ def calc_kv_bytes_total_gqa_swa( def main(): - p = argparse.ArgumentParser( + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="KV-cache vs Context Length — MHA vs GQA with SWA overlays" ) p.add_argument("--emb_dim", type=int, required=True) diff --git a/ch04/07_moe/gpt_with_kv_ffn.py b/ch04/07_moe/gpt_with_kv_ffn.py index a6035e0..1e646ef 100644 --- a/ch04/07_moe/gpt_with_kv_ffn.py +++ b/ch04/07_moe/gpt_with_kv_ffn.py @@ -341,7 +341,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, 
def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--hidden_dim", type=int, default=768*4, help="Intermediate FFN size.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") diff --git a/ch04/07_moe/gpt_with_kv_moe.py b/ch04/07_moe/gpt_with_kv_moe.py index 498c987..9c473ce 100644 --- a/ch04/07_moe/gpt_with_kv_moe.py +++ b/ch04/07_moe/gpt_with_kv_moe.py @@ -401,7 +401,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens, def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.") parser.add_argument("--hidden_dim", type=int, default=768*4, help="Intermediate FFN or MoE size.") parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.") diff --git a/ch04/07_moe/memory_estimator_moe.py b/ch04/07_moe/memory_estimator_moe.py index 7c3bb2b..a5522a5 100644 --- a/ch04/07_moe/memory_estimator_moe.py +++ b/ch04/07_moe/memory_estimator_moe.py @@ -65,7 +65,7 @@ def estimate_params_and_hidden( def main(): - p = argparse.ArgumentParser( + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate FFN vs MoE parameter memory" ) p.add_argument("--emb_dim", type=int, required=True, diff --git a/ch04/07_moe/plot_memory_estimates_moe.py b/ch04/07_moe/plot_memory_estimates_moe.py index 10346bb..4c8383d 100644 --- a/ch04/07_moe/plot_memory_estimates_moe.py +++ b/ch04/07_moe/plot_memory_estimates_moe.py @@ -91,7 +91,7 @@ def plot_active_params_vs_experts( def main(): - p = argparse.ArgumentParser(description="Plot Dense vs MoE active parameters.") + p = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Plot Dense vs MoE active parameters.") p.add_argument("--emb_dim", type=int, required=True, help="Embedding dimension") p.add_argument("--hidden_dim", type=int, required=True, help="Dense FFN hidden size") p.add_argument("--ffn_type", choices=["gelu", "swiglu"], default="swiglu") diff --git a/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py b/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py index 86d7a1e..8e340de 100644 --- a/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py +++ b/ch04/08_deltanet/plot_memory_estimates_gated_deltanet.py @@ -36,7 +36,7 @@ def convert_to_gb(x): def main(): - p = argparse.ArgumentParser(description="Memory vs. Context Length: MHA vs. DeltaNet (3:1 mix)") + p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Memory vs. Context Length: MHA vs. DeltaNet (3:1 mix)") p.add_argument("--batch", type=int, default=1) p.add_argument("--emb_dim", type=int, default=2048) p.add_argument("--n_heads", type=int, default=16) diff --git a/ch05/01_main-chapter-code/gpt_generate.py b/ch05/01_main-chapter-code/gpt_generate.py index 3fdfd51..af380a7 100644 --- a/ch05/01_main-chapter-code/gpt_generate.py +++ b/ch05/01_main-chapter-code/gpt_generate.py @@ -253,16 +253,16 @@ def main(gpt_config, input_prompt, model_size, device): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate text with a pretrained GPT-2 model.") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Generate text with a pretrained GPT-2 model.") parser.add_argument( "--prompt", default="Every effort moves you", - help="Prompt text used to seed the generation (default matches the script's built-in prompt)." + help="Prompt text used to seed the generation." 
) parser.add_argument( "--device", default="cpu", - help="Device for running inference, e.g., cpu, cuda, mps, or auto. Defaults to cpu." + help="Device for running inference, e.g., cpu, cuda, mps, or auto." ) args = parser.parse_args() diff --git a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py index 0d17d65..5920360 100644 --- a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py +++ b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py @@ -66,7 +66,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Preprocess and combine text files for pretraining") parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw", help="Directory containing the downloaded raw training data") diff --git a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py index 5bb7728..287090b 100644 --- a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py +++ b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py @@ -148,7 +148,7 @@ def train_model_simple(model, optimizer, device, n_epochs, if __name__ == "__main__": - parser = argparse.ArgumentParser(description="GPT Model Training Configuration") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="GPT Model Training Configuration") parser.add_argument("--data_dir", type=str, default="gutenberg/data", help="Directory containing the training data") diff --git a/ch06/01_main-chapter-code/gpt_class_finetune.py b/ch06/01_main-chapter-code/gpt_class_finetune.py index 523b85d..47fd22d 100644 --- a/ch06/01_main-chapter-code/gpt_class_finetune.py +++ 
b/ch06/01_main-chapter-code/gpt_class_finetune.py @@ -239,7 +239,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Finetune a GPT model for classification" ) parser.add_argument( diff --git a/ch06/02_bonus_additional-experiments/additional_experiments.py b/ch06/02_bonus_additional-experiments/additional_experiments.py index a600234..006bf13 100644 --- a/ch06/02_bonus_additional-experiments/additional_experiments.py +++ b/ch06/02_bonus_additional-experiments/additional_experiments.py @@ -410,7 +410,7 @@ def replace_linear_with_lora(model, rank, alpha, alternative=False): if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--model_size", type=str, diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf.py b/ch06/03_bonus_imdb-classification/train_bert_hf.py index 037cfb7..e83d174 100644 --- a/ch06/03_bonus_imdb-classification/train_bert_hf.py +++ b/ch06/03_bonus_imdb-classification/train_bert_hf.py @@ -175,7 +175,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--trainable_layers", type=str, diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py index 89bbfd9..ed7ab0e 100644 --- a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py +++ b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py @@ -272,7 +272,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--trainable_layers", type=str, diff --git a/ch06/03_bonus_imdb-classification/train_gpt.py b/ch06/03_bonus_imdb-classification/train_gpt.py index 4eaca10..8d6b3f2 100644 --- a/ch06/03_bonus_imdb-classification/train_gpt.py +++ b/ch06/03_bonus_imdb-classification/train_gpt.py @@ -225,7 +225,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device, if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--model_size", type=str, diff --git a/ch07/01_main-chapter-code/exercise_experiments.py b/ch07/01_main-chapter-code/exercise_experiments.py index 773e1e2..a014a02 100644 --- a/ch07/01_main-chapter-code/exercise_experiments.py +++ b/ch07/01_main-chapter-code/exercise_experiments.py @@ -534,7 +534,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Instruction finetune a GPT model" ) options = {"baseline", "mask_instructions", "alpaca_52k", "phi3_prompt", "lora"} diff --git a/ch07/01_main-chapter-code/gpt_instruction_finetuning.py b/ch07/01_main-chapter-code/gpt_instruction_finetuning.py index 248bf6a..79cb1f3 100644 --- a/ch07/01_main-chapter-code/gpt_instruction_finetuning.py +++ b/ch07/01_main-chapter-code/gpt_instruction_finetuning.py @@ -333,7 +333,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Finetune a GPT model for classification" ) parser.add_argument( diff --git a/ch07/01_main-chapter-code/ollama_evaluate.py b/ch07/01_main-chapter-code/ollama_evaluate.py index a75f592..84c4a76 100644 --- a/ch07/01_main-chapter-code/ollama_evaluate.py +++ 
b/ch07/01_main-chapter-code/ollama_evaluate.py @@ -103,7 +103,7 @@ if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Evaluate model responses with ollama" ) parser.add_argument( diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py index 6b62c26..fdf3bce 100644 --- a/ch07/02_dataset-utilities/find-near-duplicates.py +++ b/ch07/02_dataset-utilities/find-near-duplicates.py @@ -100,7 +100,7 @@ def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, th if __name__ == "__main__": print("scikit-learn version:", sklearn_version) - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--json_file", type=str,