mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Use argparse utils to show default args on the command line
This commit is contained in:
@@ -290,7 +290,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run GPT with grouped-query attention.")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with grouped-query attention.")
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.")
|
||||
|
||||
@@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.")
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.")
|
||||
|
||||
@@ -31,7 +31,7 @@ def calc_kv_bytes_total(batch, context_length, emb_dim, n_heads,
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA vs GQA")
|
||||
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA vs GQA")
|
||||
p.add_argument("--context_length", default=1024, type=int)
|
||||
p.add_argument("--emb_dim", required=True, type=int)
|
||||
p.add_argument("--n_heads", required=True, type=int)
|
||||
|
||||
@@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.")
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.")
|
||||
|
||||
@@ -286,13 +286,13 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.")
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.")
|
||||
parser.add_argument("--max_new_tokens", type=int, default=200, help="Number of tokens to generate.")
|
||||
parser.add_argument("--latent_dim", type=int, default=None,
|
||||
help="Latent dim for MLA (default: d_out//8)")
|
||||
help="Latent dim for MLA")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ def calc_mla_bytes_total(batch, context_length, n_layers, latent_dim, bytes_per_
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA vs GQA vs MLA")
|
||||
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA vs GQA vs MLA")
|
||||
p.add_argument("--context_length", default=1024, type=int)
|
||||
p.add_argument("--emb_dim", required=True, type=int)
|
||||
p.add_argument("--n_heads", required=True, type=int)
|
||||
|
||||
@@ -278,7 +278,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.")
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.")
|
||||
|
||||
@@ -311,7 +311,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run GPT with standard multi-head attention.")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Run GPT with standard multi-head attention.")
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
parser.add_argument("--n_layers", type=int, default=12, help="Number of transformer blocks.")
|
||||
|
||||
@@ -90,7 +90,7 @@ def estimate_totals(context_length, sliding_window_size, emb_dim, n_heads, n_lay
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser(description="Estimate KV-cache memory for MHA/GQA with SWA layer ratio")
|
||||
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Estimate KV-cache memory for MHA/GQA with SWA layer ratio")
|
||||
p.add_argument("--context_length", default=1024, type=int)
|
||||
p.add_argument("--sliding_window_size", required=True, type=int,
|
||||
help="SWA window size W per SWA layer.")
|
||||
|
||||
@@ -102,7 +102,7 @@ def calc_kv_bytes_total_gqa_swa(
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser(
|
||||
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description="KV-cache vs Context Length — MHA vs GQA with SWA overlays"
|
||||
)
|
||||
p.add_argument("--emb_dim", type=int, required=True)
|
||||
|
||||
@@ -341,7 +341,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--hidden_dim", type=int, default=768*4, help="Intermediate FFN size.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
|
||||
@@ -401,7 +401,7 @@ def generate_text_simple_cached(model, idx, max_new_tokens,
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument("--emb_dim", type=int, default=768, help="Model embedding dimension.")
|
||||
parser.add_argument("--hidden_dim", type=int, default=768*4, help="Intermediate FFN or MoE size.")
|
||||
parser.add_argument("--n_heads", type=int, default=12, help="Number of attention heads.")
|
||||
|
||||
@@ -65,7 +65,7 @@ def estimate_params_and_hidden(
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser(
|
||||
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description="Estimate FFN vs MoE parameter memory"
|
||||
)
|
||||
p.add_argument("--emb_dim", type=int, required=True,
|
||||
|
||||
@@ -91,7 +91,7 @@ def plot_active_params_vs_experts(
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser(description="Plot Dense vs MoE active parameters.")
|
||||
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Plot Dense vs MoE active parameters.")
|
||||
p.add_argument("--emb_dim", type=int, required=True, help="Embedding dimension")
|
||||
p.add_argument("--hidden_dim", type=int, required=True, help="Dense FFN hidden size")
|
||||
p.add_argument("--ffn_type", choices=["gelu", "swiglu"], default="swiglu")
|
||||
|
||||
@@ -36,7 +36,7 @@ def convert_to_gb(x):
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser(description="Memory vs. Context Length: MHA vs. DeltaNet (3:1 mix)")
|
||||
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Memory vs. Context Length: MHA vs. DeltaNet (3:1 mix)")
|
||||
p.add_argument("--batch", type=int, default=1)
|
||||
p.add_argument("--emb_dim", type=int, default=2048)
|
||||
p.add_argument("--n_heads", type=int, default=16)
|
||||
|
||||
@@ -253,16 +253,16 @@ def main(gpt_config, input_prompt, model_size, device):
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description="Generate text with a pretrained GPT-2 model.")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Generate text with a pretrained GPT-2 model.")
|
||||
parser.add_argument(
|
||||
"--prompt",
|
||||
default="Every effort moves you",
|
||||
help="Prompt text used to seed the generation (default matches the script's built-in prompt)."
|
||||
help="Prompt text used to seed the generation."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
default="cpu",
|
||||
help="Device for running inference, e.g., cpu, cuda, mps, or auto. Defaults to cpu."
|
||||
help="Device for running inference, e.g., cpu, cuda, mps, or auto."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -66,7 +66,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Preprocess and combine text files for pretraining")
|
||||
|
||||
parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",
|
||||
help="Directory containing the downloaded raw training data")
|
||||
|
||||
@@ -148,7 +148,7 @@ def train_model_simple(model, optimizer, device, n_epochs,
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description="GPT Model Training Configuration")
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="GPT Model Training Configuration")
|
||||
|
||||
parser.add_argument("--data_dir", type=str, default="gutenberg/data",
|
||||
help="Directory containing the training data")
|
||||
|
||||
@@ -239,7 +239,7 @@ if __name__ == "__main__":
|
||||
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description="Finetune a GPT model for classification"
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
@@ -410,7 +410,7 @@ def replace_linear_with_lora(model, rank, alpha, alternative=False):
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument(
|
||||
"--model_size",
|
||||
type=str,
|
||||
|
||||
@@ -175,7 +175,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument(
|
||||
"--trainable_layers",
|
||||
type=str,
|
||||
|
||||
@@ -272,7 +272,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument(
|
||||
"--trainable_layers",
|
||||
type=str,
|
||||
|
||||
@@ -225,7 +225,7 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument(
|
||||
"--model_size",
|
||||
type=str,
|
||||
|
||||
@@ -534,7 +534,7 @@ if __name__ == "__main__":
|
||||
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description="Instruction finetune a GPT model"
|
||||
)
|
||||
options = {"baseline", "mask_instructions", "alpaca_52k", "phi3_prompt", "lora"}
|
||||
|
||||
@@ -333,7 +333,7 @@ if __name__ == "__main__":
|
||||
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description="Finetune a GPT model for classification"
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
@@ -103,7 +103,7 @@ if __name__ == "__main__":
|
||||
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
description="Evaluate model responses with ollama"
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
@@ -100,7 +100,7 @@ def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, th
|
||||
if __name__ == "__main__":
|
||||
print("scikit-learn version:", sklearn_version)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
parser.add_argument(
|
||||
"--json_file",
|
||||
type=str,
|
||||
|
||||
Reference in New Issue
Block a user