mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Optional weight tying for Qwen3 and Llama3.2 pretraining (#949)
* optional weight tying for Qwen3 and Llama3.2 * typo
This commit is contained in:
@@ -65,7 +65,7 @@ class Llama3Model(nn.Module):
|
||||
self.final_norm = nn.RMSNorm(cfg["emb_dim"], eps=1e-5, dtype=cfg["dtype"])
|
||||
self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False, dtype=cfg["dtype"])
|
||||
|
||||
# Reusuable utilities
|
||||
# Reusable utilities
|
||||
cos, sin = compute_rope_params(
|
||||
head_dim=cfg["emb_dim"] // cfg["n_heads"],
|
||||
theta_base=cfg["rope_base"],
|
||||
|
||||
Reference in New Issue
Block a user