mirror of https://github.com/rasbt/LLMs-from-scratch.git
Make quote style consistent (#891)
commit 7ca7c47e4a
parent 9276edbc37
committed by GitHub
@@ -59,7 +59,7 @@ class CausalAttention(nn.Module):
         self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.dropout = nn.Dropout(dropout)  # New
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))  # New
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))  # New

     def forward(self, x):
         b, num_tokens, d_in = x.shape  # New batch dimension b
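Note: the hunk above only registers the causal mask buffer. As a minimal
standalone sketch (illustrative, not part of this commit), this is how such a
triu mask is typically applied to attention scores so that softmax assigns
zero weight to future tokens:

    import torch

    num_tokens, d_out = 4, 8
    attn_scores = torch.randn(1, num_tokens, num_tokens)  # batch x tokens x tokens
    mask = torch.triu(torch.ones(num_tokens, num_tokens), diagonal=1)

    # Fill positions above the diagonal (future tokens) with -inf ...
    attn_scores.masked_fill_(mask.bool(), float("-inf"))
    # ... so they receive zero probability after the scaled softmax
    attn_weights = torch.softmax(attn_scores / d_out**0.5, dim=-1)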
@@ -109,7 +109,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

     def forward(self, x):
         b, num_tokens, d_in = x.shape
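Aside: both classes store the mask via register_buffer rather than as a plain
attribute. A quick sketch of why (DummyModule is a hypothetical name used only
for illustration): a buffer follows .to(device) and is saved in state_dict,
but is not a trainable parameter.

    import torch
    import torch.nn as nn

    class DummyModule(nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("mask", torch.triu(torch.ones(4, 4), diagonal=1))

    m = DummyModule()
    print("mask" in m.state_dict())  # True: persisted with the model
    print(list(m.parameters()))      # []: invisible to the optimizer
    # m.to("cuda") would move the buffer along with the module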
@@ -30,7 +30,7 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
             # Keep only top_k values
             top_logits, _ = torch.topk(logits, top_k)
             min_val = top_logits[:, -1]
-            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)
+            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

         # New: Apply temperature scaling
         if temperature > 0.0:
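For context, a standalone sketch of the top-k filter plus temperature scaling
that this hunk touches (the logits values below are made up for illustration):

    import torch

    logits = torch.tensor([[4.0, 1.0, 3.0, 0.5, 2.0]])
    top_k, temperature = 3, 0.7

    top_logits, _ = torch.topk(logits, top_k)
    min_val = top_logits[:, -1]  # smallest logit that survives the cut
    logits = torch.where(
        logits < min_val,
        torch.tensor(float("-inf")).to(logits.device),  # mask everything below top-k
        logits,
    )

    # temperature > 0.0 branch: flatten/sharpen the distribution, then sample
    probs = torch.softmax(logits / temperature, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)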
@@ -125,8 +125,8 @@ def assign(left, right):


 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
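The hunk header names assign(left, right) but its body is not shown in this
diff. A plausible minimal version, assuming it only validates shapes and wraps
the loaded array as a trainable parameter (the actual implementation may
differ):

    import torch

    def assign(left, right):
        # Guard against silently loading a mismatched checkpoint tensor
        if left.shape != right.shape:
            raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
        return torch.nn.Parameter(torch.tensor(right))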
@@ -110,7 +110,7 @@ def test_dummy_qwen3_moe_forward(dummy_cfg_moe, dummy_input):
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_moe["vocab_size"]), \
         f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
-    assert any(hasattr(block.ff, 'gate') for block in model.trf_blocks), \
+    assert any(hasattr(block.ff, "gate") for block in model.trf_blocks), \
         "Expected MoEFeedForward in at least one transformer block"
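The assertion above relies on duck typing: any transformer block whose
feed-forward module exposes a gate attribute is treated as MoE. A minimal
sketch of the structure the check expects (the internals here are assumptions;
only the gate attribute matters to the test):

    import torch.nn as nn

    class MoEFeedForward(nn.Module):
        def __init__(self, emb_dim, num_experts):
            super().__init__()
            # Router that scores experts per token; detected via hasattr(ff, "gate")
            self.gate = nn.Linear(emb_dim, num_experts)
            self.experts = nn.ModuleList(
                nn.Linear(emb_dim, emb_dim) for _ in range(num_experts)
            )

    ff = MoEFeedForward(emb_dim=16, num_experts=4)
    assert hasattr(ff, "gate")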