Merge branch 'main' into qwen-tokenizer-fix

2026-04-10 12:33:42 +00:00 · 2025-09-16 09:38:45 +02:00
parent 3a5ee8cfa1 b6cd0a312f
commit 701b5ad54d
12 changed files with 14 additions and 14 deletions
--- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
+++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
@@ -435,7 +435,7 @@
    "    positions = torch.arange(context_length)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -310,7 +310,7 @@
    "    positions = torch.arange(context_length)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -180,7 +180,7 @@
    "    positions = torch.arange(context_length, dtype=dtype)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb
@@ -275,7 +275,7 @@
    "    positions = torch.arange(context_length, dtype=dtype)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/11_qwen3/standalone-qwen3-moe.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3-moe.ipynb
@@ -275,7 +275,7 @@
    "    positions = torch.arange(context_length, dtype=dtype)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3-plus-kvcache.ipynb
@@ -206,7 +206,7 @@
    "    positions = torch.arange(context_length, dtype=dtype)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/11_qwen3/standalone-qwen3.ipynb
+++ b/ch05/11_qwen3/standalone-qwen3.ipynb
@@ -204,7 +204,7 @@
    "    positions = torch.arange(context_length, dtype=dtype)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/12_gemma3/standalone-gemma3-plus-kvcache.ipynb
+++ b/ch05/12_gemma3/standalone-gemma3-plus-kvcache.ipynb
@@ -200,7 +200,7 @@
    "    positions = torch.arange(context_length, dtype=dtype)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch05/12_gemma3/standalone-gemma3.ipynb
+++ b/ch05/12_gemma3/standalone-gemma3.ipynb
@@ -200,7 +200,7 @@
    "    positions = torch.arange(context_length, dtype=dtype)\n",
    "\n",
    "    # Compute the angles\n",
-    "    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)\n",
+    "    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)\n",
    "\n",
    "    # Expand angles to match the head_dim\n",
    "    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)\n",
--- a/ch06/03_bonus_imdb-classification/train_sklearn_logreg.py
+++ b/ch06/03_bonus_imdb-classification/train_sklearn_logreg.py
@@ -19,7 +19,7 @@ def load_dataframes():
    return df_train, df_val, df_test


-def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):
+def eval_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    # Making predictions
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
@@ -67,9 +67,9 @@ if __name__ == "__main__":
    dummy_clf.fit(X_train, y_train)

    print("Dummy classifier:")
-    eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)
+    eval_model(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)

    print("\n\nLogistic regression classifier:")
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
-    eval(model, X_train, y_train, X_val, y_val, X_test, y_test)
+    eval_model(model, X_train, y_train, X_val, y_val, X_test, y_test)
--- a/pkg/llms_from_scratch/llama3.py
+++ b/pkg/llms_from_scratch/llama3.py
@@ -238,7 +238,7 @@ def compute_rope_params(head_dim, theta_base=10_000, context_length=4096, freq_c
    positions = torch.arange(context_length, dtype=dtype)

    # Compute the angles
-    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)
+    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)  # Shape: (context_length, head_dim // 2)

    # Expand angles to match the head_dim
    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)
--- a/pkg/llms_from_scratch/qwen3.py
+++ b/pkg/llms_from_scratch/qwen3.py
@@ -326,7 +326,7 @@ def compute_rope_params(head_dim, theta_base=10_000, context_length=4096, dtype=
    positions = torch.arange(context_length, dtype=dtype)

    # Compute the angles
-    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)
+    angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0) # Shape: (context_length, head_dim // 2)

    # Expand angles to match the head_dim
    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)