Support different Qwen3 sizes in pkg (#714)

Sebastian Raschka
2025-06-28 08:00:23 -05:00
committed by GitHub
parent ddbaf0d83e
commit c4ec55edac
4 changed files with 194 additions and 175 deletions
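
In short: where the package previously shipped only the 0.6B configuration, it now exports config dictionaries for the full range of dense Qwen3 sizes, plus a snapshot-based weight downloader. The new import surface looks as follows (a sketch, assuming the package is importable as llms_from_scratch, as in the repo's pkg directory):

    from llms_from_scratch.qwen3 import (
        QWEN_CONFIG_06_B, QWEN3_CONFIG_1_7B, QWEN3_CONFIG_4B,
        QWEN3_CONFIG_8B, QWEN3_CONFIG_14B, QWEN3_CONFIG_32B,
        download_from_huggingface_from_snapshots,
    )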

View File

@@ -4,29 +4,17 @@
 # Code: https://github.com/rasbt/LLMs-from-scratch
 from .utils import KVCache  # noqa: F401
-import os
-import urllib.request
-from pathlib import Path
+from ..qwen3 import (  # noqa: F401
+    QWEN_CONFIG_06_B, QWEN3_CONFIG_1_7B, QWEN3_CONFIG_4B,
+    QWEN3_CONFIG_8B, QWEN3_CONFIG_14B, QWEN3_CONFIG_32B,
+    Qwen3Tokenizer, load_weights_into_qwen,
+    download_from_huggingface,
+    download_from_huggingface_from_snapshots
+)
 import torch
 import torch.nn as nn
-# 0.6B model
-QWEN_CONFIG_06_B = {
-    "vocab_size": 151_936,           # Vocabulary size
-    "context_length": 40_960,        # Context length that was used to train the model
-    "emb_dim": 1024,                 # Embedding dimension
-    "n_heads": 16,                   # Number of attention heads
-    "n_layers": 28,                  # Number of layers
-    "hidden_dim": 3072,              # Size of the intermediate dimension in FeedForward
-    "head_dim": 128,                 # Size of the heads in GQA
-    "qk_norm": True,                 # Whether to normalize queries and keys in GQA
-    "n_kv_groups": 8,                # Key-Value groups for grouped-query attention
-    "rope_base": 1_000_000.0,        # The base in RoPE's "theta"
-    "dtype": torch.bfloat16,         # Lower-precision dtype to reduce memory usage
-}
 class Qwen3Model(nn.Module):
     def __init__(self, cfg):
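
The kv_cache module thus no longer carries its own copies of the configs, tokenizer, and download helpers; it re-exports them from the parent qwen3 module (the # noqa: F401 suppresses the unused-import lint warning). A minimal sketch of what the re-export implies, again assuming the llms_from_scratch package name:

    from llms_from_scratch.qwen3 import QWEN3_CONFIG_8B
    from llms_from_scratch.kv_cache.qwen3 import QWEN3_CONFIG_8B as KV_CFG

    # Both import paths resolve to the same dict object, so the plain and
    # KV-cache variants can no longer drift apart:
    assert KV_CFG is QWEN3_CONFIG_8B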
@@ -285,150 +273,3 @@ class RMSNorm(nn.Module):
         norm_x = norm_x + self.shift
         return norm_x.to(input_dtype)
-def load_weights_into_qwen(model, param_config, params):
-    def assign(left, right, tensor_name="unknown"):
-        if left.shape != right.shape:
-            raise ValueError(f"Shape mismatch in tensor '{tensor_name}'. Left: {left.shape}, Right: {right.shape}")
-        return torch.nn.Parameter(right.clone().detach() if isinstance(right, torch.Tensor) else torch.tensor(right))
-    model.tok_emb.weight = assign(model.tok_emb.weight, params["model.embed_tokens.weight"], "model.embed_tokens.weight")
-    for l in range(param_config["n_layers"]):
-        block = model.trf_blocks[l]
-        att = block.att
-        # Q, K, V projections
-        att.W_query.weight = assign(
-            att.W_query.weight,
-            params[f"model.layers.{l}.self_attn.q_proj.weight"],
-            f"model.layers.{l}.self_attn.q_proj.weight"
-        )
-        att.W_key.weight = assign(
-            att.W_key.weight,
-            params[f"model.layers.{l}.self_attn.k_proj.weight"],
-            f"model.layers.{l}.self_attn.k_proj.weight"
-        )
-        att.W_value.weight = assign(
-            att.W_value.weight,
-            params[f"model.layers.{l}.self_attn.v_proj.weight"],
-            f"model.layers.{l}.self_attn.v_proj.weight"
-        )
-        # Output projection
-        att.out_proj.weight = assign(
-            att.out_proj.weight,
-            params[f"model.layers.{l}.self_attn.o_proj.weight"],
-            f"model.layers.{l}.self_attn.o_proj.weight"
-        )
-        # QK norms
-        if hasattr(att, "q_norm") and att.q_norm is not None:
-            att.q_norm.scale = assign(
-                att.q_norm.scale,
-                params[f"model.layers.{l}.self_attn.q_norm.weight"],
-                f"model.layers.{l}.self_attn.q_norm.weight"
-            )
-        if hasattr(att, "k_norm") and att.k_norm is not None:
-            att.k_norm.scale = assign(
-                att.k_norm.scale,
-                params[f"model.layers.{l}.self_attn.k_norm.weight"],
-                f"model.layers.{l}.self_attn.k_norm.weight"
-            )
-        # Attention layernorm
-        block.norm1.scale = assign(
-            block.norm1.scale,
-            params[f"model.layers.{l}.input_layernorm.weight"],
-            f"model.layers.{l}.input_layernorm.weight"
-        )
-        # Feedforward weights
-        block.ff.fc1.weight = assign(
-            block.ff.fc1.weight,
-            params[f"model.layers.{l}.mlp.gate_proj.weight"],
-            f"model.layers.{l}.mlp.gate_proj.weight"
-        )
-        block.ff.fc2.weight = assign(
-            block.ff.fc2.weight,
-            params[f"model.layers.{l}.mlp.up_proj.weight"],
-            f"model.layers.{l}.mlp.up_proj.weight"
-        )
-        block.ff.fc3.weight = assign(
-            block.ff.fc3.weight,
-            params[f"model.layers.{l}.mlp.down_proj.weight"],
-            f"model.layers.{l}.mlp.down_proj.weight"
-        )
-        block.norm2.scale = assign(
-            block.norm2.scale,
-            params[f"model.layers.{l}.post_attention_layernorm.weight"],
-            f"model.layers.{l}.post_attention_layernorm.weight"
-        )
-    # Final normalization and output head
-    model.final_norm.scale = assign(model.final_norm.scale, params["model.norm.weight"], "model.norm.weight")
-    # Model uses weight tying, hence we reuse the embedding layer weights here
-    model.out_head.weight = assign(model.out_head.weight, params["model.embed_tokens.weight"], "model.embed_tokens.weight")
-class Qwen3Tokenizer():
-    def __init__(self, tokenizer_file_path="tokenizer.json",
-                 repo_id=None, add_generation_prompt=False, add_thinking=False):
-        from tokenizers import Tokenizer
-        self.tokenizer_file_path = tokenizer_file_path
-        if add_generation_prompt != add_thinking:
-            raise ValueError(
-                "Only add_generation_prompt==add_thinking settings are currently supported"
-            )
-        self.add_generation_prompt = add_generation_prompt
-        self.add_thinking = add_thinking
-        tokenizer_file_path_obj = Path(tokenizer_file_path)
-        if not tokenizer_file_path_obj.is_file() and repo_id is not None:
-            _ = download_from_huggingface(
-                repo_id=repo_id,
-                filename=str(tokenizer_file_path_obj.name),
-                local_dir=str(tokenizer_file_path_obj.parent.name)
-            )
-        self.tokenizer = Tokenizer.from_file(tokenizer_file_path)
-    def encode(self, prompt):
-        messages = [
-            {"role": "user", "content": prompt}
-        ]
-        formatted_prompt = self.format_qwen_chat(
-            messages,
-            add_generation_prompt=self.add_generation_prompt,
-            add_thinking=self.add_thinking
-        )
-        return self.tokenizer.encode(formatted_prompt).ids
-    def decode(self, token_ids):
-        return self.tokenizer.decode(token_ids, skip_special_tokens=False)
-    @staticmethod
-    def format_qwen_chat(messages, add_generation_prompt=False, add_thinking=False):
-        prompt = ""
-        for msg in messages:
-            prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
-        if add_generation_prompt:
-            prompt += "<|im_start|>assistant"
-            if not add_thinking:
-                prompt += "<|think>\n\n<|/think>\n\n"
-            else:
-                prompt += "\n"
-        return prompt
-def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
-    base_url = "https://huggingface.co"
-    url = f"{base_url}/{repo_id}/resolve/{revision}/{filename}"
-    Path(local_dir).mkdir(parents=True, exist_ok=True)
-    dest_path = os.path.join(local_dir, filename)
-    print(f"Downloading {url} to {dest_path}...")
-    urllib.request.urlretrieve(url, dest_path)
-    return dest_path
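
For reference, the Qwen3Tokenizer deleted above (it now lives solely in the parent qwen3 module) is used roughly as follows; a sketch, with Qwen/Qwen3-0.6B assumed as the upstream Hugging Face repo id:

    tokenizer = Qwen3Tokenizer(
        tokenizer_file_path="Qwen3-0.6B/tokenizer.json",
        repo_id="Qwen/Qwen3-0.6B",  # used to fetch tokenizer.json if it is missing locally
    )
    token_ids = tokenizer.encode("Give me a short introduction to large language models.")
    print(tokenizer.decode(token_ids))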

View File

@@ -4,13 +4,15 @@
 # Code: https://github.com/rasbt/LLMs-from-scratch
 import os
+import json
 import urllib.request
 from pathlib import Path
 import torch
 import torch.nn as nn
-# 0.6B model
+# 0.6 billion parameters
 QWEN_CONFIG_06_B = {
     "vocab_size": 151_936,           # Vocabulary size
     "context_length": 40_960,        # Context length that was used to train the model
@@ -25,6 +27,80 @@ QWEN_CONFIG_06_B = {
     "dtype": torch.bfloat16,         # Lower-precision dtype to reduce memory usage
 }
+# 1.7 billion parameters
+QWEN3_CONFIG_1_7B = {
+    "vocab_size": 151_936,
+    "context_length": 40_960,
+    "emb_dim": 2048,                 # 2x larger than above
+    "n_heads": 16,
+    "n_layers": 28,
+    "hidden_dim": 6144,              # 2x larger than above
+    "head_dim": 128,
+    "qk_norm": True,
+    "n_kv_groups": 8,
+    "rope_base": 1_000_000.0,
+    "dtype": torch.bfloat16,
+}
+# 4 billion parameters
+QWEN3_CONFIG_4B = {
+    "vocab_size": 151_936,
+    "context_length": 40_960,
+    "emb_dim": 2560,                 # 25% larger than above
+    "n_heads": 32,                   # 2x larger than above
+    "n_layers": 36,                  # 29% larger than above
+    "hidden_dim": 9728,              # 58% larger than above
+    "head_dim": 128,
+    "qk_norm": True,
+    "n_kv_groups": 8,
+    "rope_base": 1_000_000.0,
+    "dtype": torch.bfloat16,
+}
+# 8 billion parameters
+QWEN3_CONFIG_8B = {
+    "vocab_size": 151_936,
+    "context_length": 40_960,
+    "emb_dim": 4096,                 # 60% larger than above
+    "n_heads": 32,
+    "n_layers": 36,
+    "hidden_dim": 12288,             # 26% larger than above
+    "head_dim": 128,
+    "qk_norm": True,
+    "n_kv_groups": 8,
+    "rope_base": 1_000_000.0,
+    "dtype": torch.bfloat16,
+}
+# 14 billion parameters
+QWEN3_CONFIG_14B = {
+    "vocab_size": 151_936,
+    "context_length": 40_960,
+    "emb_dim": 5120,                 # 25% larger than above
+    "n_heads": 40,                   # 25% larger than above
+    "n_layers": 40,                  # 11% larger than above
+    "hidden_dim": 17408,             # 42% larger than above
+    "head_dim": 128,
+    "qk_norm": True,
+    "n_kv_groups": 8,
+    "rope_base": 1_000_000.0,
+    "dtype": torch.bfloat16,
+}
+# 32 billion parameters
+QWEN3_CONFIG_32B = {
+    "vocab_size": 151_936,
+    "context_length": 40_960,
+    "emb_dim": 5120,
+    "n_heads": 64,                   # 60% larger than above
+    "n_layers": 64,                  # 60% larger than above
+    "hidden_dim": 25600,             # 47% larger than above
+    "head_dim": 128,
+    "qk_norm": True,
+    "n_kv_groups": 8,
+    "rope_base": 1_000_000.0,
+    "dtype": torch.bfloat16,
+}
 class Qwen3Model(nn.Module):
     def __init__(self, cfg):
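
Any of the dictionaries above can be passed straight to Qwen3Model. A quick sanity-check sketch (assuming the exports shown in this file and the llms_from_scratch package name):

    from llms_from_scratch.qwen3 import Qwen3Model, QWEN3_CONFIG_1_7B

    model = Qwen3Model(QWEN3_CONFIG_1_7B)  # or QWEN3_CONFIG_4B, _8B, _14B, _32B
    n_params = sum(p.numel() for p in model.parameters())
    print(f"parameters: {n_params:,}")
    # rough weight memory at bfloat16 (2 bytes per parameter)
    print(f"~{n_params * 2 / 1e9:.1f} GB")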
@@ -388,6 +464,44 @@ def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
     url = f"{base_url}/{repo_id}/resolve/{revision}/{filename}"
     Path(local_dir).mkdir(parents=True, exist_ok=True)
     dest_path = os.path.join(local_dir, filename)
-    print(f"Downloading {url} to {dest_path}...")
-    urllib.request.urlretrieve(url, dest_path)
+    if os.path.exists(dest_path):
+        print(f"File already exists: {dest_path}")
+    else:
+        print(f"Downloading {url} to {dest_path}...")
+        urllib.request.urlretrieve(url, dest_path)
     return dest_path
+def download_from_huggingface_from_snapshots(repo_id, local_dir):
+    from huggingface_hub import hf_hub_download, snapshot_download
+    from safetensors.torch import load_file  # or your preferred loader
+    repo_dir = snapshot_download(repo_id=repo_id, local_dir=local_dir)
+    index_path = os.path.join(repo_dir, "model.safetensors.index.json")
+    single_file_path = os.path.join(repo_dir, "model.safetensors")
+    if os.path.exists(index_path):
+        # Multi-shard model
+        with open(index_path, "r") as f:
+            index = json.load(f)
+        weights_dict = {}
+        for filename in set(index["weight_map"].values()):
+            shard_path = os.path.join(repo_dir, filename)
+            shard = load_file(shard_path)
+            weights_dict.update(shard)
+    elif os.path.exists(single_file_path):
+        # Single-shard model
+        weights_file = hf_hub_download(
+            repo_id=repo_id,
+            filename="model.safetensors",
+            local_dir=local_dir,
+        )
+        weights_dict = load_file(weights_file)
+    else:
+        raise FileNotFoundError("No model.safetensors or model.safetensors.index.json found.")
+    return weights_dict
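
Putting the new pieces together, loading pretrained weights for one of the larger variants might look like the following sketch (Qwen/Qwen3-14B is assumed to be the matching upstream Hugging Face repo id, and llms_from_scratch the installed package name):

    from llms_from_scratch.qwen3 import (
        Qwen3Model, QWEN3_CONFIG_14B,
        load_weights_into_qwen, download_from_huggingface_from_snapshots,
    )

    model = Qwen3Model(QWEN3_CONFIG_14B)
    weights = download_from_huggingface_from_snapshots(
        repo_id="Qwen/Qwen3-14B", local_dir="Qwen3-14B"
    )
    load_weights_into_qwen(model, QWEN3_CONFIG_14B, weights)
    model.eval()

Checkpoints published as multiple safetensors shards ship with a model.safetensors.index.json and take the first branch above; single-file checkpoints fall through to the hf_hub_download path.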