mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2026-04-10 12:33:42 +00:00
Support different Qwen3 sizes in pkg (#714)
This commit is contained in:
committed by
GitHub
parent
ddbaf0d83e
commit
c4ec55edac
@@ -4,29 +4,17 @@
|
||||
# Code: https://github.com/rasbt/LLMs-from-scratch
|
||||
|
||||
from .utils import KVCache # noqa: F401
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from ..qwen3 import ( # noqa: F401
|
||||
QWEN_CONFIG_06_B, QWEN3_CONFIG_1_7B, QWEN3_CONFIG_4B,
|
||||
QWEN3_CONFIG_8B, QWEN3_CONFIG_14B, QWEN3_CONFIG_32B,
|
||||
Qwen3Tokenizer, load_weights_into_qwen,
|
||||
download_from_huggingface,
|
||||
download_from_huggingface_from_snapshots
|
||||
)
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
# 0.6B model
|
||||
QWEN_CONFIG_06_B = {
|
||||
"vocab_size": 151_936, # Vocabulary size
|
||||
"context_length": 40_960, # Context length that was used to train the model
|
||||
"emb_dim": 1024, # Embedding dimension
|
||||
"n_heads": 16, # Number of attention heads
|
||||
"n_layers": 28, # Number of layers
|
||||
"hidden_dim": 3072, # Size of the intermediate dimension in FeedForward
|
||||
"head_dim": 128, # Size of the heads in GQA
|
||||
"qk_norm": True, # Whether to normalize queries and values in GQA
|
||||
"n_kv_groups": 8, # Key-Value groups for grouped-query attention
|
||||
"rope_base": 1_000_000.0, # The base in RoPE's "theta"
|
||||
"dtype": torch.bfloat16, # Lower-precision dtype to reduce memory usage
|
||||
}
|
||||
|
||||
|
||||
class Qwen3Model(nn.Module):
|
||||
def __init__(self, cfg):
|
||||
@@ -285,150 +273,3 @@ class RMSNorm(nn.Module):
|
||||
norm_x = norm_x + self.shift
|
||||
|
||||
return norm_x.to(input_dtype)
|
||||
|
||||
|
||||
def load_weights_into_qwen(model, param_config, params):
|
||||
def assign(left, right, tensor_name="unknown"):
|
||||
if left.shape != right.shape:
|
||||
raise ValueError(f"Shape mismatch in tensor '{tensor_name}'. Left: {left.shape}, Right: {right.shape}")
|
||||
return torch.nn.Parameter(right.clone().detach() if isinstance(right, torch.Tensor) else torch.tensor(right))
|
||||
|
||||
model.tok_emb.weight = assign(model.tok_emb.weight, params["model.embed_tokens.weight"], "model.embed_tokens.weight")
|
||||
|
||||
for l in range(param_config["n_layers"]):
|
||||
block = model.trf_blocks[l]
|
||||
att = block.att
|
||||
|
||||
# Q, K, V projections
|
||||
att.W_query.weight = assign(
|
||||
att.W_query.weight,
|
||||
params[f"model.layers.{l}.self_attn.q_proj.weight"],
|
||||
f"model.layers.{l}.self_attn.q_proj.weight"
|
||||
)
|
||||
att.W_key.weight = assign(
|
||||
att.W_key.weight,
|
||||
params[f"model.layers.{l}.self_attn.k_proj.weight"],
|
||||
f"model.layers.{l}.self_attn.k_proj.weight"
|
||||
)
|
||||
att.W_value.weight = assign(
|
||||
att.W_value.weight,
|
||||
params[f"model.layers.{l}.self_attn.v_proj.weight"],
|
||||
f"model.layers.{l}.self_attn.v_proj.weight"
|
||||
)
|
||||
|
||||
# Output projection
|
||||
att.out_proj.weight = assign(
|
||||
att.out_proj.weight,
|
||||
params[f"model.layers.{l}.self_attn.o_proj.weight"],
|
||||
f"model.layers.{l}.self_attn.o_proj.weight"
|
||||
)
|
||||
|
||||
# QK norms
|
||||
if hasattr(att, "q_norm") and att.q_norm is not None:
|
||||
att.q_norm.scale = assign(
|
||||
att.q_norm.scale,
|
||||
params[f"model.layers.{l}.self_attn.q_norm.weight"],
|
||||
f"model.layers.{l}.self_attn.q_norm.weight"
|
||||
)
|
||||
if hasattr(att, "k_norm") and att.k_norm is not None:
|
||||
att.k_norm.scale = assign(
|
||||
att.k_norm.scale,
|
||||
params[f"model.layers.{l}.self_attn.k_norm.weight"],
|
||||
f"model.layers.{l}.self_attn.k_norm.weight"
|
||||
)
|
||||
|
||||
# Attention layernorm
|
||||
block.norm1.scale = assign(
|
||||
block.norm1.scale,
|
||||
params[f"model.layers.{l}.input_layernorm.weight"],
|
||||
f"model.layers.{l}.input_layernorm.weight"
|
||||
)
|
||||
|
||||
# Feedforward weights
|
||||
block.ff.fc1.weight = assign(
|
||||
block.ff.fc1.weight,
|
||||
params[f"model.layers.{l}.mlp.gate_proj.weight"],
|
||||
f"model.layers.{l}.mlp.gate_proj.weight"
|
||||
)
|
||||
block.ff.fc2.weight = assign(
|
||||
block.ff.fc2.weight,
|
||||
params[f"model.layers.{l}.mlp.up_proj.weight"],
|
||||
f"model.layers.{l}.mlp.up_proj.weight"
|
||||
)
|
||||
block.ff.fc3.weight = assign(
|
||||
block.ff.fc3.weight,
|
||||
params[f"model.layers.{l}.mlp.down_proj.weight"],
|
||||
f"model.layers.{l}.mlp.down_proj.weight"
|
||||
)
|
||||
block.norm2.scale = assign(
|
||||
block.norm2.scale,
|
||||
params[f"model.layers.{l}.post_attention_layernorm.weight"],
|
||||
f"model.layers.{l}.post_attention_layernorm.weight"
|
||||
)
|
||||
|
||||
# Final normalization and output head
|
||||
model.final_norm.scale = assign(model.final_norm.scale, params["model.norm.weight"], "model.norm.weight")
|
||||
|
||||
# Model uses weight tying, hence we reuse the embedding layer weights here
|
||||
model.out_head.weight = assign(model.out_head.weight, params["model.embed_tokens.weight"], "model.embed_tokens.weight")
|
||||
|
||||
|
||||
class Qwen3Tokenizer():
|
||||
def __init__(self, tokenizer_file_path="tokenizer.json",
|
||||
repo_id=None, add_generation_prompt=False, add_thinking=False):
|
||||
from tokenizers import Tokenizer
|
||||
self.tokenizer_file_path = tokenizer_file_path
|
||||
|
||||
if add_generation_prompt != add_thinking:
|
||||
raise ValueError(
|
||||
"Only add_generation_prompt==add_thinking settings are currently supported"
|
||||
)
|
||||
|
||||
self.add_generation_prompt = add_generation_prompt
|
||||
self.add_thinking = add_thinking
|
||||
|
||||
tokenizer_file_path_obj = Path(tokenizer_file_path)
|
||||
if not tokenizer_file_path_obj.is_file() and repo_id is not None:
|
||||
_ = download_from_huggingface(
|
||||
repo_id=repo_id,
|
||||
filename=str(tokenizer_file_path_obj.name),
|
||||
local_dir=str(tokenizer_file_path_obj.parent.name)
|
||||
)
|
||||
self.tokenizer = Tokenizer.from_file(tokenizer_file_path)
|
||||
|
||||
def encode(self, prompt):
|
||||
messages = [
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
formatted_prompt = self.format_qwen_chat(
|
||||
messages,
|
||||
add_generation_prompt=self.add_generation_prompt,
|
||||
add_thinking=self.add_thinking
|
||||
)
|
||||
return self.tokenizer.encode(formatted_prompt).ids
|
||||
|
||||
def decode(self, token_ids):
|
||||
return self.tokenizer.decode(token_ids, skip_special_tokens=False)
|
||||
|
||||
@staticmethod
|
||||
def format_qwen_chat(messages, add_generation_prompt=False, add_thinking=False):
|
||||
prompt = ""
|
||||
for msg in messages:
|
||||
prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
|
||||
if add_generation_prompt:
|
||||
prompt += "<|im_start|>assistant"
|
||||
if not add_thinking:
|
||||
prompt += "<|think>\n\n<|/think>\n\n"
|
||||
else:
|
||||
prompt += "\n"
|
||||
return prompt
|
||||
|
||||
|
||||
def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
|
||||
base_url = "https://huggingface.co"
|
||||
url = f"{base_url}/{repo_id}/resolve/{revision}/{filename}"
|
||||
Path(local_dir).mkdir(parents=True, exist_ok=True)
|
||||
dest_path = os.path.join(local_dir, filename)
|
||||
print(f"Downloading {url} to {dest_path}...")
|
||||
urllib.request.urlretrieve(url, dest_path)
|
||||
return dest_path
|
||||
|
||||
@@ -4,13 +4,15 @@
|
||||
# Code: https://github.com/rasbt/LLMs-from-scratch
|
||||
|
||||
import os
|
||||
import json
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
# 0.6B model
|
||||
|
||||
# 0.6 billion parameters
|
||||
QWEN_CONFIG_06_B = {
|
||||
"vocab_size": 151_936, # Vocabulary size
|
||||
"context_length": 40_960, # Context length that was used to train the model
|
||||
@@ -25,6 +27,80 @@ QWEN_CONFIG_06_B = {
|
||||
"dtype": torch.bfloat16, # Lower-precision dtype to reduce memory usage
|
||||
}
|
||||
|
||||
# 1.7 billion parameters
|
||||
QWEN3_CONFIG_1_7B = {
|
||||
"vocab_size": 151_936,
|
||||
"context_length": 40_960,
|
||||
"emb_dim": 2048, # 2x larger than above
|
||||
"n_heads": 16,
|
||||
"n_layers": 28,
|
||||
"hidden_dim": 6144, # 2x larger than above
|
||||
"head_dim": 128,
|
||||
"qk_norm": True,
|
||||
"n_kv_groups": 8,
|
||||
"rope_base": 1_000_000.0,
|
||||
"dtype": torch.bfloat16,
|
||||
}
|
||||
|
||||
# 4 billion parameters
|
||||
QWEN3_CONFIG_4B = {
|
||||
"vocab_size": 151_936,
|
||||
"context_length": 40_960,
|
||||
"emb_dim": 2560, # 25% larger than above
|
||||
"n_heads": 32, # 2x larger than above
|
||||
"n_layers": 36, # 29% larger than above
|
||||
"hidden_dim": 9728, # ~3x larger than above
|
||||
"head_dim": 128,
|
||||
"qk_norm": True,
|
||||
"n_kv_groups": 8,
|
||||
"rope_base": 1_000_000.0,
|
||||
"dtype": torch.bfloat16,
|
||||
}
|
||||
|
||||
# 8 billion parameters
|
||||
QWEN3_CONFIG_8B = {
|
||||
"vocab_size": 151_936,
|
||||
"context_length": 40_960,
|
||||
"emb_dim": 4096, # 60% larger than above
|
||||
"n_heads": 32,
|
||||
"n_layers": 36, # 26% larger than above
|
||||
"hidden_dim": 12288,
|
||||
"head_dim": 128,
|
||||
"qk_norm": True,
|
||||
"n_kv_groups": 8,
|
||||
"rope_base": 1_000_000.0,
|
||||
"dtype": torch.bfloat16,
|
||||
}
|
||||
|
||||
# 14 billion parameters
|
||||
QWEN3_CONFIG_14B = {
|
||||
"vocab_size": 151_936,
|
||||
"context_length": 40_960,
|
||||
"emb_dim": 5120, # 25% larger than above
|
||||
"n_heads": 40, # 25% larger than above
|
||||
"n_layers": 40, # 11% larger than above
|
||||
"hidden_dim": 17408, # 42% larger than above
|
||||
"head_dim": 128,
|
||||
"qk_norm": True,
|
||||
"n_kv_groups": 8,
|
||||
"rope_base": 1_000_000.0,
|
||||
"dtype": torch.bfloat16,
|
||||
}
|
||||
|
||||
QWEN3_CONFIG_32B = {
|
||||
"vocab_size": 151_936,
|
||||
"context_length": 40_960,
|
||||
"emb_dim": 5120,
|
||||
"n_heads": 64, # 60% larger than above
|
||||
"n_layers": 64, # 60% larger than above
|
||||
"hidden_dim": 25600, # 47% larger than above
|
||||
"head_dim": 128,
|
||||
"qk_norm": True,
|
||||
"n_kv_groups": 8,
|
||||
"rope_base": 1_000_000.0,
|
||||
"dtype": torch.bfloat16,
|
||||
}
|
||||
|
||||
|
||||
class Qwen3Model(nn.Module):
|
||||
def __init__(self, cfg):
|
||||
@@ -388,6 +464,44 @@ def download_from_huggingface(repo_id, filename, local_dir, revision="main"):
|
||||
url = f"{base_url}/{repo_id}/resolve/{revision}/{filename}"
|
||||
Path(local_dir).mkdir(parents=True, exist_ok=True)
|
||||
dest_path = os.path.join(local_dir, filename)
|
||||
print(f"Downloading {url} to {dest_path}...")
|
||||
urllib.request.urlretrieve(url, dest_path)
|
||||
|
||||
if os.path.exists(dest_path):
|
||||
print(f"File already exists: {dest_path}")
|
||||
else:
|
||||
print(f"Downloading {url} to {dest_path}...")
|
||||
urllib.request.urlretrieve(url, dest_path)
|
||||
|
||||
return dest_path
|
||||
|
||||
|
||||
def download_from_huggingface_from_snapshots(repo_id, local_dir):
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
from safetensors.torch import load_file # or your preferred loader
|
||||
|
||||
repo_dir = snapshot_download(repo_id=repo_id, local_dir=local_dir)
|
||||
|
||||
index_path = os.path.join(repo_dir, "model.safetensors.index.json")
|
||||
single_file_path = os.path.join(repo_dir, "model.safetensors")
|
||||
|
||||
if os.path.exists(index_path):
|
||||
# Multi-shard model
|
||||
with open(index_path, "r") as f:
|
||||
index = json.load(f)
|
||||
|
||||
weights_dict = {}
|
||||
for filename in set(index["weight_map"].values()):
|
||||
shard_path = os.path.join(repo_dir, filename)
|
||||
shard = load_file(shard_path)
|
||||
weights_dict.update(shard)
|
||||
elif os.path.exists(single_file_path):
|
||||
# Single-shard model
|
||||
weights_file = hf_hub_download(
|
||||
repo_id=repo_id,
|
||||
filename="model.safetensors",
|
||||
local_dir=local_dir,
|
||||
)
|
||||
weights_dict = load_file(weights_file)
|
||||
else:
|
||||
raise FileNotFoundError("No model.safetensors or model.safetensors.index.json found.")
|
||||
|
||||
return weights_dict
|
||||
|
||||
Reference in New Issue
Block a user