
Commit d4a3f91

Parametrize qkv bias

1 parent: 42d3bfe

File tree

3 files changed (+13, -5 lines)

examples/models/llama/attention.py

Lines changed: 10 additions & 4 deletions
@@ -175,10 +175,16 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
         self.max_batch_size = args.max_batch_size
         self.max_context_len = args.max_context_len
         self.dim = args.dim
-        # TODO: parametrize bias for attention and feedforward.
-        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=True)
-        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True)
-        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True)
+        self.attention_qkv_bias = args.attention_qkv_bias
+        self.wq = nn.Linear(
+            self.dim, self.n_heads * self.head_dim, bias=self.attention_qkv_bias
+        )
+        self.wk = nn.Linear(
+            self.dim, self.n_kv_heads * self.head_dim, bias=self.attention_qkv_bias
+        )
+        self.wv = nn.Linear(
+            self.dim, self.n_kv_heads * self.head_dim, bias=self.attention_qkv_bias
+        )
         self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)

         self.layer_id = layer_id
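For context, a minimal standalone sketch of what the new flag controls; the dimensions below are made up for illustration and the real values come from ModelArgs. With `bias=False`, torch leaves the layer's `bias` attribute as `None` and the corresponding key is absent from the state_dict, so the setting has to match the checkpoint being loaded.

```python
import torch.nn as nn

# Illustrative dimensions only; the real values come from ModelArgs.
dim, n_heads, head_dim = 1536, 12, 128

for attention_qkv_bias in (False, True):
    wq = nn.Linear(dim, n_heads * head_dim, bias=attention_qkv_bias)
    # With bias=False the bias attribute is None and no "wq.bias" entry
    # appears in the state_dict, so checkpoints must match the setting.
    print(attention_qkv_bias, wq.bias is None)
```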

examples/models/llama/model_args.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ class ModelArgs:
     num_experts: int = 8 # Number of experts
     num_activated_experts: int = 2 # Number of experts to activate
     attention_type: str = "mha" # Attention type, registered in attention.py
+    attention_qkv_bias: bool = False
     use_kv_cache: bool = False # Use key/value cache
     use_sdpa_with_kv_cache_op: bool = (
         False # Use custom sdpa op that updates kv cache in-place
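A short usage sketch, assuming the remaining ModelArgs fields all have defaults like the ones shown above: models with biased q/k/v projections opt in explicitly, while llama-style models keep the new default of False.

```python
from examples.models.llama.model_args import ModelArgs

# Qwen2.5-style models opt in; llama-style models keep the default of False.
args = ModelArgs(attention_qkv_bias=True)
assert args.attention_qkv_bias
```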

examples/models/qwen2_5/1_5b_config.json

Lines changed: 2 additions & 1 deletion
@@ -8,5 +8,6 @@
   "norm_eps": 1e-06,
   "rope_theta": 1000000.0,
   "use_scaled_rope": false,
-  "vocab_size": 151936
+  "vocab_size": 151936,
+  "attention_qkv_bias": true
 }
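One way such a JSON config could map onto ModelArgs is a plain json.load followed by keyword expansion; this is a sketch under that assumption, not necessarily how the example runner actually constructs the model.

```python
import json

from examples.models.llama.model_args import ModelArgs

# Hypothetical loading path for illustration only.
with open("examples/models/qwen2_5/1_5b_config.json") as f:
    params = json.load(f)

args = ModelArgs(**params)
assert args.attention_qkv_bias  # Qwen2.5 enables the q/k/v bias
```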
