@@ -142,14 +142,27 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
142
142
raise ValueError (f"Mismatch between weight map and model parts for tensor names: { sym_diff } " )
143
143
144
144
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
    """Build the GGUF tensor name for *key*, filling in the block id when the template needs one.

    Raises ValueError if *key* is not a tensor of this model architecture.
    """
    if key not in gguf.MODEL_TENSORS[self.model_arch]:
        raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
    template: str = gguf.TENSOR_NAMES[key]
    if "{bid}" not in template:
        return template + suffix
    # per-block tensors must be given a concrete block index
    assert bid is not None
    return template.format(bid=bid) + suffix
152
152
153
def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
    """Return True when *name* is exactly the GGUF name of tensor *key* (with *bid* and *suffix*).

    A key not belonging to this architecture never matches; a templated key
    requires a block id and a non-templated key forbids one.
    """
    if key not in gguf.MODEL_TENSORS[self.model_arch]:
        return False
    key_name: str = gguf.TENSOR_NAMES[key]
    templated = "{bid}" in key_name
    # bid presence must agree with whether the name template expects one
    if templated != (bid is not None):
        return False
    if templated:
        key_name = key_name.format(bid=bid)
    return name == key_name + suffix
165
+
153
166
def map_tensor_name (self , name : str , try_suffixes : Sequence [str ] = (".weight" , ".bias" )) -> str :
154
167
new_name = self .tensor_map .get_name (key = name , try_suffixes = try_suffixes )
155
168
if new_name is None :
@@ -218,12 +231,12 @@ def write_tensors(self):
218
231
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
219
232
def np_fp32_to_bf16 (n : np .ndarray ):
220
233
# force nan to quiet
221
- n = np .where ((n & 0x7fffffff ) > 0x7f800000 , n | (64 << 16 ), n )
234
+ n = np .where ((n & 0x7fffffff ) > 0x7f800000 , ( n & 0xffff0000 ) | (64 << 16 ), n )
222
235
# flush subnormals to zero
223
236
n = np .where ((n & 0x7f800000 ) == 0 , n & 0x80000000 , n )
224
237
# round to nearest even
225
238
n = (n + (0x7fff + ((n >> 16 ) & 1 ))) >> 16
226
- return n
239
+ return n . astype ( np . int16 )
227
240
228
241
# Doing this row-wise is much, much faster than element-wise, hence the signature
229
242
v_fp32_to_bf16 = np .vectorize (np_fp32_to_bf16 , otypes = [np .int16 ], signature = "(n)->(n)" )
@@ -263,10 +276,25 @@ def np_fp32_to_bf16(n: np.ndarray):
263
276
extra_f16 = self .extra_f16_tensors (name , new_name , bid , n_dims )
264
277
265
278
# Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
266
- extra_f32 = extra_f32 or n_dims == 1 or new_name .endswith ("_norm.weight" )
279
+ # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
280
+ extra_f32 = any (cond for cond in (
281
+ extra_f32 ,
282
+ n_dims == 1 ,
283
+ new_name .endswith ("_norm.weight" ),
284
+ ))
285
+
286
+ # Some tensor types are always in float32
287
+ extra_f32 = extra_f32 or any (self .match_model_tensor_name (new_name , key , bid ) for key in (
288
+ gguf .MODEL_TENSOR .FFN_GATE_INP ,
289
+ gguf .MODEL_TENSOR .POS_EMBD ,
290
+ gguf .MODEL_TENSOR .TOKEN_TYPES ,
291
+ ))
267
292
268
293
# if f16 desired, convert any float32 2-dim weight tensors to float16
269
- extra_f16 = extra_f16 or (name .endswith (".weight" ) and n_dims >= 2 )
294
+ extra_f16 = any (cond for cond in (
295
+ extra_f16 ,
296
+ (name .endswith (".weight" ) and n_dims >= 2 ),
297
+ ))
270
298
271
299
if self .ftype != gguf .LlamaFileType .ALL_F32 and extra_f16 and not extra_f32 :
272
300
if self .ftype == gguf .LlamaFileType .MOSTLY_F16 :
@@ -2050,12 +2078,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
2050
2078
2051
2079
return [(self .map_tensor_name (name ), data_torch )]
2052
2080
2053
- def extra_f32_tensors (self , name : str , new_name : str , bid : int | None , n_dims : int ) -> bool :
2054
- del new_name , bid , n_dims # unused
2055
-
2056
- # not used with get_rows, must be F32
2057
- return name == "embeddings.token_type_embeddings.weight"
2058
-
2059
2081
2060
2082
@Model .register ("NomicBertModel" )
2061
2083
class NomicBertModel (BertModel ):
@@ -2453,6 +2475,8 @@ def main() -> None:
2453
2475
logger .info ("Set model tokenizer" )
2454
2476
model_instance .set_vocab ()
2455
2477
2478
+ model_instance .gguf_writer .add_quantization_version (gguf .GGML_QUANT_VERSION );
2479
+
2456
2480
if args .vocab_only :
2457
2481
logger .info (f"Exporting model vocab to '{ fname_out } '" )
2458
2482
model_instance .write_vocab ()
0 commit comments