
Commit 95930da

convert-hf : get bit-exact same output as ./quantize
The quantization version was missing.

* convert-hf : don't round bf16 NANs
* convert-hf : save some memory with np.int16 intermediate bf16 weights
* convert-hf : more closely match llama.cpp with which weights to keep in f32
1 parent 3801db1 commit 95930da

File tree: 3 files changed, +37 -12 lines changed

convert-hf-to-gguf.py

Lines changed: 35 additions & 11 deletions
@@ -142,14 +142,27 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
             raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
-        name: str = gguf.TENSOR_NAMES[key]
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
             raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+        name: str = gguf.TENSOR_NAMES[key]
         if "{bid}" in name:
             assert bid is not None
             name = name.format(bid=bid)
         return name + suffix
 
+    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            return False
+        key_name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in key_name:
+            if bid is None:
+                return False
+            key_name = key_name.format(bid=bid)
+        else:
+            if bid is not None:
+                return False
+        return name == (key_name + suffix)
+
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
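For context, a rough sketch of how the new helper behaves, assuming an architecture whose MODEL_TENSORS include these keys and the usual gguf.TENSOR_NAMES templates ("blk.{bid}.ffn_gate_inp" for FFN_GATE_INP, "token_types" for TOKEN_TYPES); model stands for any Model subclass instance:

model.match_model_tensor_name("blk.3.ffn_gate_inp.weight", gguf.MODEL_TENSOR.FFN_GATE_INP, bid=3)     # True
model.match_model_tensor_name("blk.3.ffn_gate_inp.weight", gguf.MODEL_TENSOR.FFN_GATE_INP, bid=None)  # False: template requires a bid
model.match_model_tensor_name("token_types.weight", gguf.MODEL_TENSOR.TOKEN_TYPES, bid=3)             # False: bid given for a non-block tensor

Unlike format_tensor_name, a mismatch returns False instead of raising, so write_tensors can probe several tensor types cheaply.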
@@ -218,12 +231,12 @@ def write_tensors(self):
         # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
         def np_fp32_to_bf16(n: np.ndarray):
             # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, n | (64 << 16), n)
+            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
             # flush subnormals to zero
             n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
             # round to nearest even
             n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n
+            return n.astype(np.int16)
 
         # Doing this row-wise is much, much faster than element-wise, hence the signature
         v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
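For readers who want to check the rounding behaviour outside the converter, here is a self-contained NumPy sketch of the same bit manipulation (the function name and test values are illustrative, not part of the commit):

import numpy as np

def fp32_to_bf16_bits(x: np.ndarray) -> np.ndarray:
    # reinterpret the float32 payload as raw bits so the masks apply
    n = x.astype(np.float32).view(np.uint32)
    # quiet any NaN by setting the top mantissa bit, keeping sign/exponent
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to signed zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even, then drop the low 16 mantissa bits
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)

x = np.array([1.0, 3.1415927, float("nan")], dtype=np.float32)
print([hex(v) for v in fp32_to_bf16_bits(x)])  # expected: ['0x3f80', '0x4049', '0x7fc0']

The 16-bit return type in the converter itself halves the memory of the intermediate array compared to keeping 32-bit integers, which is the point of the second bullet in the commit message.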
@@ -263,10 +276,25 @@ def np_fp32_to_bf16(n: np.ndarray):
                 extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+                extra_f32 = any(cond for cond in (
+                    extra_f32,
+                    n_dims == 1,
+                    new_name.endswith("_norm.weight"),
+                ))
+
+                # Some tensor types are always in float32
+                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
+                    gguf.MODEL_TENSOR.FFN_GATE_INP,
+                    gguf.MODEL_TENSOR.POS_EMBD,
+                    gguf.MODEL_TENSOR.TOKEN_TYPES,
+                ))
 
                 # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)
+                extra_f16 = any(cond for cond in (
+                    extra_f16,
+                    (name.endswith(".weight") and n_dims >= 2),
+                ))
 
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                     if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
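Condensed, the precedence these two flags encode is that a forced-f32 condition always wins over an f16 candidate, matching the commit's stated goal of following llama_model_quantize_internal. Roughly (the helper name keeps_f32 is hypothetical, for illustration only):

def keeps_f32(ftype: gguf.LlamaFileType, extra_f32: bool, extra_f16: bool) -> bool:
    # a tensor is converted down only when a non-f32 file type is requested,
    # the tensor qualifies for it, and nothing pins it to f32
    return not (ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32)

keeps_f32(gguf.LlamaFileType.MOSTLY_F16, extra_f32=True, extra_f16=True)  # True: norms, MoE gates, token types stay f32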
@@ -2050,12 +2078,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del new_name, bid, n_dims  # unused
-
-        # not used with get_rows, must be F32
-        return name == "embeddings.token_type_embeddings.weight"
-
 
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
@@ -2453,6 +2475,8 @@ def main() -> None:
     logger.info("Set model tokenizer")
     model_instance.set_vocab()
 
+    model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
     if args.vocab_only:
         logger.info(f"Exporting model vocab to '{fname_out}'")
         model_instance.write_vocab()
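With this call in place, converted files carry general.quantization_version just as ./quantize writes it. A quick verification sketch, assuming gguf-py is installed and model.gguf is a freshly converted file (both names are placeholders):

from gguf import GGUFReader

reader = GGUFReader("model.gguf")
field = reader.get_field("general.quantization_version")
print(field.parts[field.data[0]])  # expected: [2], i.e. GGML_QUANT_VERSION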

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
 GGUF_MAGIC             = 0x46554747  # "GGUF"
 GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys

gguf-py/gguf/gguf_writer.py

Lines changed: 1 addition & 1 deletion
@@ -350,7 +350,7 @@ def add_file_type(self, ftype: int) -> None:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)
 
-    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
+    def add_quantization_version(self, quantization_version: int) -> None:
         self.add_uint32(
             Keys.General.QUANTIZATION_VERSION, quantization_version)
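The annotation fix matches the call site in convert-hf-to-gguf.py above, which passes the plain int GGML_QUANT_VERSION rather than a GGMLQuantizationType member. A minimal writer-side sketch (output path and arch are placeholders):

import gguf

writer = gguf.GGUFWriter("out.gguf", arch="llama")
writer.add_quantization_version(gguf.GGML_QUANT_VERSION)  # queues general.quantization_version = 2 in the KV metadata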
