Skip to content

Commit 3fec68b

Browse files
convert : add support for CodeQwen, which needs special tokenizer handling (#6707)
* add support for CodeQwen (its tokenizer uses SentencePiece)
* override load_hparams
* fix typo
* fix load_hparams
* convert : fix whitespace
---------
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent c8297c6 commit 3fec68b

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

convert-hf-to-gguf.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,16 @@ def _set_vocab_sentencepiece(self):
363363
scores.append(-1000.0)
364364
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
365365

366+
if vocab_size > len(tokens):
367+
pad_count = vocab_size - len(tokens)
368+
print(
369+
f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
370+
)
371+
for i in range(1, pad_count + 1):
372+
tokens.append(f"[PAD{i}]")
373+
scores.append(-1000.0)
374+
toktypes.append(SentencePieceTokenTypes.UNUSED)
375+
366376
assert len(tokens) == vocab_size
367377

368378
self.gguf_writer.add_tokenizer_model("llama")
@@ -1789,6 +1799,12 @@ def write_tensors(self):
17891799
class Qwen2Model(Model):
    """Converter for Qwen2-family checkpoints (QWEN2 GGUF architecture)."""

    model_arch = gguf.MODEL_ARCH.QWEN2

    def set_vocab(self):
        """Build the vocabulary for the GGUF output.

        CodeQwen ships a SentencePiece tokenizer model, while other Qwen2
        checkpoints use a GPT-2/BPE tokenizer. Prefer the SentencePiece
        path and fall back to GPT-2 when no tokenizer.model file exists.
        """
        try:
            self._set_vocab_sentencepiece()
        except FileNotFoundError:
            # No SentencePiece model on disk -> standard Qwen2 BPE vocab.
            self._set_vocab_gpt2()

17931809
@Model.register("Qwen2MoeForCausalLM")
17941810
class Qwen2MoeModel(Model):

0 commit comments

Comments (0)