
Commit 69ff613

llama : support models without vocabulary (#5798)
* additional methods to read model and ctx parameters
* vocab size as part of the model metadata
* models without vocabulary, convert.py part
* models without vocabulary, llama.cpp part
* PR clean up
* converter script fixes
* llama_vocab_type update (renamed the new key)
* pr review fixes
* revert function renaming
* one more NoVocab assert
1 parent 044ec4b commit 69ff613

File tree

5 files changed: +142 -88 lines changed
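The practical effect of this commit is a new `--no-vocab` conversion mode. A minimal usage sketch, assuming convert.py's existing positional model argument (the path below is a placeholder):

```python
# Sketch only: produce a GGUF without an embedded tokenizer.
# Equivalent to: python convert.py path/to/model --no-vocab
import convert

convert.main(["path/to/model", "--no-vocab"])
```

Combining `--no-vocab` with `--vocab-only` is rejected in `main()`, as shown in the convert.py diff below.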

convert.py

Lines changed: 73 additions & 53 deletions
```diff
@@ -332,6 +332,9 @@ def load(model_plus: ModelPlus) -> Params:
 #

 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ def __repr__(self) -> str:


 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ def __repr__(self) -> str:


 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
```
```diff
@@ -553,7 +562,15 @@ def __repr__(self) -> str:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"


 #
```
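`NoVocab` is a stateless sentinel: it carries the two class attributes the writer needs and deliberately defines no `vocab_size`, so callers can branch either with `isinstance` or with `hasattr` duck-typing. A self-contained sketch of that pattern (the `describe` helper and `FakeBpeVocab` class are hypothetical, not part of convert.py):

```python
# Sketch of the sentinel pattern introduced by the diff above.
class NoVocab:
    tokenizer_model = "no_vocab"
    name = "no_vocab"

class FakeBpeVocab:
    tokenizer_model = "gpt2"
    name = "bpe"
    vocab_size = 32000  # a real vocab exposes its size

def describe(vocab) -> str:
    # hasattr() separates real vocabs from the sentinel, which is
    # exactly how check_vocab_size() below avoids touching vocab_size.
    if hasattr(vocab, "vocab_size"):
        return f"{vocab.name}: {vocab.vocab_size} tokens"
    return f"{vocab.name}: model carries no vocabulary"

print(describe(FakeBpeVocab()))  # bpe: 32000 tokens
print(describe(NoVocab()))       # no_vocab: model carries no vocabulary
```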
```diff
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
         )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab

     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
```
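Note the two guards working together here: the nested f-string only suggests a fallback size when the loaded vocab actually has a `vocab_size` attribute (NoVocab does not), and the early `return` skips the mismatch and padding checks entirely for vocab-free models.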
```diff
@@ -977,6 +996,7 @@ def add_meta_arch(self, params: Params) -> None:
         name = str(params.path_model.parent).split('/')[-1]

         self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
@@ -1013,21 +1033,9 @@ def add_meta_arch(self, params: Params) -> None:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)

-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
         tokens = []
         scores = []
         toktypes = []
@@ -1043,11 +1051,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
         return tokens, scores, toktypes

     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)

         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1074,6 +1079,26 @@ def write_meta(self) -> None:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()

+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()

```
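This method is not new logic: it is the tensor-writing loop lifted, essentially verbatim, out of `write_all` (see the removal hunk further down), so the quantization and progress-printing code has a single home that `write_all` can call.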

```diff
@@ -1082,7 +1107,7 @@ def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
         endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

         of = OutputFile(fname_out, endianess=endianess)

@@ -1120,8 +1145,11 @@ def write_all(

         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)

         # tensor info
         for name, lazy_tensor in model.items():
```
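For a `NoVocab` model, only the tokenizer-model key is written; the token list, scores, token types, and special-token metadata are all skipped, which is consistent with the `assert not isinstance(vocab, NoVocab)` added to `extract_vocabulary_from_model` above.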
```diff
@@ -1131,24 +1159,7 @@ def write_all(
         of.write_tensor_info()

         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)

         of.close()

@@ -1309,8 +1320,8 @@ def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
                 return vtype, path
         raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")

-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
```
```diff
@@ -1319,30 +1330,34 @@ def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
             n_vocab=n_vocab,
         )

-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
         vocab_type, path = self._select_file(vocab_types)
         print(f"Loading vocab file {path!r}, type {vocab_type!r}")

         added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
         if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                 path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
         else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
```
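The factory now separates path-based vocab loading from the no-vocab case. A sketch of how both paths are reached, assuming the existing `VocabFactory(path)` constructor and the comma-separated `--vocab-type` plumbing in `main()` (paths are placeholders):

```python
# Sketch: the two entry points into load_vocab().
from pathlib import Path

model_parent_path = Path("path/to/model")
factory = VocabFactory(model_parent_path)  # assumed constructor

# Normal conversion: --vocab-type "spm,hfft" is split and tried in order.
vocab, special_vocab = factory.load_vocab("spm,hfft".split(","), model_parent_path)

# With --no-vocab, main() rewrites args.vocab_type to "no_vocab",
# so the factory short-circuits to the NoVocab sentinel.
vocab, special_vocab = factory.load_vocab(["no_vocab"], model_parent_path)
assert isinstance(vocab, NoVocab)
```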
```diff
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

     args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"

     if args.dump_single:
         model_plus = lazy_load_file(args.model)
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Wrote {outfile}")
         return

-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab

         print(f"Vocab info: {vocab}")
```

gguf-py/gguf/constants.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -32,6 +32,7 @@ class General:
         FILE_TYPE            = "general.file_type"

     class LLM:
+        VOCAB_SIZE        = "{arch}.vocab_size"
         CONTEXT_LENGTH    = "{arch}.context_length"
         EMBEDDING_LENGTH  = "{arch}.embedding_length"
         BLOCK_COUNT       = "{arch}.block_count"
@@ -752,6 +753,7 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_GENERAL_FILE_TYPE    = Keys.General.FILE_TYPE

 # LLM
+KEY_VOCAB_SIZE       = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH   = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT      = Keys.LLM.BLOCK_COUNT
```
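The new key uses the same `{arch}`-templated scheme as its neighbours, so the concrete metadata name depends on the model architecture. A quick sketch of the substitution:

```python
# Sketch: the key template is filled in with the architecture name.
VOCAB_SIZE = "{arch}.vocab_size"

print(VOCAB_SIZE.format(arch="llama"))   # llama.vocab_size
print(VOCAB_SIZE.format(arch="falcon"))  # falcon.vocab_size
```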

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -321,6 +321,9 @@ def add_custom_alignment(self, alignment: int) -> None:
         self.data_alignment = alignment
         self.add_uint32(Keys.General.ALIGNMENT, alignment)

+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
     def add_context_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
```
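Writer-side usage is symmetric with the other scalar metadata setters. A minimal sketch, assuming `GGUFWriter`'s usual `(path, arch)` constructor and a metadata-only write sequence (the file name is illustrative):

```python
# Sketch: emit the new vocab-size field alongside existing metadata.
from gguf import GGUFWriter

writer = GGUFWriter("out.gguf", arch="llama")
writer.add_vocab_size(32000)     # stored as uint32 under "llama.vocab_size"
writer.add_context_length(4096)  # existing setter, same key scheme
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```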
