@@ -332,6 +332,9 @@ def load(model_plus: ModelPlus) -> Params:
 #
 
 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ def __repr__(self) -> str:
 
 
 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ def __repr__(self) -> str:
 
 
 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
@@ -553,7 +562,15 @@ def __repr__(self) -> str:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
 
 
 #
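For readers following the NoVocab addition: below is a minimal, self-contained sketch (class and function names are illustrative, not taken from convert.py) of how a sentinel vocab class plus the widened `Vocab` alias lets downstream code branch with `isinstance` instead of passing an optional value around.

```python
from typing import TypeAlias


class SpmLikeVocab:
    # Illustrative stand-in for SentencePieceVocab / HfVocab / BpeVocab.
    tokenizer_model = "llama"
    name = "spm"
    vocab_size = 32000


class NoVocab:
    tokenizer_model = "no_vocab"
    name = "no_vocab"


Vocab: TypeAlias = "SpmLikeVocab | NoVocab"


def describe(vocab: Vocab) -> str:
    # Callers keep a single Vocab-typed parameter and branch on the
    # sentinel type rather than on an Optional[...] value.
    if isinstance(vocab, NoVocab):
        return "model ships without an integrated vocabulary"
    return f"{vocab.name} vocab, {vocab.vocab_size} tokens"


print(describe(SpmLikeVocab()))  # spm vocab, 32000 tokens
print(describe(NoVocab()))       # model ships without an integrated vocabulary
```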
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
         )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab
 
     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
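The reworked error message leans on two small Python details: a conditional expression inside an f-string is evaluated lazily, and an f-string may nest another f-string as long as the quote characters differ. A tiny runnable sketch (the classes here are made up for illustration):

```python
class WithSize:
    vocab_size = 32000  # stands in for a real vocab class


class WithoutSize:
    pass                # stands in for a NoVocab-style sentinel


def size_hint(vocab: object) -> str:
    # The inner f-string only runs when the attribute exists, so objects
    # without vocab_size never trigger an AttributeError.
    return (
        "The model's vocab size is set to -1 in params.json. Please update it manually."
        f"{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
    )


print(size_hint(WithSize()))     # ... Please update it manually. Maybe 32000?
print(size_hint(WithoutSize()))  # ... Please update it manually.
```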
@@ -977,6 +996,7 @@ def add_meta_arch(self, params: Params) -> None:
             name = str(params.path_model.parent).split('/')[-1]
 
         self.gguf.add_name(name)
+        self.gguf.add_vocab_size(params.n_vocab)
         self.gguf.add_context_length(params.n_ctx)
         self.gguf.add_embedding_length(params.n_embd)
         self.gguf.add_block_count(params.n_layer)
@@ -1013,21 +1033,9 @@ def add_meta_arch(self, params: Params) -> None:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
         tokens = []
         scores = []
         toktypes = []
@@ -1043,11 +1051,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
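The deleted handle_tokenizer_model() table is replaced by a `tokenizer_model` class attribute on each vocab class. A small self-contained comparison of the two dispatch styles, with toy classes rather than the real ones:

```python
class SpmLike:
    tokenizer_model = "llama"


class BpeLike:
    tokenizer_model = "gpt2"


# Before: a central type-to-string table that must list every vocab class.
def tokenizer_model_via_table(vocab) -> str:
    table = {SpmLike: "llama", BpeLike: "gpt2"}
    model = table.get(type(vocab))
    if model is None:
        raise ValueError("Unknown vocab type: Not supported")
    return model


# After: the class itself carries the answer, so a new vocab type (such as
# NoVocab) only declares its own attribute and the writer needs no changes.
def tokenizer_model_via_attr(vocab) -> str:
    return vocab.tokenizer_model


print(tokenizer_model_via_table(BpeLike()))  # gpt2
print(tokenizer_model_via_attr(BpeLike()))   # gpt2
```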
@@ -1074,6 +1079,26 @@ def write_meta(self) -> None:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()
 
+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()
 
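The progress line moved here uses a nested format spec, `{i + 1:{padi}d}`, so the counter's field width is computed from the tensor count and the log columns stay aligned. A quick runnable illustration with invented tensor names:

```python
model = {f"blk.{i}.attn_q.weight": None for i in range(12)}

padi = len(str(len(model)))  # digits needed for the largest index, here 2
for i, name in enumerate(model):
    # {i + 1:{padi}d} takes its field width from padi at runtime.
    print(f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:24s}")
```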
@@ -1082,7 +1107,7 @@ def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
         endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1120,8 +1145,11 @@ def write_all(
 
         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
 
         # tensor info
         for name, lazy_tensor in model.items():
@@ -1131,24 +1159,7 @@ def write_all(
         of.write_tensor_info()
 
         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)
 
         of.close()
 
@@ -1309,8 +1320,8 @@ def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
                 return vtype, path
         raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
@@ -1319,30 +1330,34 @@ def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
         vocab_type, path = self._select_file(vocab_types)
         print(f"Loading vocab file {path!r}, type {vocab_type!r}")
 
         added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
         if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                 path.parent, added_tokens_path if added_tokens_path.exists() else None
            )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
         else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
 
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
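_create_vocab_by_path() turns the old elif chain into early returns with a trailing raise, and load_vocab() now only decides between the NoVocab sentinel and the path-based factory. A stripped-down sketch of that control flow; strings stand in for the real vocab objects, and the real code first probes the filesystem via _select_file() to pick the type rather than taking it directly:

```python
def create_vocab_by_path(vocab_type: str) -> str:
    # Each branch returns immediately; falling off the end means the caller
    # asked for a type this factory does not know.
    if vocab_type == "bpe":
        return "BpeVocab(...)"
    if vocab_type == "spm":
        return "SentencePieceVocab(...)"
    if vocab_type == "hfft":
        return "HfVocab(...)"
    raise ValueError(vocab_type)


def load_vocab(vocab_types: list[str]) -> str:
    if len(vocab_types) == 1 and "no_vocab" in vocab_types:
        return "NoVocab()"
    return create_vocab_by_path(vocab_types[0])


print(load_vocab(["no_vocab"]))  # NoVocab()
print(load_vocab(["spm"]))       # SentencePieceVocab(...)
```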
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
 
     args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"
 
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
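The new --no-vocab flag interacts with the existing options purely at the argparse level: it rejects --vocab-only and rewrites vocab_type so the rest of main() follows the NoVocab path. A minimal reproduction of that handling, declaring only the relevant options:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--vocab-only", action="store_true")
parser.add_argument("--no-vocab", action="store_true")
parser.add_argument("--vocab-type", default="spm,hfft")

args = parser.parse_args(["--no-vocab"])
if args.no_vocab:
    if args.vocab_only:
        raise ValueError("no need to specify --vocab-only if using --no-vocab")
    # Overriding vocab_type steers load_vocab() to the NoVocab branch.
    args.vocab_type = "no_vocab"

print(args.vocab_type)  # no_vocab
```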
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Wrote {outfile}")
         return
 
-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
 
     print(f"Vocab info: {vocab}")