#!/usr/bin/env python3
- # HF falcon --> gguf conversion
+ # HF refact --> gguf conversion

from __future__ import annotations

import torch
from transformers import AutoTokenizer  # type: ignore[import]

- if 'NO_LOCAL_GGUF' not in os.environ:
-     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+ if "NO_LOCAL_GGUF" not in os.environ:
+     sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))

import gguf

@@ -31,13 +31,17 @@ def bytes_to_unicode():
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
-     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+     bs = (
+         list(range(ord("!"), ord("~") + 1))
+         + list(range(ord("¡"), ord("¬") + 1))
+         + list(range(ord("®"), ord("ÿ") + 1))
+     )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
-             cs.append(2**8+n)
+             cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))

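A quick illustration (not part of the diff; it assumes the bytes_to_unicode() defined above is in scope): the function returns a table mapping every byte value 0..255 to a printable unicode character, so the BPE machinery never sees raw whitespace/control bytes, and the converter inverts the table further down to turn token strings back into bytes.

# Sketch only: round-tripping bytes through the lookup table.
byte_encoder = bytes_to_unicode()                        # {int byte: str char}
byte_decoder = {v: k for k, v in byte_encoder.items()}   # {str char: int byte}

assert len(byte_encoder) == 256
assert byte_encoder[ord("A")] == "A"                     # printable bytes map to themselves
assert byte_decoder[byte_encoder[0x20]] == 0x20          # whitespace bytes round-trip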
@@ -54,32 +58,41 @@ def count_model_parts(dir_model: Path) -> int:


def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser(description="Convert a Refact model to a GGML compatible file")
+     parser = argparse.ArgumentParser(
+         description="Convert a Refact model to a GGML compatible file"
+     )
    parser.add_argument(
-         "--vocab-only", action="store_true",
+         "--vocab-only",
+         action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
-         "--outfile", type=Path,
+         "--outfile",
+         type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
-         "model", type=Path,
+         "model",
+         type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
-         "ftype", type=int, choices=[0, 1], default=1, nargs='?',
+         "ftype",
+         type=int,
+         choices=[0, 1],
+         default=1,
+         nargs="?",
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()

+
args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
-
-     print(f'Error: {args.model} is not a directory', file=sys.stderr)
+     print(f"Error: {args.model} is not a directory", file=sys.stderr)
    sys.exit(1)

# possible tensor data types
@@ -93,9 +106,9 @@ def parse_args() -> argparse.Namespace:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
-     fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+     fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"

- print("gguf: loading model "+dir_model.name)
+ print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
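For orientation (not part of the diff): with the argparse defaults above, an invocation along the lines of python convert-refact-hf-to-gguf.py ./refact-model 1 (script name and path assumed, not taken from this page) ends up here. A minimal sketch of how ftype picks the default output name, assuming the usual ftype_str table of ["f32", "f16"] defined elsewhere in the script:

# Sketch only; ftype_str values are assumed (0 = float32, 1 = float16).
from pathlib import Path

ftype_str = ["f32", "f16"]

dir_model = Path("./refact-model")   # hypothetical model directory
ftype = 1                            # the argparse default

fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
print(fname_out)                     # refact-model/ggml-model-f16.gguf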
@@ -108,7 +121,7 @@ def parse_args() -> argparse.Namespace:
# get number of model parts
num_parts = count_model_parts(dir_model)

- ARCH=gguf.MODEL_ARCH.REFACT
+ ARCH = gguf.MODEL_ARCH.REFACT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")
@@ -142,9 +155,9 @@ def parse_args() -> argparse.Namespace:
scores: list[float] = []
toktypes: list[int] = []

- tokenizer_json_file = dir_model / 'tokenizer.json'
+ tokenizer_json_file = dir_model / "tokenizer.json"
if not tokenizer_json_file.is_file():
-     print(f'Error: Missing {tokenizer_json_file}', file=sys.stderr)
+     print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
    sys.exit(1)

# gpt2 tokenizer
@@ -157,7 +170,11 @@ def parse_args() -> argparse.Namespace:

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
- vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
+ vocab_size = (
+     hparams["vocab_size"]
+     if "vocab_size" in hparams
+     else len(tokenizer_json["model"]["vocab"])
+ )

tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
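A small illustration with made-up numbers (not Refact's real sizes) of why vocab_size prefers the value from config.json: token ids beyond what tokenizer.json defines are filled with [PAD{i}] placeholders in the loop of the next hunk, so the token list stays the same length as the embedding tensor.

# Sketch only; the sizes below are hypothetical.
hparams = {"vocab_size": 49216}          # what config.json declares
tokenizer_vocab_len = 49152              # what tokenizer.json actually defines

vocab_size = (
    hparams["vocab_size"] if "vocab_size" in hparams else tokenizer_vocab_len
)
print(vocab_size - tokenizer_vocab_len)  # 64 ids get [PAD{i}] placeholder tokens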
@@ -176,29 +193,29 @@ def parse_args() -> argparse.Namespace:
            if ord(c) < 256:  # single byte character
                text.append(byte_decoder[ord(c)])
            else:  # multibyte special token character
-                 text.extend(c.encode('utf-8'))
+                 text.extend(c.encode("utf-8"))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)

    tokens.append(text)
-     scores.append(0.0) # dymmy
+     scores.append(0.0)  # dymmy
    toktypes.append(gguf.TokenType.NORMAL)  # dummy

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

- special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+ special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

- tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+ tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
- n_head    = hparams["n_head"]
+ n_head = hparams["n_head"]
n_head_kv = 1

head_dim = hparams["n_embd"] // n_head
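Toy numbers (not taken from Refact's config) to make the qkv parameters above concrete: n_head_kv = 1 means a single shared key/value head (multi-query attention), and head_dim is computed exactly as in the script.

# Sketch only; hparams values are hypothetical.
hparams = {"n_embd": 2048, "n_head": 32}

n_head = hparams["n_head"]
n_head_kv = 1                  # one shared K/V head (multi-query attention)
head_dim = hparams["n_embd"] // n_head

print(head_dim)                # 64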
@@ -230,7 +247,7 @@ def parse_args() -> argparse.Namespace:
        data = data.squeeze().numpy()

        # map tensor names
-         new_name = tensor_map.get_name(name, try_suffixes=(".weight", ))
+         new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()
@@ -247,10 +264,23 @@ def parse_args() -> argparse.Namespace:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
-         if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+         if (
+             ftype == 1
+             and data_dtype == np.float32
+             and name.endswith(".weight")
+             and n_dims == 2
+         ):
            data = data.astype(np.float16)

-         print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+         print(
+             new_name
+             + ", n_dims = "
+             + str(n_dims)
+             + ", "
+             + str(old_dtype)
+             + " --> "
+             + str(data.dtype)
+         )

        gguf_writer.add_tensor(new_name, data)
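A simplified sketch (my reading of the hunk above, not the converter's full dtype handling) of which tensors end up as float16 when ftype is 1: only 2-dimensional float32 ".weight" tensors are down-cast, while biases, norms and other 1-D data stay float32. The tensor names below are made up for the example.

import numpy as np

def converted_dtype(name: str, data: np.ndarray, ftype: int) -> np.dtype:
    # Mirrors the condition in the hunk above; everything else stays float32.
    if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and data.ndim == 2:
        return np.dtype(np.float16)
    return np.dtype(np.float32)

print(converted_dtype("blk.0.attn_output.weight", np.zeros((4, 4), dtype=np.float32), 1))  # float16
print(converted_dtype("blk.0.ffn_norm.bias", np.zeros(4, dtype=np.float32), 1))            # float32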