Skip to content

Commit 92139b9

Browse files
authored
tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
* tests : add test-tokenizer-0.sh
* unicode : add all unicode number ranges
* starcoder : fix pre-tokenizer
* tests : add test that fails with DeepSeek tokenizers
* falcon : fix regex
* unicode : regenerate unicode tables
* refact : add tokenizer model
* lint : fix
* tests : disable failing tests (ggml-ci)
* refact : add tests files (ggml-ci)
* convert : print -> logging (ggml-ci)
* lint : fix
* unicode : digit -> number
* phi-3 : update
1 parent a2ac89d commit 92139b9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+903
-719
lines changed

.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[flake8]
22
max-line-length = 125
33
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
4-
exclude = examples/*,examples/*/**,*/**/__init__.py
4+
exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py

Makefile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
7777
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
7878
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
7979
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
80-
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
81-
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
8280
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
8381
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
8482
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
83+
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
8584
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
8685
continue; \
8786
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

convert-hf-to-gguf-update.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from enum import IntEnum, auto
3232
from transformers import AutoTokenizer
3333

34+
logging.basicConfig(level=logging.DEBUG)
3435
logger = logging.getLogger("convert-hf-to-gguf-update")
3536

3637

@@ -62,6 +63,7 @@ class TOKENIZER_TYPE(IntEnum):
6263
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
6364
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
6465
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
66+
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
6567
]
6668

6769
# make directory "models/tokenizers" if it doesn't exist
@@ -158,8 +160,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
158160
chktok = tokenizer.encode(chktxt)
159161
chkhsh = sha256(str(chktok).encode()).hexdigest()
160162
161-
print(f"chktok: {{chktok}}")
162-
print(f"chkhsh: {{chkhsh}}")
163+
logger.debug(f"chktok: {{chktok}}")
164+
logger.debug(f"chkhsh: {{chkhsh}}")
163165
164166
res = None
165167
@@ -168,22 +170,22 @@ def get_vocab_base_pre(self, tokenizer) -> str:
168170
# don't edit the hashes manually!
169171
{src_ifs}
170172
if res is None:
171-
print("\\n")
172-
print("**************************************************************************************")
173-
print("** WARNING: The BPE pre-tokenizer was not recognized!")
174-
print("** There are 2 possible reasons for this:")
175-
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
176-
print("** - the pre-tokenization config has changed upstream")
177-
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
178-
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
179-
print("**")
180-
print(f"** chkhsh: {{chkhsh}}")
181-
print("**************************************************************************************")
182-
print("\\n")
173+
logger.warning("\\n")
174+
logger.warning("**************************************************************************************")
175+
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
176+
logger.warning("** There are 2 possible reasons for this:")
177+
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
178+
logger.warning("** - the pre-tokenization config has changed upstream")
179+
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
180+
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
181+
logger.warning("**")
182+
logger.warning(f"** chkhsh: {{chkhsh}}")
183+
logger.warning("**************************************************************************************")
184+
logger.warning("\\n")
183185
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
184186
185-
print(f"tokenizer.ggml.pre: {{repr(res)}}")
186-
print(f"chkhsh: {{chkhsh}}")
187+
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
188+
logger.debug(f"chkhsh: {{chkhsh}}")
187189
188190
return res
189191
"""
@@ -197,6 +199,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
197199
# generate tests for each tokenizer model
198200

199201
tests = [
202+
"ied 4 ½ months",
203+
"Führer",
200204
"",
201205
" ",
202206
" ",
@@ -281,6 +285,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
281285
for model in models:
282286
name = model["name"]
283287

284-
logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
288+
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
285289

286290
logger.info("\n")

convert-hf-to-gguf.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
308308
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
309309
# ref: https://huggingface.co/openai-community/gpt2
310310
res = "gpt-2"
311+
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
312+
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
313+
res = "refact"
311314

312315
if res is None:
313316
logger.warning("\n")
@@ -324,7 +327,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
324327
logger.warning("\n")
325328
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
326329

327-
logger.debug(f"tokenizer.ggml.pre: {res}")
330+
logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
328331
logger.debug(f"chkhsh: {chkhsh}")
329332

330333
return res

llama.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
43834383
} else if (
43844384
tokenizer_pre == "gpt-2") {
43854385
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4386+
} else if (
4387+
tokenizer_pre == "refact") {
4388+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
43864389
} else {
43874390
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
43884391
}
@@ -11952,7 +11955,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
1195211955
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
1195311956
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
1195411957
GGML_ASSERT(llama_is_byte_token(vocab, id));
11955-
const auto& token_data = vocab.id_to_token.at(id);
11958+
const auto & token_data = vocab.id_to_token.at(id);
1195611959
switch (llama_vocab_get_type(vocab)) {
1195711960
case LLAMA_VOCAB_TYPE_SPM: {
1195811961
auto buf = token_data.text.substr(3, 2);
@@ -12212,14 +12215,13 @@ struct llm_tokenizer_bpe {
1221212215
"\\s?\\p{L}+",
1221312216
"\\s?\\p{P}+",
1221412217
"[一-龥ࠀ-一가-퟿]+",
12215-
"\\p{N}+",
12218+
"\\p{N}",
1221612219
});
1221712220
break;
1221812221
case LLAMA_VOCAB_PRE_TYPE_FALCON:
1221912222
word_collection = unicode_regex_split(text, {
1222012223
"[\\p{P}\\$\\+<=>\\^~\\|]+",
1222112224
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12222-
"\\p{N}+",
1222312225
"[0-9][0-9][0-9]",
1222412226
});
1222512227
break;
@@ -12235,6 +12237,12 @@ struct llm_tokenizer_bpe {
1223512237
});
1223612238
break;
1223712239
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12240+
case LLAMA_VOCAB_PRE_TYPE_REFACT:
12241+
word_collection = unicode_regex_split(text, {
12242+
"\\p{N}",
12243+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12244+
});
12245+
break;
1223812246
case LLAMA_VOCAB_PRE_TYPE_GPT2:
1223912247
word_collection = unicode_regex_split(text, {
1224012248
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -17466,9 +17474,10 @@ int32_t llama_tokenize(
1746617474

1746717475
static std::string llama_decode_text(const std::string & text) {
1746817476
std::string decoded_text;
17469-
auto unicode_sequences = unicode_cpts_from_utf8(text);
17470-
for (auto & unicode_sequence : unicode_sequences) {
17471-
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
17477+
17478+
const auto cpts = unicode_cpts_from_utf8(text);
17479+
for (const auto cpt : cpts) {
17480+
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
1747217481
}
1747317482

1747417483
return decoded_text;

llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ extern "C" {
7979
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
8080
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
8181
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
82+
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
8283
};
8384

8485
// note: these values should be synchronized with ggml_rope

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
29464 2094 1018 1092 2706
2+
11865 17875
13

24

35

models/ggml-vocab-deepseek-coder.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-deepseek-coder.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
1050 207 19 207 19192 4217
2+
37 32009 71 6247
13

24
207
35
243

models/ggml-vocab-deepseek-llm.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-deepseek-llm.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
1052 207 19 207 19109 4223
2+
37 100014 71 6245
13

24
207
35
243

models/ggml-vocab-falcon.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-falcon.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
878 204 31 3068 133 2137
2+
28611 132 30042
13

24
204
35
258

models/ggml-vocab-gpt-2.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-gpt-2.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
798 604 25208 1933
2+
37 9116 71 11751
13

24
220
35
220 220

models/ggml-vocab-llama-bpe.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-llama-bpe.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
1142 220 19 220 27154 4038
2+
37 51853 261
13

24
220
35
256

models/ggml-vocab-llama-spm.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-llama-spm.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
474 287 29871 29946 29871 30226 7378
2+
383 4000 261
13

24
259
35
1678

models/ggml-vocab-mpt.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-mpt.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
728 577 24142 2607
2+
39 26288 6554
13

24
209
35
50276

models/ggml-vocab-phi-3.gguf

-99 Bytes
Binary file not shown.

models/ggml-vocab-phi-3.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
15

26
__ggml_vocab_test__
37

models/ggml-vocab-phi-3.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
474 287 29871 29946 29871 30226 7378
2+
383 4000 261
13

24
259
35
1678

models/ggml-vocab-refact.gguf

44 Bytes
Binary file not shown.

0 commit comments

Comments (0)