Commit affe946

Merge pull request #104 from yirongjie/main

perf: add AArch64 GEMM/GEMV for q4_0.

2 parents 56a0603 + 6e941b5

File tree: 81 files changed (+4913, -1232 lines)


CMakeLists.txt

Lines changed: 7 additions & 1 deletion
@@ -21,6 +21,11 @@ endif ()
 
 if (ARM)
     set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
+    add_compile_definitions(__ARM_FEATURE_DOTPROD)
+    # Check whether the compiler is GCC or Clang
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod")
+    endif()
 else ()
     set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin)
 endif ()
@@ -96,7 +101,8 @@ endif ()
 if (QUANT)
     include_directories(${PROJECT_SOURCE_DIR}/src/quantizer)
     file(GLOB_RECURSE MLLM_QUANT
-
+        ${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.hpp
+        ${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.cpp
         ${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.hpp
         ${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.cpp
     )
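
The `-march=armv8.2-a+dotprod` flag (together with the `__ARM_FEATURE_DOTPROD` definition) exposes the Armv8.2 SDOT/UDOT dot-product instructions that the new q4_0 GEMM/GEMV kernels build on. As a minimal sketch of what the feature buys (`dot_i8x16` is a hypothetical helper, not code from this commit), the guarded intrinsic and its pre-dotprod fallback look like this:

```cpp
#include <arm_neon.h>

// Accumulate a 16-way int8 dot product into 4 int32 lanes.
// With +dotprod, vdotq_s32 lowers to a single SDOT instruction
// (4 groups of 4 int8 multiply-accumulates); without it we must
// widen to int16 and pairwise-accumulate, costing several ops.
static inline int32x4_t dot_i8x16(int32x4_t acc, int8x16_t a, int8x16_t b) {
#if defined(__ARM_FEATURE_DOTPROD)
    return vdotq_s32(acc, a, b);
#else
    int16x8_t lo = vmull_s8(vget_low_s8(a), vget_low_s8(b));
    int16x8_t hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    return vpadalq_s16(vpadalq_s16(acc, lo), hi);
#endif
}
```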

examples/demo_imagebind_1mod.cpp

Lines changed: 35 additions & 31 deletions
@@ -13,53 +13,57 @@ int main(int argc, char **argv) {
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
     cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
+    cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 10);
+    cmdParser.add<string>("modality", 'o', "inference modality (text/vision/audio/all)", false, "all");
     cmdParser.parse_check(argc, argv);
 
     string vocab_path = cmdParser.get<string>("vocab");
     string model_path = cmdParser.get<string>("model");
     string merges_path = cmdParser.get<string>("merges");
+    int loop_times = cmdParser.get<int>("loop_times");
+    string modality = cmdParser.get<string>("modality");
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     auto processor = ImagebindProcessor(vocab_path, merges_path);
-
     ImagebindConfig config("huge");
 
-    int loop_times = 10;
-
-    // auto input_tensors = processor.process(
-    //     {"a dog.", "A car", "A bird"},config.max_position_embeddings,
-    //     {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw,
-    //     {"../assets/dog_audio.wav", "../assets/car_audio.wav", "../assets/bird_audio.wav"});
-
     auto input_tensors = processor.process(
-        {"a dog."},config.max_position_embeddings,
+        {"a dog."}, config.max_position_embeddings,
         {"../assets/dog_image.jpg"}, config.img_hw,
         {"../assets/dog_audio.wav"});
-
-    std::cout<<"Text| input_shape:["<<input_tensors.text_tensors.batch()<<", "<<input_tensors.text_tensors.sequence()<<", "<<input_tensors.text_tensors.head()<<", "<<input_tensors.text_tensors.dimension()<<"]"<<std::endl;
-    auto text_model = ImagebindTextModel(config);
-    text_model.load(model_path);
-    for (int step = 0; step < loop_times; step++) {
-        auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
+
+    if (modality == "text" || modality == "all") {
+        std::cout << "Text| input_shape:[" << input_tensors.text_tensors.batch() << ", " << input_tensors.text_tensors.sequence() << ", " << input_tensors.text_tensors.head() << ", " << input_tensors.text_tensors.dimension() << "]" << std::endl;
+        auto text_model = ImagebindTextModel(config);
+        text_model.load(model_path);
+        for (int step = 0; step < loop_times; step++) {
+            auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
+        }
+        text_model.profiling();
+        text_model.free();
     }
-    text_model.profiling();
-    text_model.free();
 
-    std::cout<<"Vision| input_shape:["<<input_tensors.img_tensors.batch()<<", "<<input_tensors.img_tensors.channel()<<", "<<input_tensors.img_tensors.time()<<", "<<input_tensors.img_tensors.height()<<", "<<input_tensors.img_tensors.width()<<"]"<<std::endl;
-    auto vision_model = ImagebindVisionModel(config);
-    vision_model.load(model_path);
-    for (int step = 0; step < loop_times; step++) {
-        auto result = vision_model({input_tensors.img_tensors});
+    if (modality == "vision" || modality == "all") {
+        std::cout << "Vision| input_shape:[" << input_tensors.img_tensors.batch() << ", " << input_tensors.img_tensors.channel() << ", " << input_tensors.img_tensors.time() << ", " << input_tensors.img_tensors.height() << ", " << input_tensors.img_tensors.width() << "]" << std::endl;
+        auto vision_model = ImagebindVisionModel(config);
+        vision_model.load(model_path);
+        for (int step = 0; step < loop_times; step++) {
+            auto result = vision_model({input_tensors.img_tensors});
+        }
+        vision_model.profiling();
+        vision_model.free();
     }
-    vision_model.profiling();
-    vision_model.free();
 
-    std::cout<<"Audio| input_shape:["<<input_tensors.audio_tensors.batch()<<", "<<input_tensors.audio_tensors.sequence()<<", "<<input_tensors.audio_tensors.head()<<", "<<input_tensors.audio_tensors.dimension()<<"]"<<std::endl;
-    auto audio_model = ImagebindAudioModel(config);
-    audio_model.load(model_path);
-    for (int step = 0; step < loop_times; step++) {
-        auto result = audio_model({input_tensors.audio_tensors});
+    if (modality == "audio" || modality == "all") {
+        std::cout << "Audio| input_shape:[" << input_tensors.audio_tensors.batch() << ", " << input_tensors.audio_tensors.sequence() << ", " << input_tensors.audio_tensors.head() << ", " << input_tensors.audio_tensors.dimension() << "]" << std::endl;
+        auto audio_model = ImagebindAudioModel(config);
+        audio_model.load(model_path);
+        for (int step = 0; step < loop_times; step++) {
+            auto result = audio_model({input_tensors.audio_tensors});
+        }
+        audio_model.profiling();
+        audio_model.free();
     }
-    audio_model.profiling();
-    audio_model.free();
+
+    return 0;
 }

examples/demo_llama.cpp

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ int main(int argc, char **argv) {
             chatPostProcessing(out_token, input_tensor, {});
         }
         printf("\n");
+        model.profiling();
     }
 
     return 0;

examples/demo_qwen.cpp

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
     cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
-    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-0.5b-q4_k.mllm");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
     cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     auto tokenizer = QWenTokenizer(vocab_path, merge_path);
-    QWenConfig config(tokens_limit, "0.5B", RoPEType::HFHUBROPE);
+    QWenConfig config(tokens_limit, "1.8B", RoPEType::HFHUBROPE);
     auto model = QWenForCausalLM(config);
     model.load(model_path);

examples/demo_yi.cpp

Lines changed: 6 additions & 6 deletions
@@ -9,9 +9,9 @@
  *
  */
 #include "cmdline.h"
-#include "models/yi/configuration_yi.hpp"
-#include "models/yi/modeling_yi.hpp"
-#include "models/yi/tokenization_yi.hpp"
+#include "models/llama/configuration_llama.hpp"
+#include "models/llama/modeling_llama.hpp"
+#include "models/llama/tokenization_llama.hpp"
 #include "processor/PostProcess.hpp"
 
 using namespace mllm;
@@ -29,9 +29,9 @@ int main(int argc, char **argv) {
     int tokens_limit = cmdParser.get<int>("limits");
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
-    auto tokenizer = YiTokenizer(vocab_path);
-    YiConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE);
-    auto model = YiForCausalLM(config);
+    auto tokenizer = LLaMATokenizer(vocab_path, false);
+    LLaMAConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE, 64000);
+    auto model = LLaMAModel(config);
     model.load(model_path);
 
     vector<string> in_strs = {

examples/main_alpaca.cpp

Lines changed: 2 additions & 2 deletions
@@ -51,8 +51,8 @@ NetTensor *Attention( NetTensor * x, int embedding_size, int hidden_size, int he
     v = _KVCache( {v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
     qk = *qk/std::sqrt(hidden_size);
-    qk = _Causalmask( {qk}, name + ".mask");
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask( {qk}, name + ".mask");
+    qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
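
This pattern repeats across the example graphs below: the standalone `_Causalmask` op is removed and `_Softmax` gains a boolean argument that applies causal masking inside the softmax itself (`true` on autoregressive text paths, `false` on vision/audio paths). A hedged sketch of the row-wise computation such a fused op performs (`softmax_row` is a hypothetical helper, not mllm's actual kernel):

```cpp
#include <algorithm>
#include <cmath>

// Softmax over one attention row of kv_len scores for the query at
// position q_pos. With do_causal_mask set, keys after q_pos are forced
// to -inf, so they contribute exp(-inf) = 0 to the normalizer.
void softmax_row(float *row, int kv_len, int q_pos, bool do_causal_mask) {
    float maxv = -INFINITY;
    for (int i = 0; i < kv_len; i++) {
        if (do_causal_mask && i > q_pos) row[i] = -INFINITY;
        maxv = std::max(maxv, row[i]);
    }
    float sum = 0.0f;
    for (int i = 0; i < kv_len; i++) {
        row[i] = std::exp(row[i] - maxv);
        sum += row[i];
    }
    for (int i = 0; i < kv_len; i++) row[i] /= sum;
}
```

Fusing the mask into the softmax avoids materializing and re-reading a separate mask tensor between the two ops.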

examples/main_clip.cpp

Lines changed: 4 additions & 2 deletions
@@ -45,9 +45,11 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
     auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
     qk = _Scale( {qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
     if(name.find("text_model") != std::string::npos){
-        qk = _Causalmask( {qk}, name + ".mask");
+        // qk = _Causalmask( {qk}, name + ".mask");
+        qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
+    } else{
+        qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
     }
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");

examples/main_fuyu.cpp

Lines changed: 2 additions & 2 deletions
@@ -102,8 +102,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
     v = _KVCache({v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul({q, k}, false, true, name + ".qk");
     qk = _Scale({qk}, 1.0F / std::sqrt(head_size), 0.0F, false, name + ".scale");
-    qk = _Causalmask({qk}, name + ".mask");
-    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask({qk}, name + ".mask");
+    qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear({o}, hidden_size * head_size, embedding_size, true, name + ".dense");

examples/main_imagebind.cpp

Lines changed: 5 additions & 4 deletions
@@ -118,9 +118,10 @@ NetTensor *Attention(Context *c,NetTensor *x, int embedding_size, int hidden_siz
     auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
     qk = *qk/std::sqrt(hidden_size);
     if(name.find("text") != std::string::npos){
-        qk = _Causalmask( {qk}, name + ".mask");
+        qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
+    } else{
+        qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
     }
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
@@ -227,10 +228,10 @@ void ImageBind(Context* c) {
     a = a->transpose(BATCH, SEQUENCE);
 
     auto *j1 = _Matmul( {p, i}, false, true, "final.vision@text");
-    j1 = _Softmax( {j1}, DIMENSION, "final.vision@text.softmax");
+    j1 = _Softmax( {j1}, DIMENSION, false, "final.vision@text.softmax");
 
     auto *j2 = _Matmul( {p, a}, false, true, "final.vision@audio");
-    j2 = _Softmax( {j2}, DIMENSION, "final.vision@audio.softmax");
+    j2 = _Softmax( {j2}, DIMENSION, false, "final.vision@audio.softmax");
 
     i = _Cat( {j1, j2}, BATCH, "final.cat");
 }

examples/main_llama.cpp

Lines changed: 2 additions & 2 deletions
@@ -50,8 +50,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
     v = _KVCache({v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul({q, k}, false, true, name + ".qk");
     qk = *qk / std::sqrt(hidden_size);
-    qk = _Causalmask({qk}, name + ".mask");
-    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask({qk}, name + ".mask");
+    qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear({o}, hidden_size * head_size, embedding_size, false, name + ".wo");

examples/main_llava.cpp

Lines changed: 5 additions & 4 deletions
@@ -72,8 +72,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
     v = _KVCache({v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul({q, k}, false, true, name + ".qk");
     qk = *qk / std::sqrt(hidden_size);
-    qk = _Causalmask({qk}, name + ".mask");
-    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask({qk}, name + ".mask");
+    qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear({o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
@@ -117,9 +117,10 @@ NetTensor *VisionAttention(NetTensor *x, int embedding_size, int hidden_size, in
     auto *qk = _Matmul({q, k}, false, true, name + ".qk");
     qk = _Scale({qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
     if (name.find("text_model") != std::string::npos) {
-        qk = _Causalmask({qk}, name + ".mask");
+        qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
+    } else{
+        qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
     }
-    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
     auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear({o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");

examples/main_tinyllama.cpp

Lines changed: 2 additions & 2 deletions
@@ -51,8 +51,8 @@ NetTensor *Attention( NetTensor * x, int embedding_size, int hidden_size, int he
     v = _KVCache( {v},head_size/mutil_key_value_head, cache_max, name + ".v_cache");
     auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
     qk = *qk/std::sqrt(hidden_size);
-    qk = _Causalmask( {qk}, name + ".mask");
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask( {qk}, name + ".mask");
+    qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");

examples/main_vit.cpp

Lines changed: 1 addition & 1 deletion
@@ -1089,7 +1089,7 @@ NetTensor *Attention(NetTensor * x, int embedded_size, int hidden_size, int head
     qk = *qk/std::sqrt(hidden_size);
     // qk = _Scale( {qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
     // qk = _Causalmask( {qk}, name + ".mask");
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
+    qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedded_size, true, name + ".output.dense");

include/Types.hpp

Lines changed: 56 additions & 0 deletions
@@ -56,6 +56,10 @@ enum DataType {
     MLLM_TYPE_I8,
     MLLM_TYPE_I16,
     MLLM_TYPE_I32,
+    MLLM_TYPE_Q4_0_4_4=19,
+    MLLM_TYPE_Q4_0_4_8=20,
+    MLLM_TYPE_Q4_0_8_8=21,
+    MLLM_TYPE_Q8_0_4_4,
     MLLM_TYPE_COUNT,
 };
 enum ChlType {
@@ -147,6 +151,8 @@ enum RoPEType {
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+// #define LLAMAFILE_SGEMM
+
 #if defined(__ARM_NEON) && !defined(_MSC_VER)
 typedef __fp16 mllm_fp16_t;
 #else
@@ -223,6 +229,39 @@ typedef struct {
 #pragma pack()
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K / 16 * sizeof(int16_t), "wrong q8_K block size/padding");
 
+
+#pragma pack(1)
+typedef struct {
+    mllm_fp16_t d[4];      // deltas for 4 q4_0 blocks
+    uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
+} block_q4_0x4;
+#pragma pack()
+static_assert(sizeof(block_q4_0x4) == 4 * sizeof(mllm_fp16_t) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
+
+#pragma pack(1)
+typedef struct {
+    mllm_fp16_t d[8];      // deltas for 8 q4_0 blocks
+    uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
+} block_q4_0x8;
+#pragma pack()
+static_assert(sizeof(block_q4_0x8) == 8 * sizeof(mllm_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
+
+#pragma pack(1)
+typedef struct {
+    mllm_fp16_t d[4];     // deltas for 4 q8_0 blocks
+    int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
+} block_q8_0x4;
+#pragma pack()
+static_assert(sizeof(block_q8_0x4) == 4 * sizeof(mllm_fp16_t) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
+
+#pragma pack(1)
+typedef struct {
+    mllm_fp16_t d[8];     // deltas for 8 q8_0 blocks
+    int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
+} block_q8_0x8;
+#pragma pack()
+static_assert(sizeof(block_q8_0x8) == 8 * sizeof(mllm_fp16_t) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
+
 //
 
 static string DataTypeName(DataType dataType) {
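
The `_4_4` / `_4_8` / `_8_8` suffixes follow the llama.cpp convention this layout comes from: the first number is how many q4_0 rows are interleaved into one superblock, the second is the byte width of each interleaved chunk (4 suits SDOT-style kernels, 8 suits i8mm/SVE ones). A hedged sketch of how four plain q4_0 blocks would be rearranged into one `block_q4_0x4`, mirroring llama.cpp's `make_block_q4_0x4` and assuming the pre-existing `block_q4_0` layout (`mllm_fp16_t d; uint8_t qs[QK4_0 / 2];`):

```cpp
// Interleave 4 q4_0 blocks so that one vector load in the GEMM kernel
// pulls the same chunk position from all 4 source rows at once.
static block_q4_0x4 make_block_q4_0x4(const block_q4_0 *in, int interleave /* 4 or 8 */) {
    block_q4_0x4 out;
    for (int i = 0; i < 4; i++) out.d[i] = in[i].d; // keep all 4 fp16 deltas
    // out.qs holds QK4_0 * 2 = 64 bytes: 4 blocks x 16 nibble-bytes each,
    // laid out round-robin in chunks of `interleave` bytes per block.
    for (int i = 0; i < QK4_0 * 2; i++) {
        int src_id     = (i % (4 * interleave)) / interleave;  // which source block
        int src_offset = (i / (4 * interleave)) * interleave   // which chunk of it
                       + (i % interleave);                     // byte within chunk
        out.qs[i] = in[src_id].qs[src_offset];
    }
    return out;
}
```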
@@ -251,6 +290,14 @@ static string DataTypeName(DataType dataType) {
         return "Q4_1";
     case MLLM_TYPE_Q8_1:
         return "Q8_1";
+    case MLLM_TYPE_Q4_0_4_4:
+        return "Q4_0_4_4";
+    case MLLM_TYPE_Q4_0_4_8:
+        return "Q4_0_4_8";
+    case MLLM_TYPE_Q4_0_8_8:
+        return "Q4_0_8_8";
+    case MLLM_TYPE_Q8_0_4_4:
+        return "Q8_0_4_4";
     case MLLM_TYPE_COUNT:
         return "COUNT";
     default:
@@ -281,6 +328,15 @@ static size_t DataTypeSize(DataType dtype, int count = 1) {
         return (sizeof(block_q8_K)) * count / (QK_K);
     case MLLM_TYPE_Q4_1:
     case MLLM_TYPE_Q8_1:
+        return -1;
+    case MLLM_TYPE_Q4_0_4_4:
+        return (sizeof(block_q4_0x4)) * count / (QK4_0 * 4);
+    case MLLM_TYPE_Q4_0_4_8:
+        return (sizeof(block_q4_0x8)) * count / (QK4_0 * 8);
+    case MLLM_TYPE_Q4_0_8_8:
+        return (sizeof(block_q4_0x8)) * count / (QK4_0 * 8);
+    case MLLM_TYPE_Q8_0_4_4:
+        return (sizeof(block_q8_0x4)) * count / (QK8_0 * 4);
     case MLLM_TYPE_COUNT:
         return 0;
     default:
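
Because interleaving only permutes bytes, the new formulas work out to the same bytes-per-weight as the non-interleaved types. A quick worked check under the assumptions the static_asserts above already encode (QK4_0 == QK8_0 == 32, 2-byte mllm_fp16_t):

```cpp
#include <cstdio>

int main() {
    // block_q4_0x4: 4 fp16 deltas + QK4_0 * 2 nibble bytes = 8 + 64 = 72 bytes
    // for 4 blocks x 32 weights = 128 weights, i.e. 0.5625 B/weight,
    // identical to plain q4_0 (18 bytes per 32 weights).
    printf("q4_0x4: %.4f bytes/weight\n", (4 * 2 + 32 * 2) / 128.0);
    // block_q8_0x4: 4 fp16 deltas + QK8_0 * 4 int8 quants = 8 + 128 = 136 bytes
    // for 128 weights, i.e. 1.0625 B/weight, identical to plain q8_0.
    printf("q8_0x4: %.4f bytes/weight\n", (4 * 2 + 32 * 4) / 128.0);
    return 0;
}
```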
