Skip to content

Commit eb6499c

Browse files
committed
Revert tools/mtmd/ to match master
1 parent 2f3854c commit eb6499c

File tree

7 files changed

+382
-92
lines changed

7 files changed

+382
-92
lines changed

tools/mtmd/clip-impl.h

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <climits>
66
#include <cstdarg>
7+
#include <cinttypes>
78
#include <string>
89
#include <map>
910
#include <sstream>
@@ -44,7 +45,7 @@
4445
// tensor name constants
4546
//
4647

47-
#define TN_POS_EMBD "%s.position_embd.weight"
48+
#define TN_POS_EMBD "v.position_embd.weight"
4849
#define TN_CLASS_EMBD "v.class_embd"
4950
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename this tensor with a ".0" postfix, for backward compatibility
5051
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
@@ -110,6 +111,7 @@ enum projector_type {
110111
PROJECTOR_TYPE_PIXTRAL,
111112
PROJECTOR_TYPE_QWEN25VL,
112113
PROJECTOR_TYPE_INTERNVL,
114+
PROJECTOR_TYPE_LLAMA4,
113115
PROJECTOR_TYPE_UNKNOWN,
114116
};
115117

@@ -125,6 +127,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
125127
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
126128
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
127129
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
130+
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
128131
};
129132

130133
static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -240,6 +243,11 @@ struct clip_image_u8_batch {
240243
struct clip_image_f32_batch {
241244
std::vector<clip_image_f32_ptr> entries;
242245

246+
// for llava-uhd style models, we need to know the grid size
247+
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
248+
int grid_x = 0;
249+
int grid_y = 0;
250+
243251
clip_image_f32_batch clone() const {
244252
clip_image_f32_batch new_batch;
245253
new_batch.entries.reserve(entries.size());
@@ -358,6 +366,70 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
358366
}
359367
}
360368

369+
//
370+
// debugging
371+
//
372+
373+
// Debug helper: write the tensor's name and dimensions to stdout in the form
//   <name>.shape = [d0, d1, ...]
// Only the first ggml_n_dims(t) entries of t->ne are printed.
static void print_tensor_shape(ggml_tensor * t) {
    const int n_dims = ggml_n_dims(t);

    printf("%s.shape = [", t->name);
    for (int d = 0; d < n_dims; ++d) {
        // separator goes in front of every entry except the first
        if (d > 0) {
            printf(", ");
        }
        printf("%" PRId64, t->ne[d]);
    }
    printf("]\n");
}
383+
384+
// Debug helper: dump the contents of a tensor to stdout as nested bracketed lists.
//
//   t    - tensor providing the element type (t->type), the extents (t->ne)
//          and the byte strides (t->nb) used to address `data`
//   data - raw tensor bytes; each element is read at byte offset
//          i3*nb[3] + i2*nb[2] + i1*nb[1] + i0*nb[0]
//   n    - for dimensions 0..2, when the extent is larger than 2*n only the
//          first n and last n entries are printed and the middle is replaced
//          by "...". Dimension 3 is never elided.
//          n <= 0 disables elision, so every element is printed.
//
// Supported element types are F16, F32, I32, I16 and I8; every value is
// converted to float and printed with "%8.4f". Any other type aborts.
static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
    ggml_type type = t->type;
    int64_t * ne = t->ne;
    size_t * nb = t->nb;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        printf("%s.data: [\n", t->name);
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            // The `n > 0` guard matters: with n == 0 the elision test would fire
            // at index 0 and move the index to ne[2], i.e. one past the last
            // valid entry, which the rest of the body would then read.
            if (n > 0 && i2 == n && ne[2] > 2*n) {
                printf(" ..., \n");
                i2 = ne[2] - n; // jump to the first of the trailing n entries
            }
            printf(" [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (n > 0 && i1 == n && ne[1] > 2*n) {
                    printf(" ..., \n");
                    i1 = ne[1] - n;
                }
                printf(" [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (n > 0 && i0 == n && ne[0] > 2*n) {
                        printf("..., ");
                        i0 = ne[0] - n;
                    }
                    // byte offset of element (i0, i1, i2, i3) inside `data`
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
                    } else if (type == GGML_TYPE_F32) {
                        v = *(float *) &data[i];
                    } else if (type == GGML_TYPE_I32) {
                        v = (float) *(int32_t *) &data[i];
                    } else if (type == GGML_TYPE_I16) {
                        v = (float) *(int16_t *) &data[i];
                    } else if (type == GGML_TYPE_I8) {
                        v = (float) *(int8_t *) &data[i];
                    } else {
                        // quantized / unsupported element types cannot be printed here
                        GGML_ABORT("fatal error");
                    }
                    printf("%8.4f", v);
                    if (i0 < ne[0] - 1) {
                        printf(", ");
                    }
                }
                printf("],\n");
            }
            printf(" ],\n");
        }
        printf(" ]\n");
    }
}
432+
361433
//
362434
// API used internally with mtmd
363435
//

0 commit comments

Comments
 (0)