
Commit 366624b

Authored and committed by phymbert, with co-authors slaren, ggerganov, and ngxson
llama_model_loader: support multiple split/shard GGUFs (ggml-org#6187)
* split: support in llama_model_loader

* avoid copying the entire vector
  Co-authored-by: slaren <[email protected]>

* split: move llama_tensor_offset to llama_model_loader

* llama_model_loader: PR feedback:
  - use only one gguf_context for metadata only
  - store all ggml_context in a vector as the files and mappings
  - store all weights in a vector along with the source tensor
  - rename ctx_gguf to meta
  - rename ctx_meta to contexts

* avoid copying the entire vector

* Simplify this by making these tensors optional; switch some layer-creation tensors to optional
  Co-authored-by: Georgi Gerganov <[email protected]>

* Handle optional tensors
  Co-authored-by: Georgi Gerganov <[email protected]>

* llama_model_loader: fail if the backend cannot allocate a buffer

* fix mmap buffer management

* llama_model_loader: map the file to a backend buffer only if the allocation succeeds

* llama_model_loader: only map tensors included in the context

* llama_model_loader: minor, use the same variable name for consistency, fix spacing in type casts

* llama_model_loader: fail if any backend buffer cannot be allocated

* spacing
  Co-authored-by: slaren <[email protected]>

* fix loop over pointer
  Co-authored-by: slaren <[email protected]>

* llama_model_loader: if the declared n_tensors does not equal the number of tensors loaded from the splits, throw an exception instead of asserting

* llama_model_loader: ensure the mappings vector has the expected size

* llama_model_loader: use `at` instead of `operator[]` where the lookup should never add to the map

* llama_model_loader: immediately add each backend buffer to the model buffers so they are freed if an error occurs in the next allocation; reserve the expected size

* llama_model_loader: ensure the model mappings have enough capacity before allocating the backend buffer

* llama_model_loader: fix map -> unordered map

* llama_split_prefix: use a clearer version; pass the destination max length instead of the split path length
  Co-authored-by: Xuan Son Nguyen <[email protected]>

* llama : minor

  ggml-ci

* llama : introduce some typedef helpers

* docs: add model sharding to hot topics

* llama_model_loader: put the mapping in a unique_ptr from the moment it is allocated
  Co-authored-by: slaren <[email protected]>

* fix llama_split_prefix

---------

Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Xuan Son Nguyen <[email protected]>
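The naming scheme these changes standardize is `<prefix>-%05d-of-%05d.gguf`, produced and parsed by the public helpers `llama_split_path` and `llama_split_prefix` that this commit adds to llama.h. A minimal sketch of the round trip (the prefix and buffer sizes are illustrative, not from the commit):

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        char split_path[1024] = {0};

        // Build the path of shard 0 of 4: "mistral-7b-00001-of-00004.gguf"
        // (split_no is zero-based; the helper prints it one-based)
        llama_split_path(split_path, sizeof(split_path), "mistral-7b", /*split_no=*/0, /*split_count=*/4);
        printf("shard:  %s\n", split_path);

        // Recover the prefix "mistral-7b" from a shard path; returns 0 if the
        // name does not match the expected -%05d-of-%05d.gguf pattern.
        char split_prefix[1024] = {0};
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 0, 4)) {
            printf("prefix: %s\n", split_prefix);
        }
        return 0;
    }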
1 parent: d174faf · commit: 366624b

File tree: 4 files changed, +411 -223 lines


README.md

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
 
 ----
 
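As the new hot-topic entry says, shards are produced and re-merged with the gguf-split example; at the time of this commit the invocation was along the lines of `gguf-split --split --split-max-tensors 128 <model.gguf> <prefix>` to split and `gguf-split --merge <first-shard.gguf> <out.gguf>` to merge (flags from that era's example; consult examples/gguf-split for the authoritative options).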

examples/gguf-split/gguf-split.cpp

Lines changed: 66 additions & 83 deletions
@@ -1,31 +1,34 @@
 #include "llama.h"
-#include "ggml.h"
 #include "common.h"
 
 #include <algorithm>
 #include <cmath>
-#include <cstdint>
 #include <cstdlib>
 #include <fstream>
-#include <ios>
 #include <string>
 #include <vector>
 
 #include <stdio.h>
-#include <fcntl.h>
 #include <string.h>
+#include <climits>
+#include <stdexcept>
+
+#if defined(_WIN32)
+    #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
+    #include <io.h>
+#endif
 
 enum split_operation : uint8_t {
     SPLIT_OP_SPLIT,
     SPLIT_OP_MERGE,
 };
 
-static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
-static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
-
-static const int SPLIT_FILENAME_MAX = 256;
-
-static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
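Since the new `split.*` markers are ordinary GGUF key/value pairs, any tool can probe a shard with the plain gguf API (declared in ggml.h at this point in history). A hedged sketch of reading them back, mirroring what gguf_merge() below and llama_model_loader do; the function name and error handling are illustrative:

    #include "ggml.h" // the gguf API lived in ggml.h in this era
    #include <stdint.h>
    #include <stdio.h>

    // Hypothetical probe of one shard's split metadata.
    static int print_split_kv(const char * fname) {
        struct gguf_init_params params = {
            /*.no_alloc = */ true, // metadata only, do not load tensor data
            /*.ctx      = */ NULL,
        };
        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (!ctx) {
            return -1;
        }

        int key = gguf_find_key(ctx, "split.no");
        uint16_t i_split = key >= 0 ? gguf_get_val_u16(ctx, key) : 0;

        key = gguf_find_key(ctx, "split.count");
        uint16_t n_split = key >= 0 ? gguf_get_val_u16(ctx, key) : 0;

        printf("%s: shard %u of %u\n", fname, (unsigned) i_split + 1, (unsigned) n_split);
        gguf_free(ctx);
        return 0;
    }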
@@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
     try {
         if (!split_params_parse_ex(argc, argv, params)) {
             split_print_usage(argv[0]);
-            exit(1);
+            exit(EXIT_FAILURE);
         }
     }
     catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         split_print_usage(argv[0]);
-        exit(1);
+        exit(EXIT_FAILURE);
     }
     return result;
 }
@@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-static std::string split_file_name(const std::string & path, int i_split, int n_split) {
-    char f_split[SPLIT_FILENAME_MAX] = {0};
-    snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
-    return std::string(f_split);
-}
-
 struct split_strategy {
     const split_params params;
     std::ifstream & f_input;
@@ -180,19 +177,21 @@
         if (i_split == 0) {
             gguf_set_kv(ctx_out, ctx_gguf);
         }
-        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
-        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
+        gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
 
         // populate the original tensors, so we get an initial metadata
         for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
             struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
             gguf_add_tensor(ctx_out, meta);
         }
 
-        auto split_name = split_file_name(params.output, i_split, n_split);
+        char split_path[PATH_MAX] = {0};
+        llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
 
-        fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
-        fout = std::ofstream(split_name, std::ios::binary);
+        fprintf(stderr, "%s: %s ...", __func__, split_path);
+        fout = std::ofstream(split_path, std::ios::binary);
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
         auto meta_size = gguf_get_meta_size(ctx_out);
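Two details worth noting in this hunk: the split index and count move from u8 to u16, lifting the previous 255-shard ceiling, and the new split.tensors.count KV records the total tensor count so the loader can verify, after reading every shard, that no tensor went missing (the commit message's "throw an exception instead of asserting" item).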
@@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) {
     std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
     if (!f_input.is_open()) {
         fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
-        exit(1);
+        exit(EXIT_FAILURE);
     }
 
     auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
     if (!ctx_gguf) {
         fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
-        exit(1);
+        exit(EXIT_FAILURE);
     }
 
     split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
+
+    char first_split_path[PATH_MAX] = {0};
+    llama_split_path(first_split_path, sizeof(first_split_path),
+                     split_params.output.c_str(), strategy.i_split, strategy.n_split);
     fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
             __func__, split_params.input.c_str(),
-            split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
+            first_split_path,
             split_params.n_split_tensors);
 
     strategy.split_start();
@@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) {
     std::vector<ggml_context *> ctx_metas;
     std::vector<gguf_context *> ctx_ggufs;
 
-    std::string split_prefix;
+    char split_path[PATH_MAX] = {0};
+    strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
+    char split_prefix[PATH_MAX] = {0};
 
     // First pass to find KV and tensors metadata
     for (int i_split = 0; i_split < n_split; i_split++) {
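Note the switch from std::string to fixed PATH_MAX buffers here: llama_split_path and llama_split_prefix are C APIs that write into caller-provided storage, and the strncpy with sizeof(split_path) - 1 over the zero-initialized buffer leaves the final byte as a guaranteed NUL terminator.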
@@ -309,87 +314,64 @@ static void gguf_merge(const split_params & split_params) {
             /*.ctx      = */ &ctx_meta,
         };
 
-        auto split_name = split_params.input;
         if (i_split > 0) {
-            split_name = split_file_name(split_prefix, i_split, n_split);
+            llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
         }
-        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
+        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
 
-        auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
+        auto * ctx_gguf = gguf_init_from_file(split_path, params);
         if (!ctx_gguf) {
             fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
-            exit(1);
+            exit(EXIT_FAILURE);
         }
         ctx_ggufs.push_back(ctx_gguf);
         ctx_metas.push_back(ctx_meta);
 
         if (i_split == 0) {
-            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
             if (key_n_split < 0) {
                 fprintf(stderr,
                         "\n%s: input file does not contain %s metadata\n",
                         __func__,
-                        LLM_KV_GENERAL_SPLIT_N_SPLIT);
+                        LLM_KV_SPLIT_COUNT);
                 gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
                 gguf_free(ctx_out);
                 fout.close();
-                exit(1);
+                exit(EXIT_FAILURE);
             }
 
-            n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
             if (n_split < 1) {
                 fprintf(stderr,
                         "\n%s: input file does not contain a valid split count %d\n",
                         __func__,
                         n_split);
                 gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
                 gguf_free(ctx_out);
                 fout.close();
-                exit(1);
+                exit(EXIT_FAILURE);
             }
 
-            // Do not trigger merge if we try to merge again the output
-            gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
-
-            // Set metadata from the first split
-            gguf_set_kv(ctx_out, ctx_gguf);
-        }
-
-        // Verify the file naming
-        {
-            int i_split_file = 0;
-            int n_split_file = 0;
-            const char * i_split_format = "-00000-of-00000.gguf";
-
-            if (split_name.size() < strlen(i_split_format)) {
-                fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
-                for (auto * _ctx_gguf : ctx_ggufs) {
-                    gguf_free(_ctx_gguf);
-                }
+            // Verify the file naming and extract split_prefix
+            if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
+                fprintf(stderr, "\n%s: unexpected input file name: %s"
+                                " i_split=%d"
+                                " n_split=%d\n", __func__,
+                                split_path, i_split, n_split);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
                 gguf_free(ctx_out);
                 fout.close();
-                exit(1);
+                exit(EXIT_FAILURE);
             }
 
-            split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
-
-            const char * split_name_c_str = split_name.c_str();
-            int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
+            // Do not trigger merge if we try to merge again the output
+            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
 
-            if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
-                fprintf(stderr, "\n%s: unexpected input file name: %s"
-                        " i_split=%d i_split_file=%d"
-                        " n_split=%d n_split_file=%d\n", __func__,
-                        split_params.input.c_str(),
-                        i_split, i_split_file,
-                        n_split, n_split_file);
-                for (auto * _ctx_gguf : ctx_ggufs) {
-                    gguf_free(_ctx_gguf);
-                }
-                gguf_free(ctx_out);
-                fout.close();
-                exit(1);
-            }
+            // Set metadata from the first split
+            gguf_set_kv(ctx_out, ctx_gguf);
         }
 
         auto n_tensors = gguf_get_n_tensors(ctx_gguf);
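A subtle point in the rewritten block: gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0) overwrites the count on the in-memory first shard before gguf_set_kv copies its KVs into ctx_out, so the merged file carries split.count = 0 and will not itself be treated as a shard if fed back into --merge.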
@@ -411,18 +393,19 @@ static void gguf_merge(const split_params & split_params) {
 
     // Write tensors data
     for (int i_split = 0; i_split < n_split; i_split++) {
-        auto split_name = split_file_name(split_prefix, i_split, n_split);
-        std::ifstream f_input(split_name.c_str(), std::ios::binary);
+        llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
+        std::ifstream f_input(split_path, std::ios::binary);
         if (!f_input.is_open()) {
-            fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
-            for (auto * _ctx_gguf : ctx_ggufs) {
-                gguf_free(_ctx_gguf);
+            fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
+            for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
+                gguf_free(ctx_ggufs[i]);
+                ggml_free(ctx_metas[i]);
             }
             gguf_free(ctx_out);
             fout.close();
-            exit(1);
+            exit(EXIT_FAILURE);
         }
-        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
+        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
 
         auto * ctx_gguf = ctx_ggufs[i_split];
         auto * ctx_meta = ctx_metas[i_split];
@@ -481,8 +464,8 @@ int main(int argc, const char ** argv) {
             break;
         case SPLIT_OP_MERGE: gguf_merge(params);
             break;
-        default:split_print_usage(argv[0]);
-            exit(1);
+        default: split_print_usage(argv[0]);
+            exit(EXIT_FAILURE);
     }
 
     return 0;
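End to end, the point of the PR is that the stock loader now accepts the first shard directly. A hedged sketch against the llama.h API of this era (the model path is a placeholder; llama_load_model_from_file was later deprecated in favor of llama_model_load_from_file):

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        llama_backend_init();

        struct llama_model_params mparams = llama_model_default_params();

        // Point at the first shard; llama_model_loader reads split.count from
        // it, derives the sibling paths via llama_split_prefix/llama_split_path,
        // and maps every file.
        struct llama_model * model = llama_load_model_from_file(
                "mistral-7b-00001-of-00004.gguf", mparams);
        if (!model) {
            fprintf(stderr, "failed to load sharded model\n");
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }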
