
Commit a836529

Merge 'origin/master' into hipblas

2 parents 85f902d + 254a7a7

32 files changed: 7,749 additions (+) and 736 deletions (-)

.devops/full.Dockerfile

Lines changed: 2 additions & 0 deletions
@@ -16,4 +16,6 @@ COPY . .
 
 RUN make
 
+ENV LC_ALL=C.utf8
+
 ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/main.Dockerfile

Lines changed: 2 additions & 0 deletions
@@ -15,4 +15,6 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
 
 COPY --from=build /app/main /main
 
+ENV LC_ALL=C.utf8
+
 ENTRYPOINT [ "/main" ]

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -464,6 +464,9 @@ target_link_libraries(llama PRIVATE
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    if (LLAMA_METAL)
+        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+    endif()
 endif()
 
 if (GGML_SOURCES_CUDA)

Makefile

Lines changed: 5 additions & 0 deletions
@@ -107,6 +107,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
     # Usage AVX-only
     #CFLAGS   += -mfma -mf16c -mavx
     #CXXFLAGS += -mfma -mf16c -mavx
+
+    # Usage SSSE3-only (Not is SSE3!)
+    #CFLAGS   += -mssse3
+    #CXXFLAGS += -mssse3
 endif
 
 ifneq ($(filter ppc64%,$(UNAME_M)),)
@@ -123,6 +127,7 @@ endif
 
 ifndef LLAMA_NO_K_QUANTS
     CFLAGS   += -DGGML_USE_K_QUANTS
+    CXXFLAGS += -DGGML_USE_K_QUANTS
     OBJS     += k_quants.o
 endif
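
The CXXFLAGS addition matters because, presumably, the k-quants guard is also checked from C++ translation units such as llama.cpp; with the define only in CFLAGS, a C++ file would silently compile the non-k-quants path. A minimal sketch of the pattern, assuming a hypothetical consumer file (the symbol name below is illustrative, not from the repository):

    // k_quants_check.cpp -- illustrative only, not a file from the repository
    #ifdef GGML_USE_K_QUANTS
    static const bool k_quants_enabled = true;   // seen once -DGGML_USE_K_QUANTS reaches this translation unit
    #else
    static const bool k_quants_enabled = false;  // what a C++ file saw before CXXFLAGS carried the define
    #endif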

README.md

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ Building the program with BLAS support may lead to some performance improvements
 
 - #### BLIS
 
-  Check [BLIS.md](BLIS.md) for more information.
+  Check [BLIS.md](docs/BLIS.md) for more information.
 
 - #### Intel MKL

SHA256SUMS

Lines changed: 4 additions & 4 deletions
@@ -1,14 +1,14 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin
+ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin
+fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
@@ -18,7 +18,7 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin
+d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
@@ -32,7 +32,7 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin
+cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
+    add_subdirectory(train-text-from-scratch)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()

examples/baby-llama/baby-llama.cpp

Lines changed: 11 additions & 6 deletions
@@ -79,34 +79,39 @@ struct ggml_tensor * randomize_tensor_normal(
         int ndims,
         const int64_t ne[],
         struct random_normal_distribution * rnd) {
+    float scale = 1.0; // xavier
     switch (ndims) {
         case 1:
+            scale /= sqrtf(ne[0]);
             for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = frand_normal(rnd);
+                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
             }
             break;
         case 2:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i1 = 0; i1 < ne[1]; i1++) {
                 for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd);
+                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
                 }
             }
             break;
         case 3:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i2 = 0; i2 < ne[2]; i2++) {
                 for (int i1 = 0; i1 < ne[1]; i1++) {
                     for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                     }
                 }
             }
             break;
         case 4:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i3 = 0; i3 < ne[3]; i3++) {
                 for (int i2 = 0; i2 < ne[2]; i2++) {
                     for (int i1 = 0; i1 < ne[1]; i1++) {
                         for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                         }
                     }
                 }
@@ -148,8 +153,8 @@ struct llama_hparams_lora {
     uint32_t n_rot  = 64;
     uint32_t n_lora = 64;
 
-    bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+    bool operator!=(const llama_hparams_lora & other) const {
+        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
     }
 };
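
The randomize_tensor_normal change above applies Xavier-style scaling: each standard-normal sample is multiplied by 1/sqrt(ne[0]) for 1-D tensors and by 1/sqrt(ne[0]+ne[1]) otherwise, which keeps the variance of the initial weights roughly constant across layer sizes. A minimal self-contained sketch of the same idea (xavier_fill and its parameters are illustrative names, not code from the repository):

    #include <cmath>
    #include <cstdint>
    #include <random>
    #include <vector>

    // Fill a buffer with N(0,1) samples scaled by 1/sqrt(fan_in + fan_out),
    // mirroring the 2-D case in the diff above.
    static void xavier_fill(std::vector<float> & data, int64_t fan_in, int64_t fan_out, std::mt19937 & rng) {
        const float scale = 1.0f / std::sqrt((float) (fan_in + fan_out));
        std::normal_distribution<float> dist(0.0f, 1.0f);
        for (float & x : data) {
            x = scale * dist(rng);
        }
    }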

examples/common.cpp

Lines changed: 11 additions & 0 deletions
@@ -331,6 +331,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
 #else
         fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+    } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+        params.low_vram = true;
+#else
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
     } else if (arg == "--no-mmap") {
         params.use_mmap = false;
@@ -479,6 +485,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -528,6 +535,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.low_vram     = params.low_vram;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
@@ -632,6 +640,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
             case CONSOLE_COLOR_USER_INPUT:
                 fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
                 break;
+            case CONSOLE_COLOR_ERROR:
+                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
+                break;
         }
         con_st.color = color;
         fflush(con_st.out);

examples/common.h

Lines changed: 11 additions & 9 deletions
@@ -21,15 +21,16 @@
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    int32_t seed          = -1;  // RNG seed
-    int32_t n_threads     = get_num_physical_cores();
-    int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_ctx         = 512; // context size
-    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
-    int32_t main_gpu      = 0;   // the GPU that is used for scratch and small tensors
+    int32_t seed          = -1;  // RNG seed
+    int32_t n_threads     = get_num_physical_cores();
+    int32_t n_predict     = -1;  // new tokens to predict
+    int32_t n_ctx         = 512; // context size
+    int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers  = 0;   // number of layers to store in VRAM
+    int32_t main_gpu      = 0;   // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    bool    low_vram      = 0;   // if true, reduce VRAM usage at the cost of performance
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -112,7 +113,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
 enum console_color_t {
     CONSOLE_COLOR_DEFAULT=0,
     CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
+    CONSOLE_COLOR_USER_INPUT,
+    CONSOLE_COLOR_ERROR
 };
 
 struct console_state {

examples/main/README.md

Lines changed: 1 addition & 0 deletions
@@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
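
For API users rather than CLI users, the same switch is exposed through llama_context_params (see the lparams.low_vram assignment in the examples/common.cpp hunk above). A hedged sketch, assuming the llama.h API of this commit (llama_context_default_params, llama_init_from_file, llama_free); the model path and layer count are illustrative:

    #include "llama.h"

    int main() {
        llama_context_params lparams = llama_context_default_params();
        lparams.n_gpu_layers = 32;   // offload layers to the GPU as usual
        lparams.low_vram     = true; // skip the VRAM scratch buffer, trading speed for memory
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams);
        if (ctx == NULL) {
            return 1;
        }
        llama_free(ctx);
        return 0;
    }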

examples/main/main.cpp

Lines changed: 23 additions & 0 deletions
@@ -81,6 +81,9 @@ int main(int argc, char ** argv) {
     if (params.n_ctx > 2048) {
         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
     }
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -328,9 +331,29 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
+    // do one empty run to warm up the model
+    {
+        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(ctx);
+    }
+
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (embd.size() > 0) {
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            auto max_embd_size = n_ctx - 4;
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int)embd.size() > max_embd_size) {
+                auto skipped_tokens = embd.size() - max_embd_size;
+                console_set_color(con_st, CONSOLE_COLOR_ERROR);
+                printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                fflush(stdout);
+                embd.resize(max_embd_size);
+            }
+
             // infinite text generation via context swapping
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
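
The truncation guard above clamps the pending tokens to n_ctx - 4 so a single oversized input cannot overflow the context window. A standalone sketch of the same logic (truncate_to_context is a hypothetical helper, not part of the repository; %zu is used here because the skipped count is a size_t):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    using llama_token = int; // stand-in for the typedef in llama.h

    static void truncate_to_context(std::vector<llama_token> & embd, int n_ctx) {
        const int max_embd_size = n_ctx - 4; // same headroom used for --prompt/--file handling
        if ((int) embd.size() > max_embd_size) {
            const std::size_t skipped = embd.size() - (std::size_t) max_embd_size;
            std::fprintf(stderr, "<<input too long: skipped %zu token%s>>\n", skipped, skipped != 1 ? "s" : "");
            embd.resize(max_embd_size);
        }
    }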
