@@ -190,8 +190,10 @@ struct llama_hparams {
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
+
     float rope_freq_base  = 10000.0f;
     float rope_freq_scale = 1.0f;
+
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
@@ -843,12 +845,12 @@ struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx                       =*/ 512,
-       /*.rope_freq_base              =*/ 10000.0f,
-       /*.rope_freq_scale             =*/ 1.0f,
        /*.n_batch                     =*/ 512,
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ {0},
+       /*.rope_freq_base              =*/ 10000.0f,
+       /*.rope_freq_scale             =*/ 1.0f,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.low_vram                    =*/ false,
@@ -968,12 +970,12 @@ static void llama_model_load_internal(
         llama_model & model,
         llama_vocab & vocab,
         int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1008,26 +1010,27 @@ static void llama_model_load_internal(
         }
 
         hparams.n_ctx = n_ctx;
-        hparams.rope_freq_base = rope_freq_base;
+
+        hparams.rope_freq_base  = rope_freq_base;
         hparams.rope_freq_scale = rope_freq_scale;
     }
 
     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 
     {
-        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
-        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
-        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
+        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
+        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
-        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
+        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
+        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1278,12 +1281,12 @@ static bool llama_model_load(
         llama_model & model,
         llama_vocab & vocab,
         int n_ctx,
-        float rope_freq_base,
-        float rope_freq_scale,
         int n_batch,
         int n_gpu_layers,
         int main_gpu,
         float * tensor_split,
+        float rope_freq_base,
+        float rope_freq_scale,
         bool low_vram,
         ggml_type memory_type,
         bool use_mmap,
@@ -1292,7 +1295,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, rope_freq_base, rope_freq_scale, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1342,9 +1345,10 @@ static bool llama_eval_internal(
     const int n_head  = hparams.n_head;
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;
+
     const float freq_base  = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
-    const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
@@ -2689,9 +2693,9 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.rope_freq_base, params.rope_freq_scale,
-                params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.low_vram, memory_type,
-                params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
         delete model;
         fprintf(stderr, "%s: failed to load model\n", __func__);