@@ -853,43 +853,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }

-                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
-                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
-                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
-                    LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);

-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
-                    hparams.n_swa = 2047;
-                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-mini-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-medium-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__);
+                if (found_swa && hparams.n_swa > 0) {
+                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");

+                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;

-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                }
-
-                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (!found_swa && hparams.n_swa == 0) {
-                    throw std::runtime_error("invalid value for sliding_window");
-                }
-
-                if (hparams.n_swa > hparams.n_ctx_train) {
-                    LLAMA_LOG_WARN("%s: unexpected n_swa: %d >= %d, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
+                    hparams.n_swa = 0;
                     hparams.n_swa_pattern = 1;
                 }
             } break;
@@ -7368,8 +7341,9 @@ struct llm_build_phi2 : public llm_graph_context {
     }
 };

-struct llm_build_phi3_iswa : public llm_graph_context {
-    llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+template<bool iswa>
+struct llm_build_phi3 : public llm_graph_context {
+    llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

@@ -7383,7 +7357,14 @@ struct llm_build_phi3_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_unified_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv_unified();
+        }

         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
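Note on the idiom used in the hunk above: `llm_build_phi3` now takes a `bool iswa` template parameter, `std::conditional_t` picks the matching attention-input type at compile time, and `if constexpr` compiles only the branch that builds it. A minimal standalone sketch of the pattern follows; the types and helpers in it (`input_swa`, `input_dense`, `make_swa_input`, `make_dense_input`) are hypothetical stand-ins, not llama.cpp API.

```cpp
#include <type_traits>

// hypothetical stand-ins for llm_graph_input_attn_kv_unified_iswa / ..._kv_unified
struct input_swa   { int n_swa = 2047; };
struct input_dense { int n_swa = 0;    };

// hypothetical stand-ins for build_attn_inp_kv_unified_iswa() / build_attn_inp_kv_unified()
static input_swa   * make_swa_input()   { static input_swa   s; return &s; }
static input_dense * make_dense_input() { static input_dense d; return &d; }

template <bool iswa>
struct builder {
    // the input type is chosen at compile time from the template flag
    using input_t = std::conditional_t<iswa, input_swa, input_dense>;

    input_t * build_input() {
        input_t * inp = nullptr;
        // only the taken branch is instantiated, so each branch may use a different type
        if constexpr (iswa) {
            inp = make_swa_input();
        } else {
            inp = make_dense_input();
        }
        return inp;
    }
};
```

With this shape, `builder<true>().build_input()` yields an `input_swa *` and `builder<false>()` an `input_dense *`, with no runtime check inside the per-layer loop.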
@@ -13232,7 +13213,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,

                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);

-                if (hparams.n_swa > 0) {
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    GGML_ASSERT(hparams.n_swa_pattern != 1);
+
                     res = new llama_kv_cache_unified_iswa(
                             *this,
                             params.type_k,
@@ -13245,6 +13228,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                             cparams.n_batch,
                             padding);
                 } else {
+                    GGML_ASSERT(hparams.n_swa_pattern == 1);
+
                     res = new llama_kv_cache_unified(
                             *this,
                             nullptr,
@@ -13353,7 +13338,11 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
             {
-                llm = std::make_unique<llm_build_phi3_iswa>(*this, params, gf);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
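The dispatch above maps a runtime property (`hparams.swa_type`) onto the compile-time `iswa` flag by branching once when the graph builder is constructed. A minimal sketch of that factory shape, using a hypothetical `graph_base`/`graph_impl` pair in place of `llm_graph_context` and `llm_build_phi3`:

```cpp
#include <memory>

struct graph_base {
    virtual ~graph_base() = default;
};

// stand-in for llm_build_phi3<iswa>: one concrete type per compile-time flag
template <bool iswa>
struct graph_impl : graph_base {};

// runtime flag -> compile-time instantiation, mirroring the if/else in build_graph
static std::unique_ptr<graph_base> make_graph(bool use_swa) {
    if (use_swa) {
        return std::make_unique<graph_impl<true>>();
    }
    return std::make_unique<graph_impl<false>>();
}
```

Both instantiations share the `graph_base` interface, so callers downstream of the factory are unaffected by which one was built.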