@@ -196,6 +196,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_repeat_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
     vk_pipeline pipeline_norm_f32;
     vk_pipeline pipeline_group_norm_f32;
     vk_pipeline pipeline_rms_norm_f32;
@@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         std::lock_guard<std::mutex> guard(compile_count_mutex);
         assert(compile_count > 0);
         compile_count--;
+
+        // "Progress bar" for shader compiles
+        static uint32_t total_compile_count = 0;
+        if ((total_compile_count++ % 10) == 0) {
+            std::cerr << ".";
+        }
     }
     compile_count_cond.notify_all();
 }
@@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

+    std::cerr << "ggml_vulkan: Compiling shaders";
+
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
@@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
@@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
+    std::cerr << "Done!" << std::endl;
 }

 static vk_device ggml_vk_get_device(size_t idx) {
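Taken together, the three stderr writes above act as a single-line progress indicator while shaders compile: a prefix when compilation starts, a dot for every ten pipelines, and a terminator once all futures complete. Assuming a device that compiles a few hundred pipelines, the output would look roughly like this (the exact number of dots depends on the device and build options):

ggml_vulkan: Compiling shaders..............................Done!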
@@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

-static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return ctx->device->pipeline_cpy_f32_f32;
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
+
+    // Choose "contiguous copy" shader if src/dst are contiguous
+    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
+
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f32;
+        }
     }
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f32_f16;
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f16;
+        }
     }
-    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f16_f16;
+    if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f16_f16;
+        }
     }

-    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
+    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
     GGML_ABORT("fatal error");
 }

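The new contig flag hinges on ggml_is_contiguous(). For the non-quantized F32/F16 tensors this function handles, the check amounts to roughly the following sketch; is_contiguous_sketch is a hypothetical name for illustration, not the actual ggml implementation, and quantized block types are ignored here:

// Rough sketch: a tensor is contiguous when the innermost stride equals
// the element size and every outer stride is the previous stride times
// the previous dimension.
static bool is_contiguous_sketch(const ggml_tensor * t) {
    if (t->nb[0] != ggml_type_size(t->type)) {
        return false;
    }
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        if (t->nb[i] != t->nb[i - 1] * t->ne[i - 1]) {
            return false;
        }
    }
    return true;
}

When this holds for src (and dst, if one is given), the copy can treat the data as a flat array and the cheaper contig_cpy shader is selected.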
@@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
     const int tensor_type_size = ggml_type_size(tensor->type);

     const uint32_t ne = ggml_nelements(tensor);
+    std::array<uint32_t, 3> elements;
+
+    if (ne > 262144) {
+        elements = { 512, 512, CEIL_DIV(ne, 262144) };
+    } else if (ne > 512) {
+        elements = { 512, CEIL_DIV(ne, 512), 1 };
+    } else {
+        elements = { ne, 1, 1 };
+    }

     const vk_op_unary_push_constants pc = {
         (uint32_t)ne,
@@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         0.0f, 0.0f,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
 }

 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
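Instead of dispatching the whole element count along x, the hunk above folds it into a 3D grid, presumably to keep each dispatch dimension within Vulkan's per-dimension workgroup-count limits for very large tensors while matching the 512-wide workgroup configuration used when these copy pipelines are created. A minimal standalone sketch of the same splitting logic, with hypothetical split_dispatch and ceil_div names standing in for the diff's inline code and CEIL_DIV macro:

#include <array>
#include <cstdint>

// Fold a 1D element count into a 3D dispatch: x is capped at 512 elements,
// y at 512 slices of 512, and any remainder spills into z (262144 == 512 * 512).
static std::array<uint32_t, 3> split_dispatch(uint32_t ne) {
    auto ceil_div = [](uint32_t a, uint32_t b) { return (a + b - 1) / b; };
    if (ne > 262144) {
        return { 512, 512, ceil_div(ne, 262144) };
    }
    if (ne > 512) {
        return { 512, ceil_div(ne, 512), 1 };
    }
    return { ne, 1, 1 };
}

For example, split_dispatch(1048576) yields { 512, 512, 4 }, so the shader sees the full element count through the push constants while no single dispatch dimension grows unbounded.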
@@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     vk_pipeline to_fp16_vk_1 = nullptr;

     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_1 = nullptr;

     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
+        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_norm_f32;
@@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_DIV:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
-    case GGML_OP_SCALE:
     case GGML_OP_SQR:
     case GGML_OP_SIN:
     case GGML_OP_COS: