@@ -6261,7 +6261,8 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols,
                     });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        // FIXME: 1024 from cuda
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
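
The FIXME above hardcodes CUDA's usual 1024-thread cap in place of the old g_work_group_size global. As the DPCT1049 note itself suggests, SYCL can report the real per-device limit at runtime; a minimal standalone sketch of that query (not part of this patch):

#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    sycl::queue q;  // default-selected device
    // The device's hard limit on work-items per work-group; a portable
    // replacement for the hardcoded 1024 would clamp against this value.
    size_t max_wg =
        q.get_device().get_info<sycl::info::device::max_work_group_size>();
    std::printf("max work-group size: %zu\n", max_wg);
    return 0;
}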
@@ -6307,7 +6308,7 @@ static void group_norm_f32_sycl(const float *x, float *dst,
                     });
         });
     } else {
-        const int work_group_size = g_work_group_size ;
+        const int work_group_size = 1024 ;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -6396,7 +6397,7 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
                     });
         });
     } else {
-        const int work_group_size = g_work_group_size ;
+        const int work_group_size = 1024 ;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
@@ -9246,7 +9247,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
                               const int nrows_y, const float scale, const float max_bias,
                               queue_ptr stream) {
     int nth = WARP_SIZE;
-    int max_block_size = g_work_group_size ;
+    int max_block_size = 1024 ;
     while (nth < ncols_x && nth < max_block_size) nth *= 2;
     if (nth>max_block_size) nth = max_block_size;
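
For reference, the two context lines after the changed one grow the block size in powers of two from the warp size until it covers ncols_x or hits the cap. A standalone illustration of that rounding (function and variable names here are mine, not from the patch):

#include <cstdio>

// Same selection logic as in soft_max_f32_sycl: double nth starting from
// the warp size until it covers ncols_x or reaches max_block_size.
static int pick_block_size(int warp_size, int ncols_x, int max_block_size) {
    int nth = warp_size;
    while (nth < ncols_x && nth < max_block_size) nth *= 2;
    if (nth > max_block_size) nth = max_block_size;
    return nth;
}

int main() {
    std::printf("%d\n", pick_block_size(32, 500, 1024));   // prints 512
    std::printf("%d\n", pick_block_size(32, 4096, 1024));  // prints 1024
    return 0;
}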
@@ -11452,14 +11453,9 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
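
This hunk and the ones below drop the per-device ggml_tensor_extra_gpu indirection in favor of reading the tensor's own data pointer. A compilable sketch of the before/after access patterns, using stand-in struct definitions rather than the real ggml ones:

#include <cstdio>

// Stand-ins for illustration only -- not the real ggml definitions.
constexpr int MAX_DEVICES = 16;
struct extra_gpu { void * data_device[MAX_DEVICES]; };  // old per-device table
struct tensor    { void * data; void * extra; };

// Old pattern: hop through tensor->extra to the current device's pointer.
static float * device_ptr_old(const tensor * t, int device) {
    return (float *) ((extra_gpu *) t->extra)->data_device[device];
}

// New pattern: the backend buffer already placed the allocation in t->data.
static float * device_ptr_new(const tensor * t) {
    return (float *) t->data;
}

int main() {
    float buf[4] = {};
    extra_gpu e{};
    e.data_device[0] = buf;
    tensor t{buf, &e};
    // Both paths resolve to the same device allocation.
    std::printf("%p %p\n", (void *) device_ptr_old(&t, 0),
                           (void *) device_ptr_new(&t));
    return 0;
}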
@@ -11490,15 +11486,10 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
 
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     const int64_t row_stride_x = nb01 / sizeof(sycl::half);
     const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
@@ -12042,9 +12033,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
 
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
@@ -12053,11 +12041,8 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
 
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[ctx.device];
-    char * src1_ddc = (char *) src1_extra->data_device[ctx.device];
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);