@@ -78,6 +78,9 @@ static int g_work_group_size = 0;
#define GGML_SYCL_MMV_Y 1
#endif

+ typedef sycl::queue *queue_ptr;
+ typedef sycl::handler *handle_ptr;
+
enum ggml_sycl_backend_gpu_mode {
    SYCL_UNSET_GPU_MODE = -1,
    SYCL_SINGLE_GPU_MODE = 0,
@@ -182,17 +185,6 @@ static_assert(
#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE

#define MUL_MAT_SRC1_COL_STRIDE 128
- #define MAX_STREAMS 8
- #define SYCL_MAX_DEVICES 48
-
- static dpct::queue_ptr g_syclStreams[SYCL_MAX_DEVICES][MAX_STREAMS] = {{0}};
-
- struct ggml_tensor_extra_gpu {
-     void * data_device[SYCL_MAX_DEVICES]; // 1 pointer for each device for split
-                                           // tensors
-     dpct::event_ptr events[SYCL_MAX_DEVICES]
-                           [MAX_STREAMS]; // events for synchronizing multiple GPUs
- };

class sycl_gpu_mgr {
  public:
@@ -320,7 +312,7 @@ class sycl_gpu_mgr {
    }
};

- static sycl_gpu_mgr* g_sycl_gpu_mgr = NULL;
+ static sycl_gpu_mgr* g_sycl_gpu_mgr = new sycl_gpu_mgr(0);
static int g_device_count = -1;
static int g_all_sycl_device_count = -1;
static int g_main_device = -1;
@@ -329,31 +321,15 @@ static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static std::array<float, SYCL_MAX_DEVICES> g_default_tensor_split = {};

- static float g_tensor_split[SYCL_MAX_DEVICES] = {0};
+ static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};

static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
    SYCL_UNSET_GPU_MODE;

- struct sycl_device_capabilities {
-     int cc;                 // compute capability
-     bool vmm;               // virtual memory support
-     size_t vmm_granularity; // granularity of virtual memory
-     int device_id;
- };
-
- static sycl_device_capabilities g_device_caps[SYCL_MAX_DEVICES] = {
-     {0, false, 0, -1}};
-
- struct sycl_device_id2index {
-     int index;
- };
-
static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;

- static dpct::queue_ptr g_sycl_handles[SYCL_MAX_DEVICES] = {nullptr};
-
int get_main_device();

[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
@@ -427,25 +403,151 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
    std::exit(1);
}

- void log_ggml_var_device(
-     const char * name,
-     float * src,
-     size_t total_elements,
-     bool src_on_device);
-
- void log_ggml_var_device_fp16(
-     const char * name,
-     sycl::half* src,
-     size_t total_elements,
-     bool src_on_device);
-
- // todo: debug for crash in some case
- void print_ggml_tensor(const char * name, struct ggml_tensor * src);
-
- static int log_file_name_idx = 0;
- void log_tensor_with_cnt(
-     const char * name,
-     struct ggml_tensor * src,
-     int stop_cnt);
+ //////////////////////
+
+ struct ggml_sycl_device_info {
+     int device_count;
+
+     struct sycl_device_info {
+         int cc;      // compute capability
+         // int nsm;  // number of streaming multiprocessors
+         // size_t smpb; // max. shared memory per block
+         bool vmm;    // virtual memory support
+         size_t total_vram;
+     };
+
+     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
+
+     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
+ };
+
+ const ggml_sycl_device_info & ggml_sycl_info();
+
+ struct ggml_sycl_pool {
+     virtual ~ggml_sycl_pool() = default;
+
+     virtual void * alloc(size_t size, size_t * actual_size) = 0;
+     virtual void free(void * ptr, size_t size) = 0;
+ };
+
+ template <typename T>
+ struct ggml_sycl_pool_alloc {
+     ggml_sycl_pool * pool = nullptr;
+     T * ptr = nullptr;
+     size_t actual_size = 0;
+
+     explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
+     }
+
+     ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
+         alloc(size);
+     }
+
+     ~ggml_sycl_pool_alloc() {
+         if (ptr != nullptr) {
+             pool->free(ptr, actual_size);
+         }
+     }
+
+     // size is in number of elements
+     T * alloc(size_t size) {
+         GGML_ASSERT(pool != nullptr);
+         GGML_ASSERT(ptr == nullptr);
+         ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+         return ptr;
+     }
+
+     T * alloc(ggml_sycl_pool & pool, size_t size) {
+         this->pool = &pool;
+         return alloc(size);
+     }
+
+     T * get() {
+         return ptr;
+     }
+
+     ggml_sycl_pool_alloc() = default;
+     ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
+     ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
+     ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
+     ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
+ };
+
+ // backend interface
+
+ struct ggml_tensor_extra_gpu {
+     void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
+                                                // tensors
+     dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
+                           [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+ };
+
+ struct ggml_backend_sycl_context {
+     int device;
+     std::string name;
+
+     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
+     queue_ptr sycl_handles[GGML_SYCL_MAX_DEVICES] = {nullptr};
+
+     explicit ggml_backend_sycl_context(int device) :
+         device(device),
+         name(GGML_SYCL_NAME + std::to_string(device)) {
+     }
+
+     ~ggml_backend_sycl_context() {
+         for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
+             for (int j = 0; j < GGML_SYCL_MAX_STREAMS; ++j) {
+                 if (qptrs[i][j] != nullptr) {
+                     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_current_device().destroy_queue(qptrs[i][j])));
+                 }
+             }
+             if (sycl_handles[i] != nullptr) {
+                 sycl_handles[i] = nullptr; // handle aliases stream 0, already released above
+             }
+         }
+     }
+
+     queue_ptr stream(int device, int stream) {
+         if (qptrs[device][stream] == nullptr) {
+             SYCL_CHECK(CHECK_TRY_ERROR(qptrs[device][stream] =
+                 dpct::get_current_device().create_queue(
+                     g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+         }
+         return qptrs[device][stream];
+     }
+
+     queue_ptr stream() {
+         return stream(device, 0);
+     }
+
+     queue_ptr sycl_handle(int device) {
+         if (sycl_handles[device] == nullptr) {
+             const dpct::queue_ptr stream = qptrs[device][0];
+             // create sycl handle (the queue itself doubles as the handle)
+             SYCL_CHECK(CHECK_TRY_ERROR(sycl_handles[device] = stream));
+         }
+         return sycl_handles[device];
+     }
+
+     queue_ptr sycl_handle() {
+         return sycl_handle(device);
+     }
+
+     // pool
+     std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+
+     static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
+
+     ggml_sycl_pool & pool(int device) {
+         if (pools[device] == nullptr) {
+             pools[device] = new_pool_for_device(qptrs[device][0], device);
+         }
+         return *pools[device];
+     }
+
+     ggml_sycl_pool & pool() {
+         return pool(device);
+     }
+ };
+

#endif // GGML_SYCL_COMMON_HPP
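Usage note (not part of the diff): the per-context pool and stream replace the old g_syclStreams / g_sycl_handles globals. Below is a minimal sketch of how a launcher is expected to use them; the include path and the scale_rows helper are illustrative only, and it assumes the SYCL device manager has been initialized and a ggml_backend_sycl_context constructed for the active device.

#include "ggml-sycl/common.hpp"   // assumed location of the header shown above

// Hypothetical helper: scales n floats through a pooled scratch buffer.
static void scale_rows(ggml_backend_sycl_context & ctx,
                       const float * src, float * dst, size_t n, float factor) {
    queue_ptr q = ctx.stream();   // stream 0 of ctx.device, created lazily

    // Scratch memory comes from the context's pool; ggml_sycl_pool_alloc
    // returns it to the pool automatically when tmp goes out of scope.
    ggml_sycl_pool_alloc<float> tmp(ctx.pool(), n);

    q->memcpy(tmp.get(), src, n * sizeof(float)).wait();
    q->parallel_for(sycl::range<1>(n), [factor, t = tmp.get()](sycl::id<1> i) {
        t[i] *= factor;
    }).wait();
    q->memcpy(dst, tmp.get(), n * sizeof(float)).wait();
}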