
Commit e1b7913

Commit message: backup

1 parent 38f09be commit e1b7913
11 files changed: +555, -1104 lines changed

ggml-sycl.cpp

Lines changed: 338 additions & 677 deletions
Large diffs are not rendered by default.

ggml-sycl.h

Lines changed: 0 additions & 2 deletions
@@ -36,8 +36,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 // TODO: these are temporary
 // ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
-GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
-GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
 
 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
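This hunk removes the single-device / multi-device mode setters from the public header. A minimal caller sketch of the remaining index-based flow follows; it assumes ggml_backend_sycl_init(int device), ggml_backend_sycl_get_device_count() and ggml_backend_free() stay declared (they are not shown in this hunk), so treat it as an illustration rather than the commit's own usage.

    // Hypothetical caller: select the SYCL device by index at init time,
    // instead of calling the removed ggml_backend_sycl_set_*_device_mode() APIs.
    #include "ggml-sycl.h"
    #include <cstdio>

    int main() {
        if (ggml_backend_sycl_get_device_count() <= 0) {
            fprintf(stderr, "no SYCL devices found\n");
            return 1;
        }
        ggml_backend_t backend = ggml_backend_sycl_init(0); // device index 0
        if (backend == nullptr) {
            fprintf(stderr, "failed to initialize the SYCL backend\n");
            return 1;
        }
        // ... build a ggml graph and run it on this backend ...
        ggml_backend_free(backend);
        return 0;
    }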

ggml-sycl/backend.hpp

Lines changed: 0 additions & 1 deletion
@@ -19,6 +19,5 @@
 #include "dmmv.hpp"
 #include "mmq.hpp"
 #include "mmvq.hpp"
-#include "pool.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP

ggml-sycl/common.cpp

Lines changed: 0 additions & 116 deletions
@@ -20,122 +20,6 @@ int get_current_device_id() {
     return dpct::dev_mgr::instance().current_device_id();
 }
 
-void log_ggml_var_device(
-    const char* name,
-    float* src,
-    size_t total_elements,
-    bool src_on_device) {
-    if (!g_ggml_sycl_debug)
-        return;
-    if (!src) {
-        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
-        return;
-    }
-    char filename[1024];
-    sprintf(filename, "%s.txt", name);
-    printf("GGML Tensor:%s save to %s\n", name, filename);
-
-    size_t total_size = total_elements * sizeof(float);
-    float* local_buf = NULL;
-    if (src_on_device) {
-        local_buf = (float*)ggml_sycl_host_malloc(total_size);
-        ggml_sycl_set_device(g_main_device);
-        dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
-        main_stream->memcpy(local_buf, src, total_size).wait();
-    } else {
-        local_buf = (float*)src;
-    }
-
-    std::ofstream logfile;
-    logfile.open(filename);
-    for (size_t i = 0; i < total_elements; i++) {
-        logfile << local_buf[i] << " ";
-        if ((i + 1) % 20 == 0)
-            logfile << std::endl;
-    }
-    logfile << std::endl;
-    logfile.close();
-
-    if (src_on_device)
-        ggml_sycl_host_free(local_buf);
-}
-
-void log_ggml_var_device_fp16(
-    const char* name,
-    sycl::half* src,
-    size_t total_elements,
-    bool src_on_device) {
-    if (!g_ggml_sycl_debug)
-        return;
-    if (!src) {
-        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
-        return;
-    }
-    char filename[1024];
-    sprintf(filename, "%s.txt", name);
-    printf("GGML Tensor:%s save to %s\n", name, filename);
-
-    size_t total_size = total_elements * sizeof(sycl::half);
-    sycl::half* local_buf = NULL;
-    if (src_on_device) {
-        local_buf = (sycl::half*)ggml_sycl_host_malloc(total_size);
-        ggml_sycl_set_device(g_main_device);
-        dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
-        main_stream->memcpy(local_buf, src, total_size).wait();
-    } else {
-        local_buf = (sycl::half*)src;
-    }
-
-    std::ofstream logfile;
-    logfile.open(filename);
-    for (size_t i = 0; i < total_elements; i++) {
-        logfile << local_buf[i] << " ";
-        if ((i + 1) % 20 == 0)
-            logfile << std::endl;
-    }
-    logfile << std::endl;
-    logfile.close();
-
-    if (src_on_device)
-        ggml_sycl_host_free(local_buf);
-}
-
-void print_ggml_tensor(const char* name, struct ggml_tensor* src) {
-    if (!g_ggml_sycl_debug)
-        return;
-    if (!src) {
-        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
-        return;
-    }
-
-    size_t total_elements = ggml_nelements(src);
-
-    const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU ||
-        src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    float* src_data = NULL;
-    if (src_on_device) {
-        ggml_tensor_extra_gpu* src_extra = (ggml_tensor_extra_gpu*)src->extra;
-        src_data = (float*)src_extra->data_device[g_main_device];
-    } else {
-        src_data = (float*)src->data;
-    }
-
-    log_ggml_var_device(name, src_data, total_elements, src_on_device);
-}
-
-void log_tensor_with_cnt(
-    const char* name,
-    struct ggml_tensor* src,
-    int stop_cnt) {
-    stop_cnt = 4;
-    if (log_file_name_idx >= stop_cnt)
-        return;
-    char filename[1280];
-    sprintf(filename, "%s_%07d", name, log_file_name_idx);
-    log_file_name_idx++;
-    print_ggml_tensor(filename, src);
-}
-
 void* ggml_sycl_host_malloc(size_t size) try {
     if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
         return nullptr;
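For reference, the helpers deleted above wrote each tensor to <name>.txt, space-separated with a newline every 20 values, staging device data through pinned host memory first. Below is a host-only sketch of that dump format; dump_float_buffer() is a hypothetical stand-in with no SYCL dependency, not part of ggml.

    // Hypothetical stand-in for the removed log_ggml_var_device(): same file
    // naming and layout (20 values per line), but operating on host memory only.
    #include <cstddef>
    #include <cstdio>
    #include <fstream>
    #include <string>

    static void dump_float_buffer(const char * name, const float * buf, size_t n) {
        if (buf == nullptr) {
            printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
            return;
        }
        const std::string filename = std::string(name) + ".txt";
        printf("GGML Tensor:%s save to %s\n", name, filename.c_str());
        std::ofstream logfile(filename);
        for (size_t i = 0; i < n; i++) {
            logfile << buf[i] << " ";
            if ((i + 1) % 20 == 0) {
                logfile << "\n";
            }
        }
        logfile << "\n";
    }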

ggml-sycl/common.hpp

Lines changed: 151 additions & 49 deletions
@@ -78,6 +78,9 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+typedef sycl::queue *queue_ptr;
+typedef sycl::handler *handle_ptr;
+
 enum ggml_sycl_backend_gpu_mode {
     SYCL_UNSET_GPU_MODE = -1,
     SYCL_SINGLE_GPU_MODE = 0,

@@ -182,17 +185,6 @@ static_assert(
 #endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
 
 #define MUL_MAT_SRC1_COL_STRIDE 128
-#define MAX_STREAMS 8
-#define SYCL_MAX_DEVICES 48
-
-static dpct::queue_ptr g_syclStreams[SYCL_MAX_DEVICES][MAX_STREAMS] = {{0}};
-
-struct ggml_tensor_extra_gpu {
-    void* data_device[SYCL_MAX_DEVICES]; // 1 pointer for each device for split
-                                         // tensors
-    dpct::event_ptr events[SYCL_MAX_DEVICES]
-                          [MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
 
 class sycl_gpu_mgr {
   public:

@@ -320,7 +312,7 @@ class sycl_gpu_mgr {
     }
 };
 
-static sycl_gpu_mgr* g_sycl_gpu_mgr = NULL;
+static sycl_gpu_mgr* g_sycl_gpu_mgr = new sycl_gpu_mgr(0);
 static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;

@@ -329,31 +321,15 @@ static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
-static float g_tensor_split[SYCL_MAX_DEVICES] = {0};
+static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
 static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
     SYCL_UNSET_GPU_MODE;
 
-struct sycl_device_capabilities {
-    int cc;                 // compute capability
-    bool vmm;               // virtual memory support
-    size_t vmm_granularity; // granularity of virtual memory
-    int device_id;
-};
-
-static sycl_device_capabilities g_device_caps[SYCL_MAX_DEVICES] = {
-    {0, false, 0, -1}};
-
-struct sycl_device_id2index {
-    int index;
-};
-
 static void* g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
-static dpct::queue_ptr g_sycl_handles[SYCL_MAX_DEVICES] = {nullptr};
-
 int get_main_device();
 
 [[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {

@@ -427,25 +403,151 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
     std::exit(1);
 }
 
-void log_ggml_var_device(
-    const char* name,
-    float* src,
-    size_t total_elements,
-    bool src_on_device);
-
-void log_ggml_var_device_fp16(
-    const char* name,
-    sycl::half* src,
-    size_t total_elements,
-    bool src_on_device);
-
-// todo: debug for crash in some case
-void print_ggml_tensor(const char* name, struct ggml_tensor* src);
-
-static int log_file_name_idx = 0;
-void log_tensor_with_cnt(
-    const char* name,
-    struct ggml_tensor* src,
-    int stop_cnt);
+//////////////////////
+
+struct ggml_sycl_device_info {
+    int device_count;
+
+    struct sycl_device_info {
+        int cc;                 // compute capability
+        // int nsm;             // number of streaming multiprocessors
+        // size_t smpb;         // max. shared memory per block
+        bool vmm;               // virtual memory support
+        size_t total_vram;
+    };
+
+    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
+
+    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
+};
+
+const ggml_sycl_device_info & ggml_sycl_info();
+
+struct ggml_sycl_pool {
+    virtual ~ggml_sycl_pool() = default;
+
+    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void free(void * ptr, size_t size) = 0;
+};
+
+template<typename T>
+struct ggml_sycl_pool_alloc {
+    ggml_sycl_pool * pool = nullptr;
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
+    }
+
+    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
+        alloc(size);
+    }
+
+    ~ggml_sycl_pool_alloc() {
+        if (ptr != nullptr) {
+            pool->free(ptr, actual_size);
+        }
+    }
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    T * alloc(ggml_sycl_pool & pool, size_t size) {
+        this->pool = &pool;
+        return alloc(size);
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    ggml_sycl_pool_alloc() = default;
+    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
+    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
+    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
+};
+
+// backend interface
+
+struct ggml_tensor_extra_gpu {
+    void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
+                                              // tensors
+    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
+                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+struct ggml_backend_sycl_context {
+    int device;
+    std::string name;
+
+    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
+    static sycl::handler * sycl_handles[GGML_SYCL_MAX_DEVICES] = {nullptr};
+
+    explicit ggml_backend_sycl_context(int device) :
+        device(device),
+        name(GGML_SYCL_NAME + std::to_string(device)) {
+    }
+
+    ~ggml_backend_sycl_context() {
+        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
+            for (int j = 0; j < GGML_SYCL_MAX_STREAMS; ++j) {
+                if (qptrs[i][j] != nullptr) {
+                    SYCL_CHECK(free(qptrs[i][j]));
+                }
+            }
+            if (cublas_handles[i] != nullptr) {
+                SYCL_CHECK(free(sycl_handles[i]));
+            }
+        }
+    }
+
+    queue_ptr stream(int device, int stream) {
+        if (qptrs[device][stream] == nullptr) {
+            SYCL_CHECK(dpct::get_current_device().create_queue(
+                g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+        }
+        return qptrs[device][stream];
+    }
+
+    queue_ptr stream() {
+        return stream(device, 0);
+    }
+
+    handle_ptr sycl_handle(int device) {
+        if (sycl_handles[device] == nullptr) {
+            const dpct::queue_ptr stream = qptrs[device][0];
+            // create sycl handle
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl_handles[device] = stream));
+        }
+        return sycl_handles[device];
+    }
+
+    handle_ptr sycl_handle() {
+        return sycl_handle(device);
+    }
+
+    // pool
+    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+
+    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
+
+    ggml_sycl_pool & pool(int device) {
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(qptrs[device][0], device);
+        }
+        return *pools[device];
+    }
+
+    ggml_sycl_pool & pool() {
+        return pool(device);
+    }
+};
+
 
 #endif // GGML_SYCL_COMMON_HPP
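The new ggml_sycl_pool / ggml_sycl_pool_alloc pair replaces the global scratch buffer with RAII allocations backed by a per-device pool that ggml_backend_sycl_context creates lazily through new_pool_for_device(). Below is a hedged usage sketch, not taken from the commit: the context object, byte count and function name are hypothetical, and it only exercises the declarations shown in this hunk.

    // Hypothetical helper using the types declared above. ctx is assumed to be
    // an already-constructed ggml_backend_sycl_context for the target device.
    void example_temp_buffer(ggml_backend_sycl_context & ctx, size_t n_bytes) {
        // RAII: memory comes out of the device pool here and is returned to it
        // automatically in ~ggml_sycl_pool_alloc() when src1_q8_1 leaves scope.
        ggml_sycl_pool_alloc<char> src1_q8_1(ctx.pool(), n_bytes);

        char *    tmp    = src1_q8_1.get(); // raw pointer for kernel arguments
        queue_ptr stream = ctx.stream();    // stream 0 of ctx.device
        (void) tmp;
        (void) stream;
        // ... enqueue work that uses tmp on stream ...
    }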

ggml-sycl/convert.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 #include "convert.hpp"
 #include "dequantize.hpp"
-
+#include "presets.hpp"
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
