Commit 2dd5d1f

llamafile : improve moe prompt eval speed on cpu
This change introduces a llamafile_mixmul() API that allows tinyBLAS to speed up "Mixture of Experts" models. On my Threadripper, the Mixtral 8x7b F16 weights now process prompts 2x faster, and I am seeing a 60 percent improvement with Mixtral 8x22b Q4_0. Q8_0 weights are supported as well, since tinyBLAS also covers that format.

MoE models spend most of their time in MUL_MAT_ID rather than MUL_MAT, which is why llamafile_sgemm() was not able to help them before. The new code works by decomposing the mixmul operation into fast 2-D llamafile_sgemm() calls. This change also adds BF16 support to tinyBLAS.
1 parent 8748d8a commit 2dd5d1f
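
To make the decomposition concrete, here is a minimal, illustrative C sketch of the idea (not the commit's code: gemm_f32() and mixmul_sketch() are hypothetical stand-ins for llamafile_sgemm() and llamafile_mixmul(), the real kernels are tiled, vectorized, and quantization-aware, and one expert per token is assumed for brevity). Tokens routed to the same expert are gathered into one contiguous matrix, so each expert costs a single dense 2-D GEMM:

#include <string.h>

// Hypothetical stand-in for a tinyBLAS-style sgemm:
// C[m x n] = A[m x k] * B[k x n], all row-major.
static void gemm_f32(int m, int n, int k,
                     const float * A, const float * B, float * C) {
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j) {
            float sum = 0.0f;
            for (int l = 0; l < k; ++l)
                sum += A[i * k + l] * B[l * n + j];
            C[i * n + j] = sum;
        }
}

// Decompose a mixture-of-experts matmul into one GEMM per expert:
// gather each expert's tokens, multiply once, scatter results back.
static void mixmul_sketch(int n_expert, int n_tok, int k, int n,
                          const float * experts, // n_expert matrices of k x n
                          const float * x,       // n_tok x k activations
                          const int * route,     // route[t] = expert of token t
                          float * y,             // n_tok x n output
                          float * gathered,      // scratch, n_tok x k
                          float * product) {     // scratch, n_tok x n
    for (int e = 0; e < n_expert; ++e) {
        int rows = 0;
        int idx[n_tok];  // which token each gathered row came from
        for (int t = 0; t < n_tok; ++t) {
            if (route[t] != e) continue;
            memcpy(&gathered[(size_t)rows * k], &x[(size_t)t * k], k * sizeof(float));
            idx[rows++] = t;
        }
        if (rows == 0) continue;
        gemm_f32(rows, n, k, gathered, &experts[(size_t)e * k * n], product);
        for (int r = 0; r < rows; ++r)
            memcpy(&y[(size_t)idx[r] * n], &product[(size_t)r * n], n * sizeof(float));
    }
}

Each GEMM here is dense and two-dimensional, which is exactly the shape tinyBLAS is fast at; the per-token gather and scatter is cheap by comparison.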

File tree

5 files changed: +714 additions, -118 deletions

common/common.cpp

Lines changed: 3 additions & 3 deletions
@@ -78,7 +78,7 @@ using json = nlohmann::ordered_json;
 //
 
 int32_t cpu_get_num_physical_cores() {
-#ifdef __linux__
+#if defined(__linux__) || defined(__COSMOPOLITAN__)
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -113,7 +113,7 @@ int32_t cpu_get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -167,7 +167,7 @@ static int cpu_count_math_cpus(int n_cpu) {
  * Returns number of CPUs on system that are useful for math.
  */
 int32_t cpu_get_num_math() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
     int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();
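
For context, a rough standalone C sketch of the thread-siblings enumeration that cpu_get_num_physical_cores() performs (fixed bounds are assumed here for simplicity; the real C++ code uses std::unordered_set). Hyperthreads sharing a core report identical sibling masks, so counting distinct masks counts physical cores:

#include <stdio.h>
#include <string.h>

// Count physical cores by collecting the distinct
// /sys/devices/system/cpu/cpuN/topology/thread_siblings values.
static int count_physical_cores(void) {
    char seen[256][64];  // distinct sibling masks seen so far (assumed bounds)
    int n_seen = 0;
    for (int cpu = 0;; ++cpu) {
        char path[256], mask[64];
        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
        FILE * f = fopen(path, "r");
        if (!f) break;                        // no more CPUs to enumerate
        if (!fgets(mask, sizeof(mask), f)) mask[0] = 0;
        fclose(f);
        int dup = 0;
        for (int i = 0; i < n_seen; ++i)
            if (!strcmp(seen[i], mask)) { dup = 1; break; }
        if (!dup && n_seen < 256) strcpy(seen[n_seen++], mask);
    }
    return n_seen;  // one distinct mask per physical core
}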

ggml/include/ggml.h

Lines changed: 15 additions & 0 deletions
@@ -650,6 +650,21 @@ extern "C" {
         enum ggml_cgraph_eval_order order;
     };
 
+    struct ggml_compute_state_shared;
+
+    struct ggml_compute_params {
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+
+        struct ggml_compute_state_shared * shared;
+    };
+
+    void ggml_barrier(struct ggml_compute_state_shared * shared);
+
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
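
Moving ggml_compute_params into the public header and exporting ggml_barrier() is what lets out-of-tree code such as llamafile_mixmul() run inside ggml's thread pool. A minimal sketch of the ith/nth work-splitting pattern this API implies (my_rowwise_kernel() is hypothetical and assumes the modified ggml.h above):

#include "ggml.h"  // assumes the header change shown above

// Each thread scales its own strided subset of rows, then everyone
// meets at ggml_barrier() before any thread reads the combined result.
static void my_rowwise_kernel(const struct ggml_compute_params * params,
                              float * dst, const float * src,
                              int n_rows, int row_len) {
    const int ith = params->ith;   // this thread's index
    const int nth = params->nth;   // total number of threads
    for (int row = ith; row < n_rows; row += nth)
        for (int j = 0; j < row_len; ++j)
            dst[row * row_len + j] = 2.0f * src[row * row_len + j];
    ggml_barrier(params->shared);  // wait until every thread has finished
}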

ggml/src/ggml.c

Lines changed: 15 additions & 14 deletions
@@ -1754,17 +1754,6 @@ struct ggml_compute_state {
     struct ggml_compute_state_shared * shared;
 };
 
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-
-    struct ggml_compute_state_shared * shared;
-};
-
 //
 // fundamental operations
 //
@@ -2857,15 +2846,15 @@ inline static void ggml_critical_section_start(void) {
 }
 
 #ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
 
     #pragma omp barrier
 }
 #else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
@@ -12306,11 +12295,16 @@ static void ggml_compute_forward_mul_mat_id(
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * ids = dst->src[2];
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+#if GGML_USE_LLAMAFILE
+    if (llamafile_mixmul(params, src0, src1, ids, dst))
+        return;
+#endif
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -18536,6 +18530,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                         cur = 0;
                         const struct ggml_tensor * src0 = node->src[0];
                         const struct ggml_tensor * src1 = node->src[1];
+#if GGML_USE_LLAMAFILE
+                        const struct ggml_tensor * src2 = node->src[2];
+#endif
                         const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                         if (src1->type != vec_dot_type) {
                             cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
@@ -18544,6 +18541,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                         cur += GGML_PAD(cur, sizeof(int64_t)); // align
                         cur += n_as * sizeof(int64_t); // matrix_row_counts
                         cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+#if GGML_USE_LLAMAFILE
+                        size_t cur2 = llamafile_mixmul_needs(src0, src1, src2);
+                        cur = cur > cur2 ? cur : cur2;
+#endif
                     } break;
                 case GGML_OP_OUT_PROD:
                     {
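
Both hooks above follow the same claim-or-fall-through pattern: the fast path returns true only when it fully handled the op, and ggml_graph_plan() reserves the larger of the two paths' scratch requirements so either can run. A self-contained sketch with hypothetical stubs standing in for llamafile_mixmul() and llamafile_mixmul_needs():

#include <stdbool.h>
#include <stddef.h>

// Hypothetical stubs; in the commit these roles are played by
// llamafile_mixmul() / llamafile_mixmul_needs() and the generic
// MUL_MAT_ID code in ggml.c.
static bool   fast_compute(int op)      { return (op & 1) == 0; } // true = handled
static size_t fast_work_size(int op)    { (void)op; return 4096; }
static size_t generic_work_size(int op) { (void)op; return 1024; }
static void   generic_compute(int op)   { (void)op; }

// Planning: reserve scratch for whichever path ends up running.
static size_t plan_work_size(int op) {
    size_t cur  = generic_work_size(op);
    size_t cur2 = fast_work_size(op);
    return cur > cur2 ? cur : cur2;   // same max() as in ggml_graph_plan()
}

// Compute: let the fast kernel claim the op, otherwise fall through,
// so an unsupported shape or dtype never breaks correctness.
static void compute(int op) {
    if (fast_compute(op))
        return;
    generic_compute(op);
}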
