Commit 07cebab

Introduce bfloat16 support
Many models on Hugging Face (e.g. Mistral, TinyLLaMA) use bfloat16 as their canonical floating point format.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───┐
    0b0000000000000000 brain16

This encoding has the same number of exponent bits as float32. That makes conversion relatively straightforward, even in the absence of hardware support. For example, converting brain16 to binary32 means simply shifting 16 bits to the left.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───────────────────┐
    0b00000000000000000000000000000000 IEEE binary32

The issue is that converting bf16 to fp16 can result in information loss. Only 13% of bf16 numbers can be represented precisely in fp16; in practice that covers 99.71% of Mistral 7B v0.2's weights, but until now the only way to preserve the rest was to use fp32.

      ┌sign
      │
      │  ┌exponent
      │  │
      │  │    ┌mantissa
      │  │    │
      │┌─┴─┐┌─┴──────┐
    0b0000000000000000 IEEE binary16

This change fixes that by adding a bf16 data type to GGML. Support for CPU inference has been implemented, along with optimizations for the AVX2, AVX512, and AVX512BF16 ISAs. Perplexity on Mistral 7B v0.2 improves by roughly -0.0024 to -0.0046 compared to fp16.
1 parent 37e7854 commit 07cebab
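
To make the shift-by-16 conversion described in the commit message concrete, here is a minimal standalone sketch (not part of the diff; the committed implementation is the ggml_fp32_to_bf16 / ggml_bf16_to_fp32 pair added to ggml.h below, which also handles NaN and subnormal inputs):

// Illustrative only: brain16 <-> binary32 as described above.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// brain16 -> binary32: place the 16 bits in the upper half of a float.
static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t)h << 16;
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

// binary32 -> brain16: keep sign + 8 exponent bits + top 7 mantissa bits,
// rounding to nearest even (NaN/subnormal handling omitted for brevity).
static uint16_t f32_to_bf16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return (uint16_t)((bits + (0x7fff + ((bits >> 16) & 1))) >> 16);
}

int main(void) {
    float x = 3.14159265f;
    uint16_t h = f32_to_bf16(x);
    printf("%.8f -> 0x%04x -> %.8f\n", x, (unsigned)h, bf16_to_f32(h)); // prints ~3.140625
    return 0;
}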

File tree

8 files changed: +1734 -173 lines


examples/finetune/finetune.cpp

Lines changed: 1 addition & 1 deletion
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
 
     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
             return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
         } else if (a->type == GGML_TYPE_F32) {
             return ggml_add(ctx, a, b);

examples/quantize/quantize.cpp

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
+    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },

ggml-impl.h

Lines changed: 3 additions & 0 deletions
@@ -260,6 +260,9 @@ size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml
 // return index, asserts if table is full
 size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
 
+#define GGML_FP32_TO_BF16(x) ggml_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_bf16_to_fp32(x)
+
 #ifdef __cplusplus
 }
 #endif

ggml.c

Lines changed: 1621 additions & 169 deletions
Large diffs are not rendered by default.
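
The ggml.c diff carries the bulk of the change (the new type's conversion tables, dot products, and the AVX2/AVX512/AVX512BF16 kernels mentioned in the commit message) and is not rendered here. As a rough sketch only, based on the declarations added to ggml.h and the macros added to ggml-impl.h, the scalar fallback for the row converters presumably reduces to a plain loop:

// Sketch of the assumed scalar fallback; the committed ggml.c also provides
// vectorized paths (AVX2 / AVX512 / AVX512BF16) that are not shown here.
#include "ggml.h"        // ggml_bf16_t and the inline converters
#include "ggml-impl.h"   // GGML_BF16_TO_FP32 / GGML_FP32_TO_BF16 macros

void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int n) {
    for (int i = 0; i < n; i++) {
        y[i] = GGML_BF16_TO_FP32(x[i]);
    }
}

void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int n) {
    for (int i = 0; i < n; i++) {
        y[i] = GGML_FP32_TO_BF16(x[i]);
    }
}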

ggml.h

Lines changed: 86 additions & 0 deletions
@@ -370,6 +370,7 @@ extern "C" {
         GGML_TYPE_I64   = 27,
         GGML_TYPE_F64   = 28,
         GGML_TYPE_IQ1_M = 29,
+        GGML_TYPE_BF16  = 30,
         GGML_TYPE_COUNT,
     };
 
@@ -410,6 +411,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -2368,6 +2370,90 @@ extern "C" {
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
 
+    /**
+     * Google Brain 16-bit floating point number.
+     *
+     *       ┌sign
+     *       │
+     *       │   ┌exponent
+     *       │   │
+     *       │   │      ┌mantissa
+     *       │   │      │
+     *       │┌──┴───┐┌─┴───┐
+     *     0b0000000000000000 brain16
+     *
+     * Since bf16 has the same number of exponent bits as a 32bit float,
+     * encoding and decoding numbers becomes relatively straightforward.
+     *
+     *       ┌sign
+     *       │
+     *       │   ┌exponent
+     *       │   │
+     *       │   │      ┌mantissa
+     *       │   │      │
+     *       │┌──┴───┐┌─┴───────────────────┐
+     *     0b00000000000000000000000000000000 IEEE binary32
+     *
+     * For comparison, the standard fp16 format has fewer exponent bits.
+     *
+     *       ┌sign
+     *       │
+     *       │  ┌exponent
+     *       │  │
+     *       │  │    ┌mantissa
+     *       │  │    │
+     *       │┌─┴─┐┌─┴──────┐
+     *     0b0000000000000000 IEEE binary16
+     *
+     * So be warned that converting between them, destroys several bits.
+     *
+     * @see IEEE 754-2008
+     */
+    typedef struct {
+        uint16_t x;
+    } ggml_bf16_t;
+
+    /**
+     * Converts brain16 to float32.
+     */
+    static inline float ggml_bf16_to_fp32(ggml_bf16_t h) {
+        union {
+            float f;
+            uint32_t i;
+        } u;
+        u.i = (uint32_t)h.x << 16;
+        return u.f;
+    }
+
+    /**
+     * Converts float32 to brain16.
+     *
+     * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+     * Subnormals shall be flushed to zero, and NANs will be quiet.
+     * This code should vectorize nicely if using modern compilers.
+     */
+    static inline ggml_bf16_t ggml_fp32_to_bf16(float s) {
+        ggml_bf16_t h;
+        union {
+            float f;
+            uint32_t i;
+        } u;
+        u.f = s;
+        if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+            h.x = (u.i >> 16) | 64; /* force to quiet */
+            return h;
+        }
+        if (!(u.i & 0x7f800000)) { /* subnormal */
+            h.x = (u.i & 0x80000000) >> 16; /* flush to zero */
+            return h;
+        }
+        h.x = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+        return h;
+    }
+
+    GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int n);
+
     //
     // Internal types and functions exposed for tests and benchmarks
     //
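
For reference, a hypothetical smoke test (not part of the commit) exercising the conversion behaviour documented above: exact round trips for values with at most 8 significant bits, flush-to-zero for subnormals, and quiet NaNs.

// Hypothetical test snippet, not part of this commit.
#include <assert.h>
#include <math.h>
#include "ggml.h"

int main(void) {
    // Values with at most 8 significant bits round-trip exactly.
    assert(ggml_bf16_to_fp32(ggml_fp32_to_bf16(1.0f))    == 1.0f);
    assert(ggml_bf16_to_fp32(ggml_fp32_to_bf16(-0.125f)) == -0.125f);

    // Subnormal inputs are flushed to (signed) zero.
    assert(ggml_bf16_to_fp32(ggml_fp32_to_bf16(1e-40f)) == 0.0f);

    // NaN is preserved and forced to a quiet NaN.
    assert(isnan(ggml_bf16_to_fp32(ggml_fp32_to_bf16(NAN))));
    return 0;
}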

gguf-py/gguf/constants.py

Lines changed: 2 additions & 0 deletions
@@ -729,6 +729,7 @@ class GGMLQuantizationType(IntEnum):
     I64   = 27
     F64   = 28
     IQ1_M = 29
+    BF16  = 30
 
 
 class GGUFEndian(IntEnum):
@@ -775,6 +776,7 @@ def get_type(val: Any) -> GGUFValueType:
 GGML_QUANT_SIZES = {
     GGMLQuantizationType.F32:  (1, 4),
     GGMLQuantizationType.F16:  (1, 2),
+    GGMLQuantizationType.BF16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
     GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),

llama.cpp

Lines changed: 18 additions & 2 deletions
@@ -3022,6 +3022,7 @@ struct llama_model_loader {
         switch (type_max) {
             case GGML_TYPE_F32:  ftype = LLAMA_FTYPE_ALL_F32;     break;
             case GGML_TYPE_F16:  ftype = LLAMA_FTYPE_MOSTLY_F16;  break;
+            case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
             case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
             case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
             case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@@ -3407,6 +3408,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -5466,6 +5468,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
             || !(
                 model.ftype == LLAMA_FTYPE_ALL_F32 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
             )
@@ -12839,13 +12842,16 @@ static void llama_tensor_dequantize_internal(
         if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor->type != GGML_TYPE_F16) {
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
         throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }
 
     if (nthread < 2) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
             qtype.to_float(tensor->data, f32_output, nelements);
         } else {
@@ -12854,7 +12860,14 @@ static void llama_tensor_dequantize_internal(
         return;
     }
 
-    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
     size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
@@ -12873,6 +12886,8 @@ static void llama_tensor_dequantize_internal(
     auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+        } else if (typ == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
         } else {
             qtype.to_float(inbuf, outbuf, nels);
         }
@@ -13215,6 +13230,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
         // K-quants

llama.h

Lines changed: 1 addition & 0 deletions
@@ -118,6 +118,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M  = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16   = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
