@@ -1734,7 +1734,7 @@ struct llama_kv_cell {
1734
1734
// ring-buffer of cached KV data
1735
1735
struct llama_kv_cache {
1736
1736
bool has_shift = false;
1737
- // with Mamba, a slot can hold the state for more than one past token
1737
+ // with Mamba, a cell can hold the state for more than one past token
1738
1738
bool unlimited = false;
1739
1739
1740
1740
// Note: The value of head isn't only used to optimize searching
@@ -1993,7 +1993,7 @@ static bool llama_kv_cache_init(
1993
1993
1994
1994
cache.has_shift = false;
1995
1995
1996
- // for now, only Mamba can hold state for more than one past token per slot
1996
+ // for now, only Mamba can hold state for more than one past token per cell
1997
1997
cache.unlimited = model.arch == LLM_ARCH_MAMBA;
1998
1998
1999
1999
cache.head = 0;
@@ -2249,7 +2249,7 @@ static void llama_kv_cache_seq_cp(
2249
2249
cache.cells[seq_id_dst].delta = seq_id_src;
2250
2250
// NOTE: a sequence can't have multiple sources, but can have multiple destinations.
2251
2251
// For compatibility with the other KV cache API functions,
2252
- // the seq_id(s) of a slot suggests an intent to "copy to" those id(s),
2252
+ // the seq_id(s) of a cell suggest an intent to "copy to" those id(s),
2253
2253
// so that when a sequence is copied, it can initially be found from the source cell.
2254
2254
cache.cells[seq_id_src].seq_id.insert(seq_id_dst);
2255
2255
// prevent the destination from getting cleared
@@ -11726,10 +11726,10 @@ struct llama_context * llama_new_context_with_model(
11726
11726
ggml_type type_k = params.type_k;
11727
11727
ggml_type type_v = params.type_v;
11728
11728
11729
- // Mamba only needs a constant number of KV cache slots per sequence
11729
+ // Mamba only needs a constant number of KV cache cells per sequence
11730
11730
if (model->arch == LLM_ARCH_MAMBA) {
11731
- // Mamba needs as many slots as there are distinct sequences processed at the same time
11732
- // The extra slot allows dedicating a sequence id to the system prompt
11731
+ // Mamba needs as many KV cells as there are sequences kept at any time
11732
+ // The extra cell allows dedicating a sequence id to the system prompt
11733
11733
// TODO: find a better way to get the max number of parallel sequences
11734
11734
kv_size = params.n_parallel + 1;
11735
11735
// it's probably best to keep as much precision as possible for the states
0 commit comments