Skip to content

Commit 051c70d

Browse files
authored
llama: Don't double count the sampling time (#2107)
1 parent 9e4475f commit 051c70d

File tree

1 file changed

+9
-11
lines changed

1 file changed

+9
-11
lines changed

llama.cpp

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1905,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);

+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -1937,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();

     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -1991,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }

-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);

+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2164,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }

 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();

@@ -2185,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }

+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);

     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
0 commit comments

Comments (0)