Skip to content

Commit dc47ba3

Browse files
authored
[Bugfix] Fixed prompt length for random dataset (#17408)
Signed-off-by: Mikhail Podvitskii <[email protected]>
1 parent edbf2d6 commit dc47ba3

File tree

1 file changed

+15
-2
lines changed

1 file changed

+15
-2
lines changed

benchmarks/benchmark_dataset.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,13 +315,15 @@ def sample(
315315
)
316316

317317
vocab_size = tokenizer.vocab_size
318+
num_special_tokens = tokenizer.num_special_tokens_to_add()
319+
real_input_len = input_len - num_special_tokens
318320

319321
prefix_token_ids = (np.random.randint(
320322
0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
321323

322324
# New sampling logic: [X * (1 - b), X * (1 + b)]
323-
input_low = int(input_len * (1 - range_ratio))
324-
input_high = int(input_len * (1 + range_ratio))
325+
input_low = int(real_input_len * (1 - range_ratio))
326+
input_high = int(real_input_len * (1 + range_ratio))
325327
output_low = int(output_len * (1 - range_ratio))
326328
output_high = int(output_len * (1 + range_ratio))
327329

@@ -344,6 +346,17 @@ def sample(
344346
vocab_size).tolist()
345347
token_sequence = prefix_token_ids + inner_seq
346348
prompt = tokenizer.decode(token_sequence)
349+
# After decoding the prompt we have to encode and decode it again.
350+
# This is done because in some cases N consecutive tokens
351+
# produce a string that tokenizes into a number of tokens != N.
352+
# For example for GPT2Tokenizer:
353+
# [6880, 6881] -> ['Ġcalls', 'here'] ->
354+
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
355+
# To avoid uncontrolled change of the prompt length,
356+
# the encoded sequence is truncated before being decoded again.
357+
re_encoded_sequence = tokenizer.encode(
358+
prompt, add_special_tokens=False)[:input_lens[i]]
359+
prompt = tokenizer.decode(re_encoded_sequence)
347360
total_input_len = prefix_len + int(input_lens[i])
348361
requests.append(
349362
SampleRequest(

0 commit comments

Comments
 (0)