import argparse
import base64
import json
import os
import re
import signal
import socket
import subprocess
import sys
import threading
import time
import traceback
from contextlib import closing
from datetime import datetime

import matplotlib.pyplot as plt
import requests


def main(args_in: list[str] | None = None) -> None:
    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
    parser.add_argument("--name", type=str, help="Bench name", required=True)
    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
    parser.add_argument("--port", type=int, help="Server listen port", default=8080)
    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
    parser.add_argument("--n-prompts", type=int,
                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
    parser.add_argument("--max-prompt-tokens", type=int,
                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
                        required=True)
    parser.add_argument("--max-tokens", type=int,
                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
                        required=True)
    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="Number of layers to offload to the GPU for computation", required=True)
    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
    parser.add_argument("--parallel", type=int, help="Set the number of slots for processing requests", required=True)
    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
    parser.add_argument("--ubatch-size", type=int, help="Physical maximum batch size", required=True)
    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
    parser.add_argument("--duration", type=str, help="Bench scenario duration", required=True)

    args = parser.parse_args(args_in)

    start_time = time.time()

    # Start the server and performance scenario
    try:
        server_process = start_server(args)
    except Exception:
        print("bench: server start error :")
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    # start the benchmark
    bench_results_str = ""
    token_seconds = 0.0  # fallback so the summary below does not fail if the benchmark errors out
    try:
        start_benchmark(args)

        with open("results.github.env", 'w') as github_env:
            # parse output
            with open('k6-results.json', 'r') as bench_results:
                # Load JSON data from file
                data = json.load(bench_results)
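                # export every scalar metric as an UPPER_SNAKE_CASE key=value pair for the GitHub workflow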
                for metric_name in data['metrics']:
                    for metric_metric in data['metrics'][metric_name]:
                        value = data['metrics'][metric_name][metric_metric]
                        if isinstance(value, float):
                            github_env.write(
                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={round(value, 2)}\n")
                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
                bench_results_str = json.dumps(data)

    except Exception:
        print("bench: error :")
        traceback.print_exc(file=sys.stdout)

    # Stop the server
    if server_process:
        try:
            print(f"bench: shutting down server pid={server_process.pid} ...")
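            # ask for a graceful shutdown: CTRL_C_EVENT on Windows, SIGINT elsewhere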
            if os.name == 'nt':
                interrupt = signal.CTRL_C_EVENT
            else:
                interrupt = signal.SIGINT
            server_process.send_signal(interrupt)
            server_process.wait(0.5)

        except subprocess.TimeoutExpired:
            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
            server_process.kill()  # SIGKILL
            server_process.wait()

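    # wait until the listen port is released before collecting results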
    while is_server_listening(args.host, args.port):
        time.sleep(0.1)

    # Prometheus
    end_time = time.time()
    image_data = []
    pr_comment = f"tk/s={round(token_seconds, 2)}"
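    # if a Prometheus server is reachable on port 9090, plot the scraped llamacpp metrics over the bench window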
    if is_server_listening("0.0.0.0", 9090):
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']

        for metric in metrics:
            resp = requests.get(f"http://localhost:9090/api/v1/query_range",
                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
            if resp.status_code != 200:
                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
            else:
                metric_data = resp.json()
                values = metric_data['data']['result'][0]['values']
                timestamps, metric_values = zip(*values)
                metric_values = [float(value) for value in metric_values]
                timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
                plt.figure(figsize=(16, 10), dpi=80)
                plt.plot(timestamps, metric_values, label=metric)
                plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
                plt.yticks(fontsize=12, alpha=.7)

                plt.title(f"{args.name} on {args.runner_label}\n"
                          f"duration={args.duration} {round(token_seconds, 2)}tk/s\n"
                          f"branch={args.branch} commit={args.commit}",
                          fontsize=14, wrap=True)
                plt.grid(axis='both', alpha=.3)
                plt.ylabel(f"llamacpp:{metric}", fontsize=14)
                plt.xlabel(f"hf-repo={args.hf_repo} hf-file={args.hf_file}\n"
                           f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n"
                           f" pp={args.max_prompt_tokens} pp+tg={args.max_tokens}", fontsize=14, wrap=True)
                plt.gcf().autofmt_xdate()

                # Remove borders
                plt.gca().spines["top"].set_alpha(0.0)
                plt.gca().spines["bottom"].set_alpha(0.3)
                plt.gca().spines["right"].set_alpha(0.0)
                plt.gca().spines["left"].set_alpha(0.3)

                # Save the plot as a PNG image
                plt.savefig(f'{metric}.png')
                plt.close()
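                # embed the PNG as a base64 data URI so it can be inlined in a markdown/HTML comment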
                with open(f'{metric}.png', "rb") as image_file:
                    encoded_string = base64.b64encode(image_file.read()).decode()
                    image_data.append(f"data:image/png;base64,{encoded_string}")
        # pr_comment = f"""
        # llama.cpp server benchmark results for {args.name} on {args.runner_label}: {round(token_seconds, 2)}tk/s
        # <p align="center">
        # <img src="{image_data[0]}" alt="prompt_tokens_seconds" />
        # <img src="{image_data[1]}" alt="predicted_tokens_seconds"/>
        # </p>
        # <details>
        # <summary>Details</summary>
        # <p align="center">
        # <img src="{image_data[2]}" alt="kv_cache_usage_ratio" />
        # <img src="{image_data[3]}" alt="requests_processing"/>
        # <img src="{image_data[4]}" alt="requests_deferred"/>
        # </p>
        # </details>
        # """

    with open("results.github.env", 'a') as github_env:
        github_env.write(f"BENCH_RESULTS='{bench_results_str}'")


def start_benchmark(args):
    k6_path = 'k6'
    if 'BENCH_K6_BIN_PATH' in os.environ:
        k6_path = os.environ['BENCH_K6_BIN_PATH']
    k6_args = [
        'run', args.scenario,
        '--no-color',
    ]
    k6_args.extend(['--duration', args.duration])
    k6_args.extend(['--iterations', args.n_prompts])
    k6_args.extend(['--vus', args.parallel])
    k6_args.extend(['--summary-export', 'k6-results.json'])
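    # the SERVER_BENCH_* values are passed as environment variables for the k6 scenario script to read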
    args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
    args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
    print(f"bench: starting k6 with: {args}")
    k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
    if k6_completed.returncode != 0:
        raise Exception("bench: unable to run k6")


def start_server(args):
    server_process = start_server_background(args)

    attempts = 0
    max_attempts = 20
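    # give the server more time to start when running on CI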
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

    while not is_server_listening(args.host, args.port):
        attempts += 1
        if attempts > max_attempts:
            assert False, "server not started"
        print(f"bench: waiting for server to start ...")
        time.sleep(0.5)

    print("bench: server started.")
    return server_process


def start_server_background(args):
    # Start the server
    server_path = '../../../build/bin/server'
    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
        server_path = os.environ['LLAMA_SERVER_BIN_PATH']
    server_args = [
        '--host', args.host,
        '--port', args.port,
    ]
    model_file = args.model_path_prefix + os.path.sep + args.hf_file
    model_dir = os.path.dirname(model_file)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
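    # with --hf-repo/--hf-file the server downloads the model into the --model path if it is not already present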
    server_args.extend(['--model', model_file])
    server_args.extend(['--hf-repo', args.hf_repo])
    server_args.extend(['--hf-file', args.hf_file])
    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
    server_args.extend(['--ctx-size', args.ctx_size])
    server_args.extend(['--parallel', args.parallel])
    server_args.extend(['--batch-size', args.batch_size])
    server_args.extend(['--ubatch-size', args.ubatch_size])
    server_args.extend(['--n-predict', args.max_tokens * 2])
    server_args.extend(['--defrag-thold', "0.1"])
    server_args.append('--cont-batching')
    server_args.append('--metrics')
    server_args.extend(['--log-format', "text"])
    args = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
    pkwargs = {
        'stdout': subprocess.PIPE,
        'stderr': subprocess.PIPE
    }
    server_process = subprocess.Popen(
        args,
        **pkwargs)

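    # mirror the server's stdout/stderr to this process so its logs show up in the CI output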
    def server_log(in_stream, out_stream):
        for line in iter(in_stream.readline, b''):
            print(line.decode('utf-8'), end='', file=out_stream)

    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
    thread_stdout.start()
    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
    thread_stderr.start()

    return server_process


def is_server_listening(server_fqdn, server_port):
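    # connect_ex() returns 0 when the TCP connection succeeds, i.e. something is listening on the port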
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        result = sock.connect_ex((server_fqdn, server_port))
        _is_server_listening = result == 0
        if _is_server_listening:
            print(f"server is listening on {server_fqdn}:{server_port}...")
        return _is_server_listening


def escape_metric_name(metric_name):
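    # sanitize metric names into GitHub-env-style keys: uppercase, non-alphanumerics replaced by '_'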
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())


if __name__ == '__main__':
    main()