
Commit 087f5ad

server: bench: init
1 parent 8abf995 commit 087f5ad

File tree

4 files changed: +286 / -6 lines


.github/workflows/bench.yml

Lines changed: 6 additions & 6 deletions
@@ -19,11 +19,10 @@ on:
   push:
     branches:
       - master
-      - hp/server/bench/workflow # FIXME remove
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
   schedule:
     - cron: '04 2 * * *'

@@ -36,7 +35,7 @@ jobs:
     runs-on: Standard_NC4as_T4_v3
     env:
       RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request != '' || github.event.push.ref == 'refs/heads/master' }}
+    #if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }}
     steps:
       - name: Clone
         id: checkout

@@ -119,7 +118,7 @@ jobs:
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048
-
+
          cat results.github.env >> $GITHUB_ENV

      # - name: Comment PR

@@ -134,9 +133,10 @@ jobs:
      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
+          authToken: ${{secrets.GITHUB_TOKEN}}
          context: ${{ github.job }}
          description: |
-            $BENCH_RESULTS
+            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload results
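
As an aside, a minimal sketch (not part of the commit) of where the values consumed above come from: bench.py writes one KEY=value line per k6 metric into results.github.env using its escape_metric_name helper, plus a BENCH_RESULTS entry holding the raw k6 JSON summary; the `cat results.github.env >> $GITHUB_ENV` step then exposes them as expressions such as the ${{ env.BENCH_RESULTS }} used in the commit-status description. The numeric value below is an invented placeholder.

# Illustrative only: how one metric line of results.github.env is formed.
import re

def escape_metric_name(metric_name):
    # Same helper as defined in examples/server/bench/bench.py
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())

# k6 metric "llamacpp_tokens_second", sub-metric "avg"; 42.13 is a placeholder value.
line = f"{escape_metric_name('llamacpp_tokens_second')}_{escape_metric_name('avg')}=42.13"
print(line)  # -> LLAMACPP_TOKENS_SECOND_AVG=42.13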

examples/server/bench/bench.py

Lines changed: 269 additions & 0 deletions
import argparse
import base64
import json
import os
import re
import signal
import socket
import subprocess
import sys
import threading
import time
import traceback
from contextlib import closing
from datetime import datetime

import matplotlib.pyplot as plt
import requests


def main(args_in: list[str] | None = None) -> None:
    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
    parser.add_argument("--name", type=str, help="Bench name", required=True)
    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
    parser.add_argument("--port", type=int, help="Server listen port", default="8080")
    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
    parser.add_argument("--n-prompts", type=int,
                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
    parser.add_argument("--max-prompt-tokens", type=int,
                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
                        required=True)
    parser.add_argument("--max-tokens", type=int,
                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
                        required=True)
    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
    parser.add_argument("--duration", type=str, help="Bench scenario duration", required=True)

    args = parser.parse_args(args_in)

    start_time = time.time()

    # Start the server and performance scenario
    try:
        server_process = start_server(args)
    except Exception:
        print("bench: server start error :")
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    # start the benchmark
    bench_results_str = ""
    try:
        start_benchmark(args)

        with open("results.github.env", 'w') as github_env:
            # parse output
            with open('k6-results.json', 'r') as bench_results:
                # Load JSON data from file
                data = json.load(bench_results)
                for metric_name in data['metrics']:
                    for metric_metric in data['metrics'][metric_name]:
                        value = data['metrics'][metric_name][metric_metric]
                        if isinstance(value, float):
                            github_env.write(
                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={round(value, 2)}\n")
                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
                bench_results_str = json.dumps(data)

    except Exception:
        print("bench: error :")
        traceback.print_exc(file=sys.stdout)

    # Stop the server
    if server_process:
        try:
            print(f"bench: shutting down server pid={server_process.pid} ...")
            if os.name == 'nt':
                interrupt = signal.CTRL_C_EVENT
            else:
                interrupt = signal.SIGINT
            server_process.send_signal(interrupt)
            server_process.wait(0.5)

        except subprocess.TimeoutExpired:
            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
            server_process.kill()  # SIGKILL
            server_process.wait()

        while is_server_listening(args.host, args.port):
            time.sleep(0.1)

    # Prometheus
    end_time = time.time()
    image_data = []
    pr_comment = f"tk/s={round(token_seconds, 2)}"
    if is_server_listening("0.0.0.0", 9090):
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']

        for metric in metrics:
            resp = requests.get(f"http://localhost:9090/api/v1/query_range",
                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
            if resp.status_code != 200:
                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
            else:
                metric_data = resp.json()
                values = metric_data['data']['result'][0]['values']
                timestamps, metric_values = zip(*values)
                metric_values = [float(value) for value in metric_values]
                timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
                plt.figure(figsize=(16, 10), dpi=80)
                plt.plot(timestamps, metric_values, label=metric)
                plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
                plt.yticks(fontsize=12, alpha=.7)

                plt.title(f"{args.name} on {args.runner_label}\n"
                          f"duration={args.duration} {round(token_seconds, 2)}tk/s\n"
                          f"branch={args.branch} commit={args.commit}",
                          fontsize=14, wrap=True)
                plt.grid(axis='both', alpha=.3)
                plt.ylabel(f"llamacpp:{metric}", fontsize=14)
                plt.xlabel(f"hf-repo={args.hf_repo} hf-file={args.hf_file}\n"
                           f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n"
                           f" pp={args.max_prompt_tokens} pp+tg={args.max_tokens}", fontsize=14, wrap=True)
                plt.gcf().autofmt_xdate()

                # Remove borders
                plt.gca().spines["top"].set_alpha(0.0)
                plt.gca().spines["bottom"].set_alpha(0.3)
                plt.gca().spines["right"].set_alpha(0.0)
                plt.gca().spines["left"].set_alpha(0.3)

                # Save the plot as a PNG image
                plt.savefig(f'{metric}.png')
                plt.close()
                with open(f'{metric}.png', "rb") as image_file:
                    encoded_string = base64.b64encode(image_file.read()).decode()
                    image_data.append(f"data:image/png;base64,{encoded_string}")
        # pr_comment = f"""
        # llama.cpp server benchmark results for {args.name} on {args.runner_label}: {round(token_seconds, 2)}tk/s
        # <p align="center">
        #     <img src="{image_data[0]}" alt="prompt_tokens_seconds" />
        #     <img src="{image_data[1]}" alt="predicted_tokens_seconds"/>
        # </p>
        # <details>
        # <summary>Details</summary>
        # <p align="center">
        #     <img src="{image_data[2]}" alt="kv_cache_usage_ratio" />
        #     <img src="{image_data[3]}" alt="requests_processing"/>
        #     <img src="{image_data[4]}" alt="requests_deferred"/>
        # </p>
        # </detail>
        # """

    with open("results.github.env", 'a') as github_env:
        github_env.write(f"BENCH_RESULTS='{bench_results_str}'")


def start_benchmark(args):
    k6_path = 'k6'
    if 'BENCH_K6_BIN_PATH' in os.environ:
        k6_path = os.environ['BENCH_K6_BIN_PATH']
    k6_args = [
        'run', args.scenario,
        '--no-color',
    ]
    k6_args.extend(['--duration', args.duration])
    k6_args.extend(['--iterations', args.n_prompts])
    k6_args.extend(['--vus', args.parallel])
    k6_args.extend(['--summary-export', 'k6-results.json'])
    args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
    args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
    print(f"bench: starting k6 with: {args}")
    k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
    if k6_completed.returncode != 0:
        raise Exception("bench: unable to run k6")


def start_server(args):
    server_process = start_server_background(args)

    attempts = 0
    max_attempts = 20
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

    while not is_server_listening(args.host, args.port):
        attempts += 1
        if attempts > max_attempts:
            assert False, "server not started"
        print(f"bench: waiting for server to start ...")
        time.sleep(0.5)

    print("bench: server started.")
    return server_process


def start_server_background(args):
    # Start the server
    server_path = '../../../build/bin/server'
    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
        server_path = os.environ['LLAMA_SERVER_BIN_PATH']
    server_args = [
        '--host', args.host,
        '--port', args.port,
    ]
    model_file = args.model_path_prefix + os.path.sep + args.hf_file
    model_dir = os.path.dirname(model_file)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    server_args.extend(['--model', model_file])
    server_args.extend(['--hf-repo', args.hf_repo])
    server_args.extend(['--hf-file', args.hf_file])
    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
    server_args.extend(['--ctx-size', args.ctx_size])
    server_args.extend(['--parallel', args.parallel])
    server_args.extend(['--batch-size', args.batch_size])
    server_args.extend(['--ubatch-size', args.ubatch_size])
    server_args.extend(['--n-predict', args.max_tokens * 2])
    server_args.extend(['--defrag-thold', "0.1"])
    server_args.append('--cont-batching')
    server_args.append('--metrics')
    server_args.extend(['--log-format', "text"])
    args = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
    pkwargs = {
        'stdout': subprocess.PIPE,
        'stderr': subprocess.PIPE
    }
    server_process = subprocess.Popen(
        args,
        **pkwargs)

    def server_log(in_stream, out_stream):
        for line in iter(in_stream.readline, b''):
            print(line.decode('utf-8'), end='', file=out_stream)

    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
    thread_stdout.start()
    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
    thread_stderr.start()

    return server_process


def is_server_listening(server_fqdn, server_port):
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        result = sock.connect_ex((server_fqdn, server_port))
        _is_server_listening = result == 0
        if _is_server_listening:
            print(f"server is listening on {server_fqdn}:{server_port}...")
        return _is_server_listening


def escape_metric_name(metric_name):
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())


if __name__ == '__main__':
    main()
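
For reference, a minimal sketch of how the script above might be invoked locally; this is not part of the commit. It assumes a built server binary is reachable at ../../../build/bin/server (or via LLAMA_SERVER_BIN_PATH) and that k6 is on PATH (or pointed to by BENCH_K6_BIN_PATH), as the script expects. The Hugging Face repo/file and the k6 scenario file name are placeholders; --n-prompts, --max-prompt-tokens and --max-tokens mirror the values used in the workflow above, and the remaining numbers are arbitrary example values.

# Hypothetical local run of examples/server/bench/bench.py (sketch only).
from bench import main

main([
    "--name", "local-bench",
    "--runner-label", "local",
    "--hf-repo", "ggml-org/models",             # placeholder repository
    "--hf-file", "phi-2/ggml-model-q4_0.gguf",  # placeholder model file
    "--scenario", "script.js",                  # k6 scenario file (assumed name)
    "--duration", "10m",
    "--n-prompts", "1000",
    "--max-prompt-tokens", "1024",
    "--max-tokens", "2048",
    "--ctx-size", "16384",
    "--parallel", "8",
    "--batch-size", "512",
    "--ubatch-size", "256",
    "-ngl", "33",
])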

examples/server/bench/prometheus.yml

Lines changed: 9 additions & 0 deletions
global:
  scrape_interval: 10s
  external_labels:
    llamacpp: 'server'

scrape_configs:
  - job_name: 'llama.cpp server'
    static_configs:
      - targets: ['localhost:8080']
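
The benchmark script only renders the Prometheus plots when something answers on port 9090, so before a run it can be useful to confirm that a local Prometheus started with the config above has actually scraped the server. A sketch (not part of the commit) using the standard Prometheus HTTP API; it assumes the default listen address localhost:9090, which is also what bench.py queries, and uses one of the metric names the script plots.

# Sanity check: has Prometheus scraped any llamacpp metrics yet?
import requests

resp = requests.get("http://localhost:9090/api/v1/query",
                    params={'query': 'llamacpp:prompt_tokens_seconds'})
resp.raise_for_status()
result = resp.json()['data']['result']
print("scraped" if result else "no llamacpp metrics yet")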
Lines changed: 2 additions & 0 deletions
matplotlib
requests

0 commit comments
