
Commit ccad425

server: allow to override threads server pool with --threads-server
1 parent: d5ab297

2 files changed, 17 insertions(+), 0 deletions(-)

examples/server/README.md (1 addition, 0 deletions)

@@ -18,6 +18,7 @@ The project is under active development, and we are [looking for feedback and co
 
 - `--threads N`, `-t N`: Set the number of threads to use during generation.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `--threads-server N`: number of threads in the http server pool to process requests (default: `std::thread::hardware_concurrency()`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
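As a usage sketch (the model path and thread count here are illustrative, not taken from the commit), launching the server with something like `./server -m models/7B/ggml-model.gguf --threads-server 4` would cap the HTTP request pool at four worker threads, while omitting the flag keeps the `std::thread::hardware_concurrency()` default. Note that the usage text and parsing code in server.cpp below register the option under the name `--threads-http`.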

examples/server/server.cpp (16 additions, 0 deletions)

@@ -44,6 +44,7 @@ struct server_params
     int32_t write_timeout = 600;
     bool slots_endpoint = true;
     bool metrics_endpoint = false;
+    int n_threads_http = -1;
 };
 
 bool server_verbose = false;
@@ -2065,6 +2066,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
     printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
+    printf(" --threads-http N number of threads in the http server pool to process requests (default: hardware concurrency)\n");
     printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
     printf(" --rope-scaling {none,linear,yarn}\n");
     printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
@@ -2351,6 +2353,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.n_threads_batch = std::stoi(argv[i]);
         }
+        else if (arg == "--threads-http")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            sparams.n_threads_http = std::stoi(argv[i]);
+        }
         else if (arg == "-b" || arg == "--batch-size")
         {
             if (++i >= argc)
@@ -3509,6 +3520,11 @@ int main(int argc, char **argv)
     }*/
     //);
 
+    if (sparams.n_threads_http > 0) {
+        log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
+        svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+    }
+
     LOG_INFO("HTTP server listening", log_data);
     // run the HTTP server in a thread - see comment below
     std::thread t([&]()
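The core of the change is the override of cpp-httplib's task queue. As a minimal standalone sketch (assuming only `httplib.h` from cpp-httplib is on the include path; the port, route, and hard-coded thread count are illustrative, not part of the commit), the same pattern looks like this:

// Sketch: cap the HTTP request-handling pool of an httplib::Server.
#include "httplib.h"

#include <thread>

int main() {
    httplib::Server svr;

    // Hypothetical value, e.g. what --threads-http would supply; <= 0 means
    // "leave the library default alone", mirroring n_threads_http = -1 above.
    int n_threads_http = 8;
    if (n_threads_http <= 0) {
        n_threads_http = std::thread::hardware_concurrency();
    }

    // cpp-httplib invokes this factory when the server starts listening;
    // returning a ThreadPool of the requested size bounds how many requests
    // are processed concurrently.
    svr.new_task_queue = [n_threads_http] {
        return new httplib::ThreadPool(n_threads_http);
    };

    svr.Get("/health", [](const httplib::Request &, httplib::Response &res) {
        res.set_content("ok", "text/plain");
    });

    svr.listen("127.0.0.1", 8080);
    return 0;
}

Because the new field defaults to -1 and the diff guards the override with `if (sparams.n_threads_http > 0)`, the flag only takes effect when explicitly set; otherwise cpp-httplib's built-in pool size is used.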
