From ff0f3e8427f44cbef6b7b60b02721f1b9e3c781b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 21 Aug 2024 12:12:20 +0200 Subject: [PATCH] also add LLAMA_ARG_CONT_BATCHING --- common/common.cpp | 1 + examples/server/README.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 960d4fac09348..98d5108797cc1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -345,6 +345,7 @@ void gpt_params_parse_from_env(gpt_params & params) { get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); + get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); get_env("LLAMA_ARG_HOST", params.hostname); get_env("LLAMA_ARG_PORT", params.port); } diff --git a/examples/server/README.md b/examples/server/README.md index 26811721fb5cc..805e05b4a5114 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -266,9 +266,10 @@ Available environment variables (if specified, these variables will override par - `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template` - `LLAMA_ARG_N_PREDICT`: equivalent to `-n` - `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`) -- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`) +- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default. - `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`) - `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`) +- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default. - `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt` - `LLAMA_ARG_HOST`: equivalent to `--host` - `LLAMA_ARG_PORT`: equivalent to `--port`