From 3748c734bfaa128318129dea6553a49f1972f55a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 21 Aug 2024 11:45:16 +0200 Subject: [PATCH 1/3] server : add some missing env variables --- common/common.cpp | 4 +++ examples/server/README.md | 57 +++++++++++++++++++++++++++------------ 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 59e8296604c9c..918b4e1f8e825 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -327,6 +327,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { void gpt_params_parse_from_env(gpt_params & params) { // we only care about server-related params for now get_env("LLAMA_ARG_MODEL", params.model); + get_env("LLAMA_ARG_HF_REPO", params.hf_repo); + get_env("LLAMA_ARG_HF_FILE", params.hf_file); get_env("LLAMA_ARG_THREADS", params.n_threads); get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx); get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel); @@ -341,6 +343,8 @@ void gpt_params_parse_from_env(gpt_params & params) { get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); + get_env("LLAMA_ARG_HOST", params.hostname); + get_env("LLAMA_ARG_PORT", params.port); } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { diff --git a/examples/server/README.md b/examples/server/README.md index abe245271195b..839e9e5d6398e 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -249,23 +249,46 @@ logging: Available environment variables (if specified, these variables will override parameters specified in arguments): -- `LLAMA_CACHE` (cache directory, used by `--hf-repo`) -- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`) -- `LLAMA_ARG_MODEL` -- `LLAMA_ARG_THREADS` -- `LLAMA_ARG_CTX_SIZE` -- `LLAMA_ARG_N_PARALLEL` -- `LLAMA_ARG_BATCH` -- `LLAMA_ARG_UBATCH` -- `LLAMA_ARG_N_GPU_LAYERS` -- 
`LLAMA_ARG_THREADS_HTTP` -- `LLAMA_ARG_CHAT_TEMPLATE` -- `LLAMA_ARG_N_PREDICT` -- `LLAMA_ARG_ENDPOINT_METRICS` -- `LLAMA_ARG_ENDPOINT_SLOTS` -- `LLAMA_ARG_EMBEDDINGS` -- `LLAMA_ARG_FLASH_ATTN` -- `LLAMA_ARG_DEFRAG_THOLD` +- `LLAMA_CACHE`: cache directory, used by `--hf-repo` +- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo` +- `LLAMA_ARG_MODEL`: equivalent to `-m` +- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo` +- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file` +- `LLAMA_ARG_THREADS`: equivalent to `-t` +- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c` +- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np` +- `LLAMA_ARG_BATCH`: equivalent to `-b` +- `LLAMA_ARG_UBATCH`: equivalent to `-ub` +- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl` +- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http` +- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template` +- `LLAMA_ARG_N_PREDICT`: equivalent to `-n` +- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`) +- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`) +- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`) +- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`) +- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt` +- `LLAMA_ARG_HOST`: equivalent to `--host` +- `LLAMA_ARG_PORT`: equivalent to `--port` + +Example usage of docker compose with environment variables: + +```yml +services: + llamacpp-server: + image: ghcr.io/ggerganov/llama.cpp:server + ports: + - 8080:8080 + volumes: + - ./models:/models + environment: + LLAMA_ARG_MODEL: /models/my_model.gguf + LLAMA_ARG_CTX_SIZE: 4096 + LLAMA_ARG_N_PARALLEL: 2 + LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0 + LLAMA_ARG_HOST: 0.0.0.0 + LLAMA_ARG_PORT: 8080 +``` ## Build From b3eed8947803ccbd13b5bf22d062e73f0228f334 Mon 
Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 21 Aug 2024 11:55:35 +0200 Subject: [PATCH 2/3] add LLAMA_ARG_HOST to server dockerfile --- .devops/llama-server-cuda.Dockerfile | 2 ++ .devops/llama-server-intel.Dockerfile | 2 ++ .devops/llama-server-rocm.Dockerfile | 2 ++ .devops/llama-server-vulkan.Dockerfile | 2 ++ .devops/llama-server.Dockerfile | 2 ++ common/common.cpp | 2 ++ examples/server/README.md | 4 +++- 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index 67328cf1c1788..1842489841f8c 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -24,6 +24,8 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} ENV GGML_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 +# Must be set to 0.0.0.0 so it can listen for requests from the host machine +ENV LLAMA_ARG_HOST=0.0.0.0 RUN make -j$(nproc) llama-server diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index f525658dddfe5..9c355b664f15e 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -26,6 +26,8 @@ RUN apt-get update && \ COPY --from=build /app/build/bin/llama-server /llama-server ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen for requests from the host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index 763b4cd3f1c2e..fd0e19ad6e49c 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ +# Must be set to 0.0.0.0 so it can listen for requests from the host machine +ENV LLAMA_ARG_HOST=0.0.0.0 # Enable cURL ENV LLAMA_CURL=1 diff --git a/.devops/llama-server-vulkan.Dockerfile 
b/.devops/llama-server-vulkan.Dockerfile index 13a61ffd8454b..93c5e0c26e691 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \ rm -rf /app ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen for requests from the host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index ff558604ebde2..02accc85e1368 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -21,6 +21,8 @@ RUN apt-get update && \ COPY --from=build /app/llama-server /llama-server ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen for requests from the host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/common/common.cpp b/common/common.cpp index 918b4e1f8e825..960d4fac09348 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -327,6 +327,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { void gpt_params_parse_from_env(gpt_params & params) { // we only care about server-related params for now get_env("LLAMA_ARG_MODEL", params.model); + get_env("LLAMA_ARG_MODEL_URL", params.model_url); + get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias); get_env("LLAMA_ARG_HF_REPO", params.hf_repo); get_env("LLAMA_ARG_HF_FILE", params.hf_file); get_env("LLAMA_ARG_THREADS", params.n_threads); diff --git a/examples/server/README.md b/examples/server/README.md index 839e9e5d6398e..26811721fb5cc 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -252,6 +252,8 @@ Available environment variables (if specified, these variables will override par - `LLAMA_CACHE`: cache directory, used by `--hf-repo` - `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo` - `LLAMA_ARG_MODEL`: equivalent to 
`-m` +- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu` +- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a` - `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo` - `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file` - `LLAMA_ARG_THREADS`: equivalent to `-t` @@ -282,11 +284,11 @@ services: volumes: - ./models:/models environment: + # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model LLAMA_ARG_MODEL: /models/my_model.gguf LLAMA_ARG_CTX_SIZE: 4096 LLAMA_ARG_N_PARALLEL: 2 LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0 - LLAMA_ARG_HOST: 0.0.0.0 LLAMA_ARG_PORT: 8080 ``` From ff0f3e8427f44cbef6b7b60b02721f1b9e3c781b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 21 Aug 2024 12:12:20 +0200 Subject: [PATCH 3/3] also add LLAMA_ARG_CONT_BATCHING --- common/common.cpp | 1 + examples/server/README.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 960d4fac09348..98d5108797cc1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -345,6 +345,7 @@ void gpt_params_parse_from_env(gpt_params & params) { get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); + get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); get_env("LLAMA_ARG_HOST", params.hostname); get_env("LLAMA_ARG_PORT", params.port); } diff --git a/examples/server/README.md b/examples/server/README.md index 26811721fb5cc..805e05b4a5114 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -266,9 +266,10 @@ Available environment variables (if specified, these variables will override par - `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template` - `LLAMA_ARG_N_PREDICT`: equivalent to `-n` - `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`) -- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to 
`--no-slots`) +- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default. - `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`) - `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`) +- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default. - `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt` - `LLAMA_ARG_HOST`: equivalent to `--host` - `LLAMA_ARG_PORT`: equivalent to `--port`