diff --git a/common/arg.cpp b/common/arg.cpp index 60e37a89a68e8..922391069d32a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1312,7 +1312,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params, int value) { params.n_parallel = value; } - )); + ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(llama_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", params.n_sequences), diff --git a/examples/server/README.md b/examples/server/README.md index 168e14a9908d8..7a5d26ca08ba3 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -87,7 +87,7 @@ The project is under active development, and we are [looking for feedback and co | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | -| `-np, --parallel N` | number of parallel sequences to decode (default: 1) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing |