diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh
index 749dbcd7bff..15a252f7f6d 100644
--- a/docker/llm/serving/xpu/docker/start-vllm-service.sh
+++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -19,6 +19,7 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --port 8000 \
   --model $model \
   --trust-remote-code \
+  --block-size 8 \
   --gpu-memory-utilization 0.9 \
   --device xpu \
   --dtype float16 \
diff --git a/docs/mddocs/DockerGuides/vllm_docker_quickstart.md b/docs/mddocs/DockerGuides/vllm_docker_quickstart.md
index 9cf6c72ae20..b841fffc5c8 100644
--- a/docs/mddocs/DockerGuides/vllm_docker_quickstart.md
+++ b/docs/mddocs/DockerGuides/vllm_docker_quickstart.md
@@ -103,6 +103,7 @@ Before performing benchmark or starting the service, you can refer to this [sect
 |`--max-model-len`| Model context length. If unspecified, will be automatically derived from the model config.|
 |`--max-num-batched-token`| Maximum number of batched tokens per iteration.|
 |`--max-num-seq`| Maximum number of sequences per iteration. Default: 256|
+|`--block-size`| vLLM block size. Set to 8 to achieve a performance boost.|
 
 #### Single card serving
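
As a quick sanity check after applying this patch, the OpenAI-compatible endpoint that `start-vllm-service.sh` exposes on port 8000 can be exercised with a standard completion request. The sketch below is illustrative only: the served model name and the prompt are assumptions, not values defined by this patch; substitute whatever the `$model` variable in your `start-vllm-service.sh` points to.

```bash
#!/bin/bash
# Smoke test for the patched service: send one completion request to the
# OpenAI-compatible endpoint started by start-vllm-service.sh (port 8000,
# as set in the script). Run this in a separate shell while the server is up.
# NOTE: "Llama-2-7b-chat-hf" and the prompt are placeholder assumptions.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Llama-2-7b-chat-hf",
        "prompt": "San Francisco is a",
        "max_tokens": 32,
        "temperature": 0
      }'
```

For background, vLLM's `--block-size` sets the number of tokens per KV cache block (upstream vLLM accepts 8, 16, or 32, with 16 as the default); this patch pins it to 8 for the XPU serving setup.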