From 51ff9ebd8a07c5f94e1b78928e5710666ea690e7 Mon Sep 17 00:00:00 2001
From: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:29:16 +0800
Subject: [PATCH] Upgrade oneccl version to 0.0.6.3 (#12560)

* Update Dockerfile

* Update Dockerfile

* Update start-vllm-service.sh
---
 docker/llm/serving/xpu/docker/Dockerfile            | 4 ++--
 docker/llm/serving/xpu/docker/start-vllm-service.sh | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 0b1e7267047..9e64990a90f 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -64,8 +64,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     cd /tmp/ && \
     pip install torch==2.1.0.post2 torchvision==0.16.0.post2 torchaudio==2.1.0.post2 intel-extension-for-pytorch==2.1.30.post0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ && \
     # Internal oneccl
-    wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.2-release/oneccl_wks_installer_2024.0.0.6.2.sh && \
-    bash oneccl_wks_installer_2024.0.0.6.2.sh && \
+    wget https://sourceforge.net/projects/oneccl-wks/files/2024.0.0.6.3-release/oneccl_wks_installer_2024.0.0.6.3.sh && \
+    bash oneccl_wks_installer_2024.0.0.6.3.sh && \
     git clone https://github.com/intel/torch-ccl -b v2.1.300+xpu && \
     cd torch-ccl && \
     patch -p1 < /tmp/oneccl-binding.patch && \
diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh
index 30b8e53102e..576c3b4a42a 100644
--- a/docker/llm/serving/xpu/docker/start-vllm-service.sh
+++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -3,6 +3,7 @@
 model="YOUR_MODEL_PATH"
 served_model_name="YOUR_MODEL_NAME"
 export CCL_WORKER_COUNT=2
+export SYCL_CACHE_PERSISTENT=1
 export FI_PROVIDER=shm
 export CCL_ATL_TRANSPORT=ofi
 export CCL_ZE_IPC_EXCHANGE=sockets
@@ -11,6 +12,9 @@ export CCL_ATL_SHM=1
 export USE_XETLA=OFF
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
 export TORCH_LLM_ALLREDUCE=0
+
+export CCL_SAME_STREAM=1
+export CCL_BLOCKING_WAIT=0
 
 source /opt/intel/1ccl-wks/setvars.sh
 
@@ -20,14 +24,14 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --model $model \
   --trust-remote-code \
   --block-size 8 \
-  --gpu-memory-utilization 0.9 \
+  --gpu-memory-utilization 0.95 \
   --device xpu \
   --dtype float16 \
   --enforce-eager \
   --load-in-low-bit fp8 \
   --max-model-len 2048 \
   --max-num-batched-tokens 4000 \
-  --max-num-seqs 12 \
+  --max-num-seqs 256 \
   --tensor-parallel-size 1 \
   --disable-async-output-proc \
   --distributed-executor-backend ray
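
After rebuilding the image with the 0.0.6.3 oneCCL installer and relaunching the container, the service exposes vLLM's OpenAI-compatible API, so a minimal request can confirm it came up with the new settings. The sketch below is an assumption-laden example, not part of the patch: it assumes the server listens on vLLM's default port 8000 (the script shown does not pass --port) and uses the placeholder model name from start-vllm-service.sh; substitute the values from your deployment.

    # Hedged smoke test against the vLLM OpenAI-compatible endpoint.
    # Port 8000 and "YOUR_MODEL_NAME" are assumptions taken from vLLM
    # defaults and the script's placeholder, not from this commit.
    curl http://localhost:8000/v1/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "YOUR_MODEL_NAME", "prompt": "Hello", "max_tokens": 32}'

A 200 response with a completion payload indicates the upgraded container is serving; with --max-num-seqs raised to 256 and --gpu-memory-utilization at 0.95, the same endpoint can then be driven with many concurrent requests to exercise the larger batch limit.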