diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index b3927953b..b8f9fcba5 100644 --- a/.github/workflows/config/gpt2-ci.yaml +++ b/.github/workflows/config/gpt2-ci.yaml @@ -10,7 +10,7 @@ device: cpu num_replicas: 1 ipex: enabled: true - precision: bf16 + precision: fp32 model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index f13ec7e54..6d175be7a 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -9,7 +9,7 @@ workers_per_group: 2 device: CPU ipex: enabled: false - precision: bf16 + precision: fp32 model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m diff --git a/.github/workflows/config/update_inference_config.py b/.github/workflows/config/update_inference_config.py index 502bcf591..94c5b22b5 100644 --- a/.github/workflows/config/update_inference_config.py +++ b/.github/workflows/config/update_inference_config.py @@ -18,11 +18,14 @@ import argparse -def update_inference_config(config_file: str, output_file: str, deepspeed: bool, ipex: bool): +def update_inference_config( + config_file: str, output_file: str, deepspeed: bool, ipex: bool, vllm: bool +): with open(config_file) as f: config = yaml.load(f, Loader=yaml.FullLoader) config["deepspeed"] = deepspeed config["ipex"]["enabled"] = ipex + config["vllm"]["enabled"] = vllm with open(output_file, "w") as f: yaml.dump(config, f, sort_keys=False) @@ -34,10 +37,13 @@ def get_parser(): parser.add_argument("--output_file", type=str, required=True) parser.add_argument("--deepspeed", action="store_true") parser.add_argument("--ipex", action="store_true") + parser.add_argument("--vllm", action="store_true") return parser if __name__ == "__main__": parser = get_parser() args = parser.parse_args() - update_inference_config(args.config_file, args.output_file, args.deepspeed, args.ipex) + update_inference_config( + args.config_file, args.output_file, args.deepspeed, args.ipex, args.vllm + ) diff --git a/.github/workflows/workflow_finetune.yml b/.github/workflows/workflow_finetune.yml index 91950e553..cec4af99e 100644 --- a/.github/workflows/workflow_finetune.yml +++ b/.github/workflows/workflow_finetune.yml @@ -37,7 +37,7 @@ jobs: name: finetune strategy: matrix: - model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b] + model: [ EleutherAI/gpt-j-6b, NousResearch/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1] isPR: - ${{inputs.ci_type == 'pr'}} @@ -47,7 +47,6 @@ jobs: - { model: "EleutherAI/gpt-j-6b"} - { model: "NousResearch/Llama-2-7b-chat-hf"} - { model: "mistralai/Mistral-7B-v0.1"} - - { model: "google/gemma-2b"} runs-on: self-hosted diff --git a/.github/workflows/workflow_inference.yml b/.github/workflows/workflow_inference.yml index e01ef598e..a0cee8827 100644 --- a/.github/workflows/workflow_inference.yml +++ b/.github/workflows/workflow_inference.yml @@ -37,7 +37,7 @@ jobs: name: inference strategy: matrix: - model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, gemma-2b, deepseek-coder-33b-instruct] + model: [ 
gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-no-vllm, deepseek-coder-33b-instruct] isPR: - ${{inputs.ci_type == 'pr'}} @@ -48,8 +48,7 @@ jobs: - { model: "gpt-j-6b"} - { model: "mistral-7b-v0.1"} - { model: "mpt-7b-ipex-llm"} - - { model: "llama-2-7b-chat-hf-vllm"} - - { model: "gemma-2b"} + - { model: "llama-2-7b-chat-hf-no-vllm"} runs-on: self-hosted @@ -100,7 +99,7 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} source dev/scripts/ci-functions.sh - strat_ray ${TARGET} + start_ray ${TARGET} - name: Run Inference Test run: | diff --git a/.github/workflows/workflow_tests.yml b/.github/workflows/workflow_tests.yml index c74536866..912d59b42 100644 --- a/.github/workflows/workflow_tests.yml +++ b/.github/workflows/workflow_tests.yml @@ -117,6 +117,7 @@ jobs: source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh # Additional libraries required for pytest pip install -r ./tests/requirements.txt + bash ./dev/scripts/install-vllm-cpu.sh - name: Start Ray Cluster run: | @@ -179,7 +180,7 @@ jobs: run: | TARGET=${{steps.target.outputs.target}} source dev/scripts/ci-functions.sh - strat_ray ${TARGET} + start_ray ${TARGET} - name: Run Tests run: | diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 2a2a8570a..b78e6c021 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -424,9 +424,8 @@ def main(args: argparse.Namespace): random.seed(args.seed) np.random.seed(args.seed) - - route_prefix = all_models[args.model_name].route_prefix if args.simple: + route_prefix = all_models[args.model_name].route_prefix api_url = args.model_endpoint_base + route_prefix else: api_url = args.model_endpoint_base + "/v1/chat/completions" @@ -720,17 +719,20 @@ def main(args: argparse.Namespace): ) parser.add_argument( "--temperature", + type=float, default=None, help="The value used to modulate the next token probabilities.", ) parser.add_argument( "--top_p", + type=float, default=None, help="If set to float < 1, only the smallest set of most probable tokens \ with probabilities that add up to `Top p` or higher are kept for generation.", ) parser.add_argument( "--top_k", + type=float, default=None, help="The number of highest probability vocabulary tokens to keep \ for top-k-filtering.", diff --git a/dev/docker/Dockerfile.habana b/dev/docker/Dockerfile.habana index efdddf6c2..1972f60a6 100644 --- a/dev/docker/Dockerfile.habana +++ b/dev/docker/Dockerfile.habana @@ -7,13 +7,18 @@ WORKDIR /root/llm-on-ray COPY ./pyproject.toml . COPY ./MANIFEST.in . -# create llm_on_ray package directory to bypass the following 'pip install -e' command +# Create llm_on_ray package directory to bypass the following 'pip install -e' command RUN mkdir ./llm_on_ray RUN pip install -e . && \ pip install --upgrade-strategy eager optimum[habana] && \ pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.1 +# Install vllm habana env +RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d +# Reinstall ray because vllm downgrades the ray version +RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10" + # Optinal. 
Comment out if you are not using UI COPY ./dev/scripts/install-ui.sh /tmp @@ -30,3 +35,4 @@ ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1 ENV PT_HPU_LAZY_ACC_PAR_MODE=0 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + diff --git a/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed new file mode 100644 index 000000000..b0d42c2a5 --- /dev/null +++ b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed @@ -0,0 +1,47 @@ +# syntax=docker/dockerfile:1 +FROM ubuntu:22.04 + +ENV LANG C.UTF-8 + +WORKDIR /root/llm-on-ray + +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ + && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENV CONDA_DIR /opt/conda +RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \ + /bin/bash ~/miniforge.sh -b -p /opt/conda +ENV PATH $CONDA_DIR/bin:$PATH + +# setup env +SHELL ["/bin/bash", "--login", "-c"] + +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ + unset -f conda && \ + export PATH=$CONDA_DIR/bin/:${PATH} && \ + mamba config --add channels intel && \ + mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt + +COPY ./pyproject.toml . +COPY ./MANIFEST.in . +COPY ./dev/scripts/install-vllm-cpu.sh . + +# create llm_on_ray package directory to bypass the following 'pip install -e' command +RUN mkdir ./llm_on_ray + +RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \ + --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + +RUN ds_report + +# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s) +ARG CACHEBUST=1 +COPY ./dev/scripts/install-oneapi.sh /tmp +RUN /tmp/install-oneapi.sh + +# Install vllm-cpu +# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*) +RUN --mount=type=cache,target=/root/.cache/pip \ + source /opt/conda/bin/activate base && ./install-vllm-cpu.sh diff --git a/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed.pip_non_editable b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed.pip_non_editable new file mode 100644 index 000000000..7500c301a --- /dev/null +++ b/dev/docker/ci/Dockerfile.cpu_vllm_and_deepspeed.pip_non_editable @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1 +FROM ubuntu:22.04 + +ENV LANG C.UTF-8 + +WORKDIR /root/llm-on-ray + +RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \ + && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENV CONDA_DIR /opt/conda +RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \ + /bin/bash ~/miniforge.sh -b -p /opt/conda +ENV PATH $CONDA_DIR/bin:$PATH + +# setup env +SHELL ["/bin/bash", "--login", "-c"] + +RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ + unset -f conda && \ + export PATH=$CONDA_DIR/bin/:${PATH} && \ + mamba config --add channels intel && \ + mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt + +# copy all checkedout file for later non-editable pip +COPY . . 
+ +RUN --mount=type=cache,target=/root/.cache/pip pip install .[cpu,deepspeed] --extra-index-url https://download.pytorch.org/whl/cpu \ + --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + +RUN ds_report + +# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s) +ARG CACHEBUST=1 +COPY ./dev/scripts/install-oneapi.sh /tmp +RUN /tmp/install-oneapi.sh + +# Install vllm-cpu +# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*) +RUN --mount=type=cache,target=/root/.cache/pip \ + source /opt/conda/bin/activate base && ./install-vllm-cpu.sh diff --git a/dev/docker/ci/Dockerfile.habana_vllm b/dev/docker/ci/Dockerfile.habana_vllm index 0d68b7756..1972f60a6 100644 --- a/dev/docker/ci/Dockerfile.habana_vllm +++ b/dev/docker/ci/Dockerfile.habana_vllm @@ -15,7 +15,7 @@ RUN pip install -e . && \ pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.1 # Install vllm habana env -RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@ae3d6121 +RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d # Reinstall ray because vllm downgrades the ray version RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10" diff --git a/dev/docker/ci/Dockerfile.tests_cpu b/dev/docker/ci/Dockerfile.tests_cpu index 1438f5059..3720e9d4e 100644 --- a/dev/docker/ci/Dockerfile.tests_cpu +++ b/dev/docker/ci/Dockerfile.tests_cpu @@ -24,10 +24,11 @@ RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \ unset -f conda && \ export PATH=$CONDA_DIR/bin/:${PATH} && \ mamba config --add channels intel && \ - mamba install python==${python_v} + mamba install -y -c conda-forge python==${python_v} gxx=12.3 gxx_linux-64=12.3 libxcrypt COPY ./pyproject.toml . COPY ./MANIFEST.in . +COPY ./dev/scripts/install-vllm-cpu.sh . 
# create llm_on_ray package directory to bypass the following 'pip install -e' command RUN mkdir ./llm_on_ray @@ -39,3 +40,8 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-ind ARG CACHEBUST=1 COPY ./dev/scripts/install-oneapi.sh /tmp RUN /tmp/install-oneapi.sh + +# Install vllm-cpu +# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*) +RUN --mount=type=cache,target=/root/.cache/pip \ + source /opt/conda/bin/activate base && ./install-vllm-cpu.sh diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh index fde5f0717..3d3a62381 100644 --- a/dev/scripts/ci-functions.sh +++ b/dev/scripts/ci-functions.sh @@ -75,7 +75,7 @@ install_dependencies(){ docker exec "${TARGET}" bash -c "pip install -r ./tests/requirements.txt" } -strat_ray(){ +start_ray(){ local TARGET=$1 # Start Ray Cluster @@ -110,8 +110,8 @@ stop_container(){ declare -A DF_SUFFIX_MAPPER DF_SUFFIX_MAPPER=( ["mpt-7b-ipex-llm"]=".ipex-llm" - ["llama-2-7b-chat-hf-vllm"]=".vllm" - ["gpt-j-6b"]=".cpu_and_deepspeed.pip_non_editable" + ["llama-2-7b-chat-hf-no-vllm"]=".cpu_and_deepspeed" + ["gpt-j-6b"]=".cpu_vllm_and_deepspeed.pip_non_editable" ) @@ -120,14 +120,14 @@ get_DF_SUFFIX() { if [[ ${DF_SUFFIX_MAPPER[$key]+_} ]]; then echo "${DF_SUFFIX_MAPPER[$key]}" else - echo ".cpu_and_deepspeed" + echo ".cpu_vllm_and_deepspeed" fi } declare -A TARGET_SUFFIX_MAPPER TARGET_SUFFIX_MAPPER=( ["mpt-7b-ipex-llm"]="_ipex-llm" - ["llama-2-7b-chat-hf-vllm"]="_vllm" + ["llama-2-7b-chat-hf-no-vllm"]="_wo_vllm" ) get_TARGET_SUFFIX() { @@ -169,7 +169,7 @@ inference_deepspeed_test(){ local model=$2 if [[ ${model} =~ ^(gemma-2b|gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then echo ${model} is not supported! - elif [[ ! ${model} == "llama-2-7b-chat-hf-vllm" ]]; then + elif [[ ! ${model} == "llama-2-7b-chat-hf-no-vllm" ]]; then echo update_inference_config with deepspeed: docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file llm_on_ray/inference/models/\"${model}\".yaml --output_file \"${model}\".yaml.deepspeed --deepspeed" echo Start deepspeed simple serve : @@ -187,7 +187,7 @@ inference_restapi_test(){ if [[ ${model} == "mpt-7b-ipex-llm" ]]; then echo Start mpt-7b-ipex-llm simple serve : docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml" - elif [[ ! ${model} == "llama-2-7b-chat-hf-vllm" ]]; then + else echo Start "${TARGET}" serve : docker exec "${TARGET}" bash -c "llm_on_ray-serve --models ${model}" echo Http query: diff --git a/dev/scripts/install-vllm-cpu.sh b/dev/scripts/install-vllm-cpu.sh index 3e7481538..48822cae0 100755 --- a/dev/scripts/install-vllm-cpu.sh +++ b/dev/scripts/install-vllm-cpu.sh @@ -4,7 +4,6 @@ [[ -n $(which g++) ]] || { echo "GNU C++ Compiler (g++) is not found!"; exit 1; } [[ -n $(which pip) ]] || { echo "pip command is not found!"; exit 1; } -# g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda: # conda install -y -c conda-forge gxx=12.3 gxx_linux-64=12.3 libxcrypt version_greater_equal() { @@ -14,13 +13,20 @@ gcc_version=$(g++ --version | grep -o -E '[0-9]+\.[0-9]+\.[0-9]+' | head -n1) echo echo Current GNU C++ Compiler version: $gcc_version echo -version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; } - -VLLM_VERSION=0.4.1 +VLLM_VERSION=0.5.2 echo Installing vLLM v$VLLM_VERSION ... 
# Install VLLM from source, refer to https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html for details -# We use this one-liner to install latest vllm-cpu -MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \ +is_avx512_available=$(cat /proc/cpuinfo | grep avx512) +if [ -z "$is_avx512_available" ]; then + echo "AVX512 is not available, vLLM CPU backend using other ISA types." + MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu VLLM_CPU_DISABLE_AVX512="true" pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \ --extra-index-url https://download.pytorch.org/whl/cpu +else + # g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda: + version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; } + echo "Install vllm-cpu with AVX512 ISA support" + MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu pip install -v git+https://github.com/vllm-project/vllm.git@v$VLLM_VERSION \ + --extra-index-url https://download.pytorch.org/whl/cpu +fi echo Done! \ No newline at end of file diff --git a/llm_on_ray/inference/api_openai_backend/router_app.py b/llm_on_ray/inference/api_openai_backend/router_app.py index a9f63da7b..35868bb24 100644 --- a/llm_on_ray/inference/api_openai_backend/router_app.py +++ b/llm_on_ray/inference/api_openai_backend/router_app.py @@ -34,13 +34,14 @@ # import os -from typing import AsyncGenerator, List +from typing import AsyncGenerator, List, Dict, Union import uuid import async_timeout from fastapi import FastAPI, status from fastapi import Response as FastAPIResponse from fastapi.middleware.cors import CORSMiddleware -from starlette.responses import Response, StreamingResponse +from starlette.responses import Response, StreamingResponse, JSONResponse +from starlette.requests import Request from llm_on_ray.inference.logger import get_logger from llm_on_ray.inference.api_openai_backend.request_handler import ( OpenAIHTTPException, @@ -68,6 +69,19 @@ logger = get_logger(__name__) +try: + from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest as vllm_ChatCompletionRequest, + ChatCompletionResponse as vllm_ChatCompletionResponse, + ) + from llm_on_ray.inference.inference_config import ( + DEVICE_HPU, + DEVICE_CUDA, + ) +except Exception: + logger.warning("VLLM package is not installed") + # timeout in 10 minutes. 
Streaming can take longer than 3 min TIMEOUT = float(os.environ.get("ROUTER_HTTP_TIMEOUT", 1800)) @@ -243,8 +257,25 @@ class Router: def __init__( self, query_client: RouterQueryClient, + model_configs: Dict, + max_num_seqs: int, ) -> None: self.query_client = query_client + self.vllm_openai_serving_chat = {} + for infer_name, infer_conf in model_configs.items(): + if infer_conf.vllm.enabled and infer_conf.device in [DEVICE_HPU, DEVICE_CUDA]: + from llm_on_ray.inference.predictors.vllm_predictor import VllmPredictor + + predictor = VllmPredictor(infer_conf, max_num_seqs) + serving_chat = OpenAIServingChat( + predictor.engine, + infer_conf.name, + infer_conf.vllm.response_role, + infer_conf.model_description.chat_template, + ) + else: + serving_chat = None + self.vllm_openai_serving_chat[infer_name] = serving_chat @router_app.get("/v1/models", response_model=ModelList) async def models(self) -> ModelList: @@ -332,7 +363,8 @@ async def completions( @router_app.post("/v1/chat/completions") async def chat( self, - body: ChatCompletionRequest, + body: Union[ChatCompletionRequest, vllm_ChatCompletionRequest], + raw_request: Request, response: FastAPIResponse, ): """Given a prompt, the model will return one or more predicted completions, @@ -341,57 +373,68 @@ async def chat( Returns: A response object with completions. """ - prompt = Prompt( - prompt=body.messages, - parameters=dict(body), - tools=body.tools, - tool_choice=body.tool_choice, - ) - request_id = f"chatcmpl-{str(uuid.uuid4().hex)}" - if body.stream: - return StreamingResponse( - _chat_completions_wrapper( - request_id, - body, - response, - self.query_client.query(body.model, prompt, request_id, body.stream), - ), - media_type="text/event-stream", - ) + serving_chat = self.vllm_openai_serving_chat[body.model] + if serving_chat: + generator = await serving_chat.create_chat_completion(body, raw_request=raw_request) + if body.stream: + return StreamingResponse(content=generator, media_type="text/event-stream") + else: + assert isinstance(generator, vllm_ChatCompletionResponse) + return JSONResponse(content=generator.model_dump()) else: - async with async_timeout.timeout(TIMEOUT): - results_reponse = self.query_client.query( - body.model, prompt, request_id, body.stream + prompt = Prompt( + prompt=body.messages, + parameters=dict(body), + tools=body.tools, + tool_choice=body.tool_choice, + ) + request_id = f"chatcmpl-{str(uuid.uuid4().hex)}" + if body.stream: + return StreamingResponse( + _chat_completions_wrapper( + request_id, + body, + response, + self.query_client.query(body.model, prompt, request_id, body.stream), + ), + media_type="text/event-stream", ) - async for results in results_reponse: - if results.error: - raise OpenAIHTTPException( - message=results.error.message, - status_code=results.error.code, - type=results.error.type, - ) - - if results.tool_calls is not None: - msg = ChatMessage(role="assistant", tool_calls=results.tool_calls) - # deleting this fields so that they don't appear in the response - del msg.tool_call_id - else: - msg = ChatMessage(role="assistant", content=results.generated_text or "") + else: + async with async_timeout.timeout(TIMEOUT): + results_reponse = self.query_client.query( + body.model, prompt, request_id, body.stream + ) + async for results in results_reponse: + if results.error: + raise OpenAIHTTPException( + message=results.error.message, + status_code=results.error.code, + type=results.error.type, + ) - usage = UsageInfo.from_response(results.dict()) - return ChatCompletionResponse( - 
id=request_id, - object="chat.completion", - model=body.model, - choices=[ - ChatCompletionResponseChoice( - index=0, - message=msg, - finish_reason=results.finish_reason, + if results.tool_calls is not None: + msg = ChatMessage(role="assistant", tool_calls=results.tool_calls) + # deleting this fields so that they don't appear in the response + del msg.tool_call_id + else: + msg = ChatMessage( + role="assistant", content=results.generated_text or "" ) - ], - usage=usage, - ) + + usage = UsageInfo.from_response(results.dict()) + return ChatCompletionResponse( + id=request_id, + object="chat.completion", + model=body.model, + choices=[ + ChatCompletionResponseChoice( + index=0, + message=msg, + finish_reason=results.finish_reason, + ) + ], + usage=usage, + ) @router_app.get("/v1/health_check") async def health_check(self) -> bool: diff --git a/llm_on_ray/inference/api_server_openai.py b/llm_on_ray/inference/api_server_openai.py index 6b5a0e2db..dcc1ee85f 100644 --- a/llm_on_ray/inference/api_server_openai.py +++ b/llm_on_ray/inference/api_server_openai.py @@ -38,7 +38,7 @@ from llm_on_ray.inference.api_openai_backend.router_app import Router, router_app -def router_application(deployments, model_list, max_ongoing_requests): +def router_application(deployments, model_list, max_ongoing_requests, max_num_seqs): """Create a Router Deployment. Router Deployment will point to a Serve Deployment for each specified base model, @@ -68,11 +68,13 @@ def router_application(deployments, model_list, max_ongoing_requests): ), # Maximum backlog for a single replica )(serve.ingress(router_app)(Router)) - return RouterDeployment.bind(merged_client) + return RouterDeployment.bind(merged_client, model_list, max_num_seqs) -def openai_serve_run(deployments, model_list, host, route_prefix, port, max_ongoing_requests): - router_app = router_application(deployments, model_list, max_ongoing_requests) +def openai_serve_run( + deployments, model_list, host, route_prefix, port, max_ongoing_requests, max_num_seqs +): + router_app = router_application(deployments, model_list, max_ongoing_requests, max_num_seqs) serve.start(http_options={"host": host, "port": port}) serve.run( diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 7d405c7c7..1d1759685 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -44,7 +44,7 @@ class ModelConfig(BaseModel): class Ipex(BaseModel): - enabled: bool = True + enabled: bool = False precision: str = "bf16" @validator("precision") @@ -59,6 +59,12 @@ class Vllm(BaseModel): max_num_seqs: int = 256 precision: str = "bf16" enforce_eager: bool = False + tensor_parallel_size: int = 1 + gpu_memory_utilization: float = 0.90 + block_size: int = 16 + max_seq_len_to_capture: int = 8192 + response_role: str = "assistant" + lora_modules: Union[str, None] = None @validator("precision") def _check_precision(cls, v: str): diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 37e18acf4..55eb6b35b 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -2,13 +2,10 @@ port: 8000 name: CodeLlama-7b-hf route_prefix: /CodeLlama-7b-hf num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: codellama/CodeLlama-7b-hf 
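Note on the model config changes above and below: switching a model's YAML from `ipex` to `vllm: enabled: true` only changes which predictor backs the deployment; the OpenAI-compatible `/v1/chat/completions` route in `router_app.py` keeps the same client contract whether vLLM's `OpenAIServingChat` or the legacy query path handles the request. A minimal client sketch, assuming a local `llm_on_ray-serve` deployment on port 8000 with the CodeLlama-7b-hf config above (the port and model name are assumptions to adjust for your setup):

```python
# Minimal client sketch against the OpenAI-compatible endpoint served by router_app.py.
# Assumes a llm_on_ray-serve deployment is already listening on localhost:8000 and that
# the CodeLlama-7b-hf model above is deployed; both values are assumptions, not fixed.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "CodeLlama-7b-hf",
        "messages": [{"role": "user", "content": "Write a function that reverses a string."}],
        "stream": False,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

Streaming works the same way: set `"stream": true` and consume the server-sent events, which both code paths above return as `text/event-stream`.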
diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index 12d2b0372..d630ffaaf 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -2,12 +2,9 @@ port: 8000 name: bloom-560m route_prefix: /bloom-560m num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: +cpus_per_worker: 24 +vllm: enabled: true precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml index adc1d158c..310134474 100644 --- a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml +++ b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml @@ -2,13 +2,10 @@ port: 8000 name: deepseek-coder-33b-instruct route_prefix: /deepseek-coder-33b-instruct num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: deepseek-ai/deepseek-coder-33b-instruct diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 6e5bde761..dfdb1798c 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -2,13 +2,10 @@ port: 8000 name: deplot route_prefix: /deplot num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: google/deplot diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 119337d70..5c02cf70f 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -2,13 +2,10 @@ port: 8000 name: falcon-7b route_prefix: /falcon-7b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: tiiuae/falcon-7b diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 77d33ff9b..e065b8ade 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -2,13 +2,10 @@ port: 8000 name: fuyu-8b route_prefix: /fuyu-8b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: adept/fuyu-8b diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 5b013b371..7f743a028 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -2,12 +2,9 @@ port: 8000 name: gemma-2b route_prefix: /gemma-2b num_replicas: 1 -cpus_per_worker: 2 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: +cpus_per_worker: 24 +vllm: enabled: true precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index 9719b2f7e..2dffc062e 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -2,14 +2,10 @@ port: 8000 name: gpt-j-6b 
route_prefix: /gpt-j-6b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - # false here for ci coverage - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: EleutherAI/gpt-j-6b diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 06a4b1b8b..81021a85d 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -2,12 +2,9 @@ port: 8000 name: gpt2 route_prefix: /gpt2 num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: +cpus_per_worker: 24 +vllm: enabled: true precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf-autoscaling.yaml similarity index 83% rename from llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml rename to llm_on_ray/inference/models/llama-2-7b-chat-hf-autoscaling.yaml index ba32990a6..b8c50951a 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-autoscaling.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf-autoscaling.yaml @@ -9,18 +9,12 @@ autoscaling_config: target_ongoing_requests: 24 downscale_delay_s: 30 upscale_delay_s: 10 +device: cpu cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false vllm: enabled: true max_num_seqs: 64 precision: bf16 -workers_per_group: 2 -device: cpu -ipex: - enabled: false - precision: bf16 model_description: model_id_or_path: NousResearch/Llama-2-7b-chat-hf tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf-no-vllm.yaml similarity index 59% rename from llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml rename to llm_on_ray/inference/models/llama-2-7b-chat-hf-no-vllm.yaml index 29d562aa9..83acddba2 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf-no-vllm.yaml @@ -1,17 +1,10 @@ port: 8000 -name: llama-2-7b-chat-hf -route_prefix: /llama-2-7b-chat-hf +name: llama-2-7b-chat-hf-no-vllm +route_prefix: /llama-2-7b-chat-hf-no-vllm num_replicas: 1 +device: cpu cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false vllm: - enabled: true - max_num_seqs: 256 - precision: bf16 -workers_per_group: 2 -device: cpu -ipex: enabled: false precision: bf16 model_description: diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 81cb74d98..d4fe78093 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -2,14 +2,10 @@ port: 8000 name: llama-2-7b-chat-hf route_prefix: /llama-2-7b-chat-hf num_replicas: 1 -dynamic_max_batch_size: 8 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: NousResearch/Llama-2-7b-chat-hf diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index ea50f6af7..c10f2e2cd 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -1,13 +1,10 @@ 
port: 8000 name: mistral-7b-instruct-v0.2 route_prefix: /mistral-7b-instruct-v0.2 -cpus_per_worker: 48 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 3654f18f0..5767e8955 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -2,13 +2,10 @@ port: 8000 name: mistral-7b-v0.1 route_prefix: /mistral-7b-v0.1 num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 89ce086ed..42b6eefad 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -2,13 +2,10 @@ port: 8000 name: mpt-7b route_prefix: /mpt-7b num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 8f32c28b7..fca4487e1 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -2,13 +2,10 @@ port: 8000 name: neural-chat-7b-v3-1 route_prefix: /neural-chat-7b-v3-1 num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 81e05fc19..6bf1c728b 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -2,13 +2,10 @@ port: 8000 name: opt-125m route_prefix: /opt-125m num_replicas: 1 -cpus_per_worker: 24 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: facebook/opt-125m diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index daa5256c5..f9eabb8ae 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -1,13 +1,10 @@ port: 8000 name: sqlcoder-7b-2 route_prefix: /sqlcoder-7b-2 -cpus_per_worker: 22 -gpus_per_worker: 0 -deepspeed: false -workers_per_group: 2 device: cpu -ipex: - enabled: false +cpus_per_worker: 24 +vllm: + enabled: true precision: bf16 model_description: model_id_or_path: defog/sqlcoder-7b-2 diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 199926353..1ff137a42 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -2,14 +2,11 @@ port: 8000 name: starcoder route_prefix: /starcoder num_replicas: 1 +device: cpu cpus_per_worker: 24 -gpus_per_worker: 0 
-deepspeed: false -workers_per_group: 2 -ipex: - enabled: false +vllm: + enabled: true precision: bf16 -device: cpu model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index ed67f5119..73d1e4702 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -28,7 +28,7 @@ from fastapi import HTTPException from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess -from llm_on_ray.inference.inference_config import InferenceConfig +from llm_on_ray.inference.inference_config import InferenceConfig, DEVICE_HPU, DEVICE_CUDA from llm_on_ray.inference.api_openai_backend.openai_protocol import ( ChatMessage, ErrorResponse, @@ -66,6 +66,7 @@ def __init__( # Used to determine if openai backend is used self.use_openai = False + self.vllm_openai_serving_chat = None if infer_conf.device == "hpu" and not self.use_vllm: from llm_on_ray.inference.predictors.hpu_predictor import HPUPredictor @@ -76,9 +77,12 @@ def __init__( self.predictor = DeepSpeedPredictor(infer_conf) elif self.use_vllm: - from llm_on_ray.inference.predictors.vllm_predictor import VllmPredictor + if infer_conf.device not in [DEVICE_HPU, DEVICE_CUDA]: + from llm_on_ray.inference.predictors.vllm_predictor import VllmPredictor - self.predictor = VllmPredictor(infer_conf, max_num_seqs) + self.predictor = VllmPredictor(infer_conf, max_num_seqs) + else: + self.predictor = None elif self.is_mllm: from llm_on_ray.inference.predictors.mllm_predictor import MllmPredictor diff --git a/llm_on_ray/inference/predictors/vllm_predictor.py b/llm_on_ray/inference/predictors/vllm_predictor.py index d3d09414a..1a163580a 100644 --- a/llm_on_ray/inference/predictors/vllm_predictor.py +++ b/llm_on_ray/inference/predictors/vllm_predictor.py @@ -26,6 +26,8 @@ InferenceConfig, ModelGenerateResult, PRECISION_BF16, + DEVICE_HPU, + DEVICE_CUDA, ) @@ -43,17 +45,26 @@ def __init__(self, infer_conf: InferenceConfig, max_num_seqs): # The default value is 40GB. 
os.environ["VLLM_CPU_KVCACHE_SPACE"] = str(self.VLLM_CPU_KVCACHE_SPACE_DEFAULT) - args = AsyncEngineArgs( + engine_args = AsyncEngineArgs( model=model_desc.model_id_or_path, + tokenizer=model_desc.tokenizer_name_or_path, trust_remote_code=model_config.trust_remote_code, device=infer_conf.device, dtype=dtype, disable_log_requests=True, max_num_seqs=max_num_seqs, + gpu_memory_utilization=infer_conf.vllm.gpu_memory_utilization, + tensor_parallel_size=infer_conf.vllm.tensor_parallel_size, + block_size=infer_conf.vllm.block_size, + max_seq_len_to_capture=infer_conf.vllm.max_seq_len_to_capture, enforce_eager=infer_conf.vllm.enforce_eager, ) - - self.engine = AsyncLLMEngine.from_engine_args(args) + if ( + infer_conf.device in [DEVICE_HPU, DEVICE_CUDA] + and infer_conf.vllm.tensor_parallel_size > 1 + ): + engine_args.worker_use_ray = True + self.engine = AsyncLLMEngine.from_engine_args(engine_args) def update_vllm_config(self, **config): # need to update the keys of config if vllm engine is used diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index ecd3bdee8..60022690f 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -24,6 +24,8 @@ ModelDescription, InferenceConfig, all_models, + DEVICE_HPU, + DEVICE_CUDA, ) @@ -55,21 +57,40 @@ def get_deployed_models(args): deployments = {} for model_id, infer_conf in model_list.items(): ray_actor_options = get_deployment_actor_options(infer_conf) - depolyment_config = { + deployment_config = { "ray_actor_options": ray_actor_options, "max_ongoing_requests": infer_conf.max_ongoing_requests if not args.max_ongoing_requests else args.max_ongoing_requests, } if infer_conf.autoscaling_config: - depolyment_config["autoscaling_config"] = infer_conf.autoscaling_config.dict() + deployment_config["autoscaling_config"] = infer_conf.autoscaling_config.dict() elif infer_conf.num_replicas: - depolyment_config["num_replicas"] = infer_conf.num_replicas + deployment_config["num_replicas"] = infer_conf.num_replicas max_num_seqs = infer_conf.vllm.max_num_seqs if not args.max_num_seqs else args.max_num_seqs dynamic_max_batch_size = ( infer_conf.dynamic_max_batch_size if not args.max_batch_size else args.max_batch_size ) - deployments[model_id] = PredictorDeployment.options(**depolyment_config).bind( + device = infer_conf.device + if infer_conf.vllm.enabled and (not args.simple) and device in [DEVICE_HPU, DEVICE_CUDA]: + tp = infer_conf.vllm.tensor_parallel_size + if tp > 1: + deployment_config["ray_actor_options"].pop("resources", None) + pg_resources = [] + pg_resources.append( + {"CPU": 2} + ) # One is for PredictorDeployment replica, and the other is for Router replica + # When device is HPU, the resources of workers will be allocated in vllm engine. + if device == DEVICE_CUDA: + for i in range(tp): + # for the vLLM actors on GPU + pg_resources.append( + {"CPU": infer_conf.cpus_per_worker, "GPU": infer_conf.gpus_per_worker} + ) + deployment_config["placement_group_bundles"] = pg_resources + deployment_config["placement_group_strategy"] = "STRICT_PACK" + + deployments[model_id] = PredictorDeployment.options(**deployment_config).bind( infer_conf, max_num_seqs, dynamic_max_batch_size ) @@ -123,7 +144,7 @@ def main(argv=None): parser.add_argument("--port", default=8000, type=int, help="The port of deployment address.") parser.add_argument( "--max_num_seqs", - default=None, + default=256, type=int, help="The batch size for vLLM. 
Used when vLLM is enabled.", ) @@ -158,7 +179,15 @@ def main(argv=None): host = "127.0.0.1" if args.serve_local_only else "0.0.0.0" print("Service is running with deployments:" + str(deployments)) print("Service is running models:" + str(model_list)) - openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests) + openai_serve_run( + deployments, + model_list, + host, + "/", + args.port, + args.max_ongoing_requests, + args.max_num_seqs, + ) msg = "Service is deployed successfully." if args.keep_serve_terminal: diff --git a/pyproject.toml b/pyproject.toml index 5a8e89306..f17820b12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ [project.optional-dependencies] cpu = [ "transformers>=4.38.0, <=4.38.1", - "intel_extension_for_pytorch==2.2.0", + "intel_extension_for_pytorch==2.3.100", "torch==2.2.0", "oneccl_bind_pt==2.2.0" ] diff --git a/tests/inference/test_chat_template.py b/tests/inference/test_chat_template.py index 4a987a841..421b6cdb5 100644 --- a/tests/inference/test_chat_template.py +++ b/tests/inference/test_chat_template.py @@ -57,38 +57,6 @@ False, "Hello\nHi there!\nWhat is the capital of\n", ), - ( - "google/gemma-2b", - base_path / "template_gemma.jinja", - True, - "<|endoftext|>\n" - "user\n" - "Hello\n" - "\n" - "model\n" - "Hi there!\n" - "\n" - "user\n" - "What is the capital of\n" - "\n" - "model\n" - "\n", - ), - ( - "google/gemma-2b", - base_path / "template_gemma.jinja", - False, - "<|endoftext|>\n" - "user\n" - "Hello\n" - "\n" - "model\n" - "Hi there!\n" - "\n" - "user\n" - "What is the capital of\n" - "\n", - ), ( "mistralai/Mistral-7B-v0.1", base_path / "template_mistral.jinja", diff --git a/tests/test_getting_started.sh b/tests/test_getting_started.sh index a84bfe334..6288b0543 100755 --- a/tests/test_getting_started.sh +++ b/tests/test_getting_started.sh @@ -15,6 +15,9 @@ pip install .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu --extr # Dynamic link oneCCL and Intel MPI libraries source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh +# Install vllm from source +bash ./dev/scripts/install-vllm-cpu.sh + echo "Step 2: Start ray cluster ..." ray start --head diff --git a/tests/test_setup.sh b/tests/test_setup.sh index 589b5a320..9f37119de 100755 --- a/tests/test_setup.sh +++ b/tests/test_setup.sh @@ -18,6 +18,7 @@ case $(echo $1 | tr 'a-z' 'A-Z') in "CPU") hardware=1 pip install .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + bash ./dev/scripts/install-vllm-cpu.sh ;; "GPU") pip install .[gpu] --extra-index-url https://developer.intel.com/ipex-whl-stable-xpu @@ -31,7 +32,7 @@ case $(echo $1 | tr 'a-z' 'A-Z') in ;; esac -# Check if it neesd deepspeed +# Check if it needs deepspeed if [ $(echo $2 | tr 'A-Z' 'a-z') == "true" ] then deepspeed=true
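
For readers following the `serve.py` changes above: when vLLM is enabled on HPU or CUDA with `tensor_parallel_size > 1`, the deployment drops the per-replica `resources` entry and instead declares an explicit placement group. A standalone sketch of the bundle layout it builds (the concrete numbers are illustrative; the real code reads them from the model's `InferenceConfig`):

```python
# Standalone sketch of the placement-group bundles built in serve.py for a vLLM
# deployment with tensor parallelism. The numbers are illustrative; the real code
# pulls them from the model's InferenceConfig.
device = "cuda"            # serve.py checks for DEVICE_CUDA / DEVICE_HPU
tensor_parallel_size = 2   # infer_conf.vllm.tensor_parallel_size
cpus_per_worker = 8        # infer_conf.cpus_per_worker
gpus_per_worker = 1        # infer_conf.gpus_per_worker

# First bundle hosts the PredictorDeployment replica and the Router replica.
pg_resources = [{"CPU": 2}]

# On CUDA, add one bundle per tensor-parallel vLLM worker; on HPU the vLLM engine
# allocates its own workers, so no extra bundles are added.
if device == "cuda":
    for _ in range(tensor_parallel_size):
        pg_resources.append({"CPU": cpus_per_worker, "GPU": gpus_per_worker})

deployment_config = {
    "placement_group_bundles": pg_resources,
    "placement_group_strategy": "STRICT_PACK",  # co-locate all bundles on one node
}
print(deployment_config)
```

`STRICT_PACK` keeps the replica and its tensor-parallel workers on a single node, which is why the diff also removes the `resources` entry from `ray_actor_options` before binding the deployment.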