043 release fixes (#40)
This includes some fixes for supporting vllm 0.4.3+.

Mostly, the `generate` API changed, so we have to update our gRPC server accordingly.
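
For context, a minimal sketch of the signature change, based on the diff below (the `text`, `token_ids`, and sampling values here are illustrative; the `engine.generate` calls are shown as comments since no engine is constructed):

```python
from vllm import SamplingParams
from vllm.inputs import TextTokensPrompt

text = "hello world"        # illustrative prompt text
token_ids = [15339, 1917]   # illustrative pre-tokenized prompt ids
sampling_params = SamplingParams(max_tokens=16)

# vllm <= 0.4.2: text and token ids were separate kwargs on AsyncLLMEngine.generate:
#   engine.generate(prompt=text, prompt_token_ids=token_ids,
#                   sampling_params=sampling_params, request_id="req-0")

# vllm >= 0.4.3: they are bundled into a single `inputs` object:
inputs = TextTokensPrompt(prompt=text, prompt_token_ids=token_ids)
#   engine.generate(inputs=inputs,
#                   sampling_params=sampling_params, request_id="req-0")
```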

---------

Signed-off-by: Joe Runde <[email protected]>
joerunde authored Jun 4, 2024
1 parent a17c8fb commit ef3e030
Showing 2 changed files with 19 additions and 11 deletions.
10 changes: 5 additions & 5 deletions Dockerfile.ubi
@@ -161,7 +161,7 @@ RUN microdnf install -y \
 
 ARG PYTHON_VERSION
 # 0.4.2 is built for CUDA 12.1 and PyTorch 2.3.0
-ARG VLLM_WHEEL_VERSION=0.4.2
+ARG VLLM_WHEEL_VERSION=0.4.3
 
 RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
 && unzip vllm.whl \
@@ -208,12 +208,12 @@ COPY --link vllm vllm
 # Comment if building *.so files from scratch
 ##################################################
 # Copy the prebuilt *.so files
-# COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
-# ENV VLLM_USE_PRECOMPILED=1
+COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
+ENV VLLM_USE_PRECOMPILED=1
 ##################################################
 # Comment if not building .so files from scratch
-RUN microdnf install -y git \
-&& microdnf clean all
+#RUN microdnf install -y git \
+# && microdnf clean all
 ##################################################
 
 # Copy over the generated *.pb2 files
20 changes: 14 additions & 6 deletions vllm/entrypoints/grpc/grpc_server.py
@@ -31,6 +31,7 @@
                               TokenizeResponse)
 from vllm.entrypoints.grpc.validation import validate_input, validate_params
 from vllm.entrypoints.openai.serving_completion import merge_async_iterators
+from vllm.inputs import TextTokensPrompt
 from vllm.logger import init_logger
 from vllm.sequence import Logprob
 from vllm.tgis_utils import logs
@@ -151,13 +152,16 @@ async def Generate(self, request: BatchedGenerationRequest,
             input_ids, max_is_token_limit[i]\
                 = await self._validate_prompt_and_tokenize(
                     sampling_params, truncate_input_tokens, req.text, context)
+            inputs = TextTokensPrompt(
+                prompt=req.text,
+                prompt_token_ids=input_ids
+            )
             generators.append(
                 # prompt is supplied for observability, the text is not
                 # re-tokenized when `prompt_token_ids` is supplied
-                self.engine.generate(prompt=req.text,
+                self.engine.generate(inputs=inputs,
                                      sampling_params=sampling_params,
-                                     request_id=f"{request_id}-{i}",
-                                     prompt_token_ids=input_ids),
+                                     request_id=f"{request_id}-{i}"),
             )
 
         # TODO handle cancellation
@@ -213,13 +217,17 @@ async def GenerateStream(
             sampling_params, truncate_input_tokens, request.request.text,
             context)
 
+        inputs = TextTokensPrompt(
+            prompt=request.request.text,
+            prompt_token_ids=input_ids
+        )
+
         result_generator = self.engine.generate(
             # prompt is supplied for observability, the text is not
             # re-tokenized when `prompt_token_ids` is supplied
-            prompt=request.request.text,
+            inputs=inputs,
             sampling_params=sampling_params,
-            request_id=request_id,
-            prompt_token_ids=input_ids,
+            request_id=request_id
         )
 
         resp_options = request.params.response
