Skip to content

Commit

Permalink
[Cherry-Pick] vLLM multi-node template (#292)
Browse files Browse the repository at this point in the history
* [RHOAIENG-15386] add a new servingruntime vllm-multinode-template (#284)

* add a new servingruntime vllm-multinode-template

Signed-off-by: jooho lee <[email protected]>

* follow up comments

Signed-off-by: jooho lee <[email protected]>

* follow up comments

Signed-off-by: jooho lee <[email protected]>

* follow up comments

Signed-off-by: jooho lee <[email protected]>

* follow up comments

Signed-off-by: jooho lee <[email protected]>

---------

Signed-off-by: jooho lee <[email protected]>

* Fix ServingRuntime args (#291)

* fix runtime args

Signed-off-by: jooho lee <[email protected]>

* add disable_custom_all_reduce solving vllm-8735 issue

Signed-off-by: jooho lee <[email protected]>

---------

Signed-off-by: jooho lee <[email protected]>

---------

Signed-off-by: jooho lee <[email protected]>
  • Loading branch information
Jooho authored Nov 12, 2024
1 parent 479aece commit 36ca97c
Show file tree
Hide file tree
Showing 3 changed files with 251 additions and 1 deletion.
6 changes: 6 additions & 0 deletions config/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ replacements:
name: vllm-runtime-template
fieldPaths:
- objects.0.spec.containers.0.image
- select:
kind: Template
name: vllm-multinode-runtime-template
fieldPaths:
- objects.0.spec.containers.0.image
- objects.0.spec.workerSpec.containers.0.image
- source:
kind: ConfigMap
version: v1
Expand Down
3 changes: 2 additions & 1 deletion config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ resources:
- tgis-template.yaml
- ovms-kserve-template.yaml
- vllm-template.yaml
- vllm-multinode-template.yaml
- vllm-rocm-template.yaml
- vllm-gaudi-template.yaml
- caikit-standalone-template.yaml
- caikit-standalone-template.yaml
243 changes: 243 additions & 0 deletions config/runtimes/vllm-multinode-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
# OpenShift Template installing the "vllm-multinode-runtime" ServingRuntime:
# a Ray-backed multi-node vLLM deployment for KServe (head + worker pods).
# The $(vllm-image) placeholder is substituted for both the head and worker
# containers by the kustomize replacement in config/base/kustomization.yaml.
apiVersion: template.openshift.io/v1
kind: Template
metadata:
  labels:
    opendatahub.io/dashboard: "true"
    opendatahub.io/ootb: "true"
  annotations:
    description: vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs
    openshift.io/display-name: vLLM ServingRuntime Multi Node for KServe
    openshift.io/provider-display-name: Red Hat, Inc.
    tags: rhods,rhoai,kserve,servingruntime,multi-node
    template.openshift.io/documentation-url: https://github.com/opendatahub-io/vllm
    template.openshift.io/long-description: This template defines resources needed to deploy vLLM ServingRuntime Multi-Node with KServe in Red Hat OpenShift AI
    opendatahub.io/modelServingSupport: '["single"]'
    opendatahub.io/apiProtocol: "REST"
  name: vllm-multinode-runtime-template
objects:
  - apiVersion: serving.kserve.io/v1alpha1
    kind: ServingRuntime
    metadata:
      name: vllm-multinode-runtime
      annotations:
        # Kept consistent with the template-level display name (was the
        # single-node "vLLM ServingRuntime for KServe").
        openshift.io/display-name: vLLM ServingRuntime Multi Node for KServe
        opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
      labels:
        opendatahub.io/dashboard: "true"
    spec:
      annotations:
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
      multiModel: false
      supportedModelFormats:
        - autoSelect: true
          name: vLLM
          priority: 2
      containers:
        # Ray head node: starts the Ray cluster, waits until all worker nodes
        # have joined, then launches the vLLM OpenAI-compatible API server.
        - name: kserve-container
          image: $(vllm-image)
          command: ["bash", "-c"]
          # NOTE(review): MODEL_NAME, MODEL_DIR, TENSOR_PARALLEL_SIZE and
          # PIPELINE_PARALLEL_SIZE are not set here — presumably injected by the
          # KServe controller at deploy time; verify against the controller.
          args:
            - |
              ray start --head --disable-usage-stats --include-dashboard false
              # wait for other node to join
              until [[ $(ray status --address $RAY_ADDRESS | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do
                echo "Waiting..."
                sleep 1
              done
              ray status
              export SERVED_MODEL_NAME=${MODEL_NAME}
              export MODEL_NAME=${MODEL_DIR}
              exec python3 -m vllm.entrypoints.openai.api_server --port=8080 --distributed-executor-backend ray --model=${MODEL_NAME} --served-model-name=${SERVED_MODEL_NAME} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE} --disable_custom_all_reduce
          env:
            - name: RAY_PORT
              value: "6379"
            - name: RAY_ADDRESS
              value: "127.0.0.1:6379"
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: VLLM_NO_USAGE_STATS
              value: "1"
            - name: HOME
              value: /tmp
            - name: HF_HOME
              value: /tmp/hf_home
          resources:
            limits:
              cpu: "16"
              memory: 48Gi
            requests:
              cpu: "8"
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
          # Liveness: unhealthy when the GPUs Ray reserved are not all in use.
          livenessProbe:
            failureThreshold: 2
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 15
            exec:
              command:
                - bash
                - -c
                - |
                  # Check the GPU usage reported by ray status
                  gpu_status=$(ray status --address $RAY_ADDRESS | grep GPU)
                  if [[ -z $gpu_status ]]; then
                    echo "Liveness Probe: GPU does not exist"
                    exit 1
                  fi
                  used_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f1)
                  reserved_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f2)
                  # Determine health status based on GPU usage
                  if [[ "$used_gpu" != "$reserved_gpu" ]]; then
                    echo "Liveness Probe: Unhealthy - Used: $used_gpu, Reserved: $reserved_gpu"
                    exit 1
                  fi
          # Readiness: all Ray nodes joined, all GPUs in use, model endpoint healthy.
          readinessProbe:
            failureThreshold: 3
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 15
            exec:
              command:
                - bash
                - -c
                - |
                  # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE
                  registered_node_count=$(ray status --address $RAY_ADDRESS | grep -c node_)
                  if [[ $registered_node_count -ne "$PIPELINE_PARALLEL_SIZE" ]]; then
                    echo "Readiness Probe: Unhealthy - Registered nodes count ($registered_node_count) does not match PIPELINE_PARALLEL_SIZE ($PIPELINE_PARALLEL_SIZE)."
                    exit 1
                  fi
                  # Check the GPU usage reported by ray status
                  gpu_status=$(ray status --address $RAY_ADDRESS | grep GPU)
                  if [[ -z $gpu_status ]]; then
                    echo "Readiness Probe: GPU does not exist"
                    exit 1
                  fi
                  used_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f1)
                  reserved_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f2)
                  # Determine health status based on GPU usage
                  if [[ "$used_gpu" != "$reserved_gpu" ]]; then
                    echo "Readiness Probe: Unhealthy - Used: $used_gpu, Reserved: $reserved_gpu"
                    exit 1
                  fi
                  # Check model health
                  curl --silent --max-time 5 --fail-with-body http://localhost:8080/health
          # NOTE(review): the ray status exit code below is discarded — the probe
          # result is the trailing curl only. Confirm whether that is intended.
          startupProbe:
            failureThreshold: 30
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 30
            exec:
              command:
                - bash
                - -c
                - |
                  ray status --address $RAY_ADDRESS > /dev/null 2>&1
                  # Check model health
                  curl --silent --max-time 5 --fail-with-body http://localhost:8080/health
          ports:
            - containerPort: 8080
              name: http
              protocol: TCP
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 12Gi
      workerSpec:
        pipelineParallelSize: 2
        tensorParallelSize: 1
        containers:
          # Ray worker node: waits for the head's Global Control Service (GCS),
          # then joins the Ray cluster and blocks for the pod's lifetime.
          - name: worker-container
            image: $(vllm-image)
            command: ["bash", "-c"]
            # NOTE(review): HEAD_SVC is not defined in this file — presumably
            # injected by the KServe controller; verify against the controller.
            args:
              - |
                SECONDS=0
                while true; do
                  if (( SECONDS <= 120 )); then
                    if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" > /dev/null 2>&1; then
                      echo "Global Control Service(GCS) is ready."
                      break
                    fi
                    echo "$SECONDS seconds elapsed: Waiting for Global Control Service(GCS) to be ready."
                  else
                    if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"; then
                      echo "Global Control Service(GCS) is ready. Any error messages above can be safely ignored."
                      break
                    fi
                    echo "$SECONDS seconds elapsed: Still waiting for Global Control Service(GCS) to be ready."
                    echo "For troubleshooting, refer to the FAQ at https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/troubleshooting.html#kuberay-troubleshootin-guides"
                  fi
                  sleep 5
                done
                export RAY_HEAD_ADDRESS="${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"
                echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..."
                ray start --address="$RAY_HEAD_ADDRESS" --block
            env:
              - name: POD_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.name
              - name: POD_NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
            resources:
              limits:
                cpu: "16"
                memory: 48Gi
              requests:
                cpu: "8"
            volumeMounts:
              - name: shm
                mountPath: /dev/shm
            # Liveness: unhealthy when the cluster no longer has the expected node count.
            livenessProbe:
              failureThreshold: 2
              periodSeconds: 5
              successThreshold: 1
              timeoutSeconds: 15
              exec:
                command:
                  - bash
                  - -c
                  - |
                    # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE
                    registered_node_count=$(ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 | grep -c node_)
                    if [[ $registered_node_count -ne "$PIPELINE_PARALLEL_SIZE" ]]; then
                      echo "Liveness Probe: Unhealthy - Registered nodes count ($registered_node_count) does not match PIPELINE_PARALLEL_SIZE ($PIPELINE_PARALLEL_SIZE)."
                      exit 1
                    fi
            # Startup: succeeds once the head's Ray GCS answers ray status.
            startupProbe:
              failureThreshold: 12
              periodSeconds: 5
              successThreshold: 1
              timeoutSeconds: 30
              exec:
                command:
                  - /bin/sh
                  - -c
                  - |
                    ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 > /dev/null 2>&1
        volumes:
          - name: shm
            emptyDir:
              medium: Memory
              sizeLimit: 12Gi

0 comments on commit 36ca97c

Please sign in to comment.