-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Cherry-Pick] vLLM multi-node template (#292)
* [RHOAIENG-15386] add a new servingruntime vllm-multinode-template (#284) * add a new servingruntime vllm-multinode-template Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> --------- Signed-off-by: jooho lee <[email protected]> * Fix ServingRuntime args (#291) * fix runtime args Signed-off-by: jooho lee <[email protected]> * add disable_custom_all_reduce solving vllm-8735 issue Signed-off-by: jooho lee <[email protected]> --------- Signed-off-by: jooho lee <[email protected]> --------- Signed-off-by: jooho lee <[email protected]>
- Loading branch information
Showing
3 changed files
with
251 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,243 @@ | ||
# OpenShift Template that deploys a KServe ServingRuntime for multi-node vLLM
# inference. The head container starts a Ray head node and the vLLM OpenAI API
# server; workerSpec containers join the Ray cluster as workers.
# NOTE(review): reconstructed from a scrape that stripped indentation and
# appended diff-table artifacts — field values are preserved byte-for-byte.
apiVersion: template.openshift.io/v1
kind: Template
metadata:
  labels:
    opendatahub.io/dashboard: "true"
    opendatahub.io/ootb: "true"
  annotations:
    description: vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs
    openshift.io/display-name: vLLM ServingRuntime Multi Node for KServe
    openshift.io/provider-display-name: Red Hat, Inc.
    tags: rhods,rhoai,kserve,servingruntime,multi-node
    template.openshift.io/documentation-url: https://github.com/opendatahub-io/vllm
    template.openshift.io/long-description: This template defines resources needed to deploy vLLM ServingRuntime Multi-Node with KServe in Red Hat OpenShift AI
    opendatahub.io/modelServingSupport: '["single"]'
    opendatahub.io/apiProtocol: "REST"
  name: vllm-multinode-runtime-template
objects:
  - apiVersion: serving.kserve.io/v1alpha1
    kind: ServingRuntime
    metadata:
      name: vllm-multinode-runtime
      annotations:
        openshift.io/display-name: vLLM ServingRuntime for KServe
        opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
      labels:
        opendatahub.io/dashboard: "true"
    spec:
      annotations:
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
      multiModel: false
      supportedModelFormats:
        - autoSelect: true
          name: vLLM
          priority: 2
      containers:
        # Head node: starts the Ray head, waits for all workers to register,
        # then execs the vLLM OpenAI-compatible API server.
        - name: kserve-container
          image: $(vllm-image)
          command: ["bash", "-c"]
          args:
            - |
              ray start --head --disable-usage-stats --include-dashboard false
              # wait for other node to join
              until [[ $(ray status --address $RAY_ADDRESS | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do
                echo "Waiting..."
                sleep 1
              done
              ray status

              export SERVED_MODEL_NAME=${MODEL_NAME}
              export MODEL_NAME=${MODEL_DIR}

              exec python3 -m vllm.entrypoints.openai.api_server --port=8080 --distributed-executor-backend ray --model=${MODEL_NAME} --served-model-name=${SERVED_MODEL_NAME} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE} --disable_custom_all_reduce
          env:
            - name: RAY_PORT
              value: "6379"
            - name: RAY_ADDRESS
              value: 127.0.0.1:6379
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: VLLM_NO_USAGE_STATS
              value: "1"
            # HOME/HF_HOME point at /tmp so the container can run with a
            # read-only or non-root filesystem.
            - name: HOME
              value: /tmp
            - name: HF_HOME
              value: /tmp/hf_home
          resources:
            limits:
              cpu: "16"
              memory: 48Gi
            requests:
              cpu: "8"
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
          # Liveness: unhealthy when Ray reports no GPUs or when used GPUs
          # diverge from the reserved count.
          livenessProbe:
            failureThreshold: 2
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 15
            exec:
              command:
                - bash
                - -c
                - |
                  # Check if the registered ray nodes count is the same as PIPELINE_PARALLEL_SIZE
                  gpu_status=$(ray status --address $RAY_ADDRESS | grep GPU)
                  if [[ -z $gpu_status ]]; then
                    echo "$1: GPU does not exist"
                    exit 1
                  fi

                  used_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f1)
                  reserved_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f2)

                  # Determine health status based on GPU usage
                  if [[ "$used_gpu" != "$reserved_gpu" ]]; then
                    echo "$1: Unhealthy - Used: $used_gpu, Reserved: $reserved_gpu"
                    exit 1
                  fi
          # Readiness: all workers registered, GPUs fully allocated, and the
          # vLLM HTTP health endpoint responding.
          readinessProbe:
            failureThreshold: 3
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 15
            exec:
              command:
                - bash
                - -c
                - |
                  # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE
                  registered_node_count=$(ray status --address $RAY_ADDRESS | grep -c node_)
                  if [[ $registered_node_count -ne "$PIPELINE_PARALLEL_SIZE" ]]; then
                    echo "Readiness Probe: Unhealthy - Registered nodes count ($registered_node_count) does not match PIPELINE_PARALLEL_SIZE ($PIPELINE_PARALLEL_SIZE)."
                    exit 1
                  fi

                  # Check if the registered ray nodes count is the same as PIPELINE_PARALLEL_SIZE
                  gpu_status=$(ray status --address $RAY_ADDRESS | grep GPU)
                  if [[ -z $gpu_status ]]; then
                    echo "$1: GPU does not exist"
                    exit 1
                  fi

                  used_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f1)
                  reserved_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f2)

                  # Determine health status based on GPU usage
                  if [[ "$used_gpu" != "$reserved_gpu" ]]; then
                    echo "$1: Unhealthy - Used: $used_gpu, Reserved: $reserved_gpu"
                    exit 1
                  fi

                  # Check model health
                  curl --silent --max-time 5 --fail-with-body http://localhost:8080/health
          startupProbe:
            failureThreshold: 30
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 30
            exec:
              command:
                - bash
                - -c
                - |
                  ray status --address $RAY_ADDRESS > /dev/null 2>&1

                  # Check model health
                  curl --silent --max-time 5 --fail-with-body http://localhost:8080/health
          ports:
            - containerPort: 8080
              name: http
              protocol: TCP
      volumes:
        # Ray and NCCL use /dev/shm for inter-process communication; back it
        # with memory and cap it.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 12Gi
      # Worker pods: wait for the head's Global Control Service (GCS), then
      # join the Ray cluster and block.
      workerSpec:
        pipelineParallelSize: 2
        tensorParallelSize: 1
        containers:
          - name: worker-container
            image: $(vllm-image)
            command: ["bash", "-c"]
            args:
              - |
                SECONDS=0

                while true; do
                  if (( SECONDS <= 120 )); then
                    if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" > /dev/null 2>&1; then
                      echo "Global Control Service(GCS) is ready."
                      break
                    fi
                    echo "$SECONDS seconds elapsed: Waiting for Global Control Service(GCS) to be ready."
                  else
                    if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"; then
                      echo "Global Control Service(GCS) is ready. Any error messages above can be safely ignored."
                      break
                    fi
                    echo "$SECONDS seconds elapsed: Still waiting for Global Control Service(GCS) to be ready."
                    echo "For troubleshooting, refer to the FAQ at https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/troubleshooting.html#kuberay-troubleshootin-guides"
                  fi

                  sleep 5
                done

                export RAY_HEAD_ADDRESS="${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"
                echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..."
                ray start --address="$RAY_HEAD_ADDRESS" --block
            env:
              - name: POD_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.name
              - name: POD_NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
            resources:
              limits:
                cpu: "16"
                memory: 48Gi
              requests:
                cpu: "8"
            volumeMounts:
              - name: shm
                mountPath: /dev/shm
            livenessProbe:
              failureThreshold: 2
              periodSeconds: 5
              successThreshold: 1
              timeoutSeconds: 15
              exec:
                command:
                  - bash
                  - -c
                  - |
                    # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE
                    registered_node_count=$(ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 | grep -c node_)
                    if [[ $registered_node_count -ne "$PIPELINE_PARALLEL_SIZE" ]]; then
                      echo "Readiness Probe: Unhealthy - Registered nodes count ($registered_node_count) does not match PIPELINE_PARALLEL_SIZE ($PIPELINE_PARALLEL_SIZE)."
                      exit 1
                    fi
            startupProbe:
              failureThreshold: 12
              periodSeconds: 5
              successThreshold: 1
              timeoutSeconds: 30
              exec:
                command:
                  - /bin/sh
                  - -c
                  - |
                    ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 > /dev/null 2>&1
        volumes:
          - name: shm
            emptyDir:
              medium: Memory
              sizeLimit: 12Gi