-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Cherry-Pick] vLLM multi-node template (#292)
* [RHOAIENG-15386] add a new servingruntime vllm-multinode-template (#284) * add a new servingruntime vllm-multinode-template Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> * follow up comments Signed-off-by: jooho lee <[email protected]> --------- Signed-off-by: jooho lee <[email protected]> * Fix ServingRuntime args (#291) * fix runtime args Signed-off-by: jooho lee <[email protected]> * add disable_custom_all_reduce solving vllm-8735 issue Signed-off-by: jooho lee <[email protected]> --------- Signed-off-by: jooho lee <[email protected]> --------- Signed-off-by: jooho lee <[email protected]>
- Loading branch information
Showing
3 changed files
with
251 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,243 @@ | ||
# OpenShift Template that deploys a KServe ServingRuntime for multi-node vLLM
# inference. The head container starts a Ray head node and the vLLM OpenAI API
# server; workerSpec containers join the Ray cluster as workers.
# NOTE(review): reconstructed from a scrape that stripped indentation and
# appended diff-table artifacts — field values are preserved byte-for-byte.
apiVersion: template.openshift.io/v1
kind: Template
metadata:
  labels:
    opendatahub.io/dashboard: "true"
    opendatahub.io/ootb: "true"
  annotations:
    description: vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs
    openshift.io/display-name: vLLM ServingRuntime Multi Node for KServe
    openshift.io/provider-display-name: Red Hat, Inc.
    tags: rhods,rhoai,kserve,servingruntime,multi-node
    template.openshift.io/documentation-url: https://github.com/opendatahub-io/vllm
    template.openshift.io/long-description: This template defines resources needed to deploy vLLM ServingRuntime Multi-Node with KServe in Red Hat OpenShift AI
    opendatahub.io/modelServingSupport: '["single"]'
    opendatahub.io/apiProtocol: "REST"
  name: vllm-multinode-runtime-template
objects:
  - apiVersion: serving.kserve.io/v1alpha1
    kind: ServingRuntime
    metadata:
      name: vllm-multinode-runtime
      annotations:
        openshift.io/display-name: vLLM ServingRuntime for KServe
        opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
      labels:
        opendatahub.io/dashboard: "true"
    spec:
      annotations:
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
      multiModel: false
      supportedModelFormats:
        - autoSelect: true
          name: vLLM
          priority: 2
      containers:
        # Head node: starts the Ray head, waits for all workers to register,
        # then execs the vLLM OpenAI-compatible API server.
        - name: kserve-container
          image: $(vllm-image)
          command: ["bash", "-c"]
          args:
            - |
              ray start --head --disable-usage-stats --include-dashboard false
              # wait for other node to join
              until [[ $(ray status --address $RAY_ADDRESS | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do
                echo "Waiting..."
                sleep 1
              done
              ray status

              export SERVED_MODEL_NAME=${MODEL_NAME}
              export MODEL_NAME=${MODEL_DIR}

              exec python3 -m vllm.entrypoints.openai.api_server --port=8080 --distributed-executor-backend ray --model=${MODEL_NAME} --served-model-name=${SERVED_MODEL_NAME} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE} --disable_custom_all_reduce
          env:
            - name: RAY_PORT
              value: "6379"
            - name: RAY_ADDRESS
              value: 127.0.0.1:6379
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: VLLM_NO_USAGE_STATS
              value: "1"
            # HOME/HF_HOME point at /tmp so the container can run with a
            # read-only or non-root filesystem.
            - name: HOME
              value: /tmp
            - name: HF_HOME
              value: /tmp/hf_home
          resources:
            limits:
              cpu: "16"
              memory: 48Gi
            requests:
              cpu: "8"
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
          # Liveness: unhealthy when Ray reports no GPUs or when used GPUs
          # diverge from the reserved count.
          livenessProbe:
            failureThreshold: 2
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 15
            exec:
              command:
                - bash
                - -c
                - |
                  # Check if the registered ray nodes count is the same as PIPELINE_PARALLEL_SIZE
                  gpu_status=$(ray status --address $RAY_ADDRESS | grep GPU)
                  if [[ -z $gpu_status ]]; then
                    echo "$1: GPU does not exist"
                    exit 1
                  fi

                  used_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f1)
                  reserved_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f2)

                  # Determine health status based on GPU usage
                  if [[ "$used_gpu" != "$reserved_gpu" ]]; then
                    echo "$1: Unhealthy - Used: $used_gpu, Reserved: $reserved_gpu"
                    exit 1
                  fi
          # Readiness: all workers registered, GPUs fully allocated, and the
          # vLLM HTTP health endpoint responding.
          readinessProbe:
            failureThreshold: 3
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 15
            exec:
              command:
                - bash
                - -c
                - |
                  # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE
                  registered_node_count=$(ray status --address $RAY_ADDRESS | grep -c node_)
                  if [[ $registered_node_count -ne "$PIPELINE_PARALLEL_SIZE" ]]; then
                    echo "Readiness Probe: Unhealthy - Registered nodes count ($registered_node_count) does not match PIPELINE_PARALLEL_SIZE ($PIPELINE_PARALLEL_SIZE)."
                    exit 1
                  fi

                  # Check if the registered ray nodes count is the same as PIPELINE_PARALLEL_SIZE
                  gpu_status=$(ray status --address $RAY_ADDRESS | grep GPU)
                  if [[ -z $gpu_status ]]; then
                    echo "$1: GPU does not exist"
                    exit 1
                  fi

                  used_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f1)
                  reserved_gpu=$(echo "$gpu_status" | awk '{print $1}' | cut -d'/' -f2)

                  # Determine health status based on GPU usage
                  if [[ "$used_gpu" != "$reserved_gpu" ]]; then
                    echo "$1: Unhealthy - Used: $used_gpu, Reserved: $reserved_gpu"
                    exit 1
                  fi

                  # Check model health
                  curl --silent --max-time 5 --fail-with-body http://localhost:8080/health
          startupProbe:
            failureThreshold: 30
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 30
            exec:
              command:
                - bash
                - -c
                - |
                  ray status --address $RAY_ADDRESS > /dev/null 2>&1

                  # Check model health
                  curl --silent --max-time 5 --fail-with-body http://localhost:8080/health
          ports:
            - containerPort: 8080
              name: http
              protocol: TCP
      volumes:
        # Ray and NCCL use /dev/shm for inter-process communication; back it
        # with memory and cap it.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 12Gi
      # Worker pods: wait for the head's Global Control Service (GCS), then
      # join the Ray cluster and block.
      workerSpec:
        pipelineParallelSize: 2
        tensorParallelSize: 1
        containers:
          - name: worker-container
            image: $(vllm-image)
            command: ["bash", "-c"]
            args:
              - |
                SECONDS=0

                while true; do
                  if (( SECONDS <= 120 )); then
                    if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" > /dev/null 2>&1; then
                      echo "Global Control Service(GCS) is ready."
                      break
                    fi
                    echo "$SECONDS seconds elapsed: Waiting for Global Control Service(GCS) to be ready."
                  else
                    if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"; then
                      echo "Global Control Service(GCS) is ready. Any error messages above can be safely ignored."
                      break
                    fi
                    echo "$SECONDS seconds elapsed: Still waiting for Global Control Service(GCS) to be ready."
                    echo "For troubleshooting, refer to the FAQ at https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/troubleshooting.html#kuberay-troubleshootin-guides"
                  fi

                  sleep 5
                done

                export RAY_HEAD_ADDRESS="${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"
                echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..."
                ray start --address="$RAY_HEAD_ADDRESS" --block
            env:
              - name: POD_NAME
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.name
              - name: POD_NAMESPACE
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.namespace
            resources:
              limits:
                cpu: "16"
                memory: 48Gi
              requests:
                cpu: "8"
            volumeMounts:
              - name: shm
                mountPath: /dev/shm
            livenessProbe:
              failureThreshold: 2
              periodSeconds: 5
              successThreshold: 1
              timeoutSeconds: 15
              exec:
                command:
                  - bash
                  - -c
                  - |
                    # Check if the registered nodes count matches PIPELINE_PARALLEL_SIZE
                    registered_node_count=$(ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 | grep -c node_)
                    if [[ $registered_node_count -ne "$PIPELINE_PARALLEL_SIZE" ]]; then
                      echo "Readiness Probe: Unhealthy - Registered nodes count ($registered_node_count) does not match PIPELINE_PARALLEL_SIZE ($PIPELINE_PARALLEL_SIZE)."
                      exit 1
                    fi
            startupProbe:
              failureThreshold: 12
              periodSeconds: 5
              successThreshold: 1
              timeoutSeconds: 30
              exec:
                command:
                  - /bin/sh
                  - -c
                  - |
                    ray status --address ${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379 > /dev/null 2>&1
        volumes:
          - name: shm
            emptyDir:
              medium: Memory
              sizeLimit: 12Gi