[castai-hosted-model] Add vLLM to castai-hosted-model (#729)
* Add vllm to castai-hosted-model

* Add a secret to hold HF token

* Adjust vllm arguments

* Fix naming
kiriloman authored Feb 21, 2025
1 parent 8dc24da commit 521ad86
Showing 12 changed files with 247 additions and 6 deletions.
7 changes: 5 additions & 2 deletions charts/castai-hosted-model/Chart.lock
@@ -2,5 +2,8 @@ dependencies:
 - name: ollama
   repository: https://otwld.github.io/ollama-helm/
   version: 1.4.0
-digest: sha256:f43676d88383d377fea1578c25e95567ba6a28b06281c3a8342f40ab470ad839
-generated: "2025-02-17T14:03:59.094503Z"
+- name: vllm
+  repository: file://child-charts/vllm
+  version: 0.0.1
+digest: sha256:9f68482a80f8a9f68664f3cfa33b56631ae02d06108b14475fbe88c4cbe22e02
+generated: "2025-02-18T17:20:56.187998Z"
8 changes: 6 additions & 2 deletions charts/castai-hosted-model/Chart.yaml
@@ -1,11 +1,15 @@
 apiVersion: v2
 name: castai-hosted-model
-description: CAST AI hosted model deployment chart. Currently, it's just a pass through to ollama chart helm.
+description: CAST AI hosted model deployment chart.
 type: application
-version: 0.0.3
+version: 0.0.4
 appVersion: "v0.0.1"
 dependencies:
 - name: ollama
   version: 1.4.0
   repository: https://otwld.github.io/ollama-helm/
   condition: ollama.enabled
+- name: vllm
+  version: 0.0.1
+  repository: file://child-charts/vllm
+  condition: vllm.enabled
6 changes: 4 additions & 2 deletions charts/castai-hosted-model/README.md
@@ -1,15 +1,17 @@
 # castai-hosted-model
 
-CAST AI hosted model deployment chart. Currently, it's just a pass through to ollama chart helm.
+CAST AI hosted model deployment chart.
 
 ## Requirements
 
 | Repository | Name | Version |
 |------------|------|---------|
+| file://child-charts/vllm | vllm | 0.0.1 |
 | https://otwld.github.io/ollama-helm/ | ollama | 1.4.0 |
 
 ## Values
 
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| ollama.enabled | bool | `true` | |
+| ollama.enabled | bool | `true` | |
+| vllm.enabled | bool | `false` | |
6 changes: 6 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/Chart.yaml
@@ -0,0 +1,6 @@
apiVersion: v2
name: vllm
description: CAST AI hosted model deployment chart for vLLM.
type: application
version: 0.0.1
appVersion: "v0.0.1"
25 changes: 25 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/README.md
@@ -0,0 +1,25 @@
# vllm

CAST AI hosted model deployment chart for vLLM.

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| container.port | int | `8080` | |
| image.repository | string | `"vllm/vllm-openai"` | |
| image.tag | string | `"v0.7.2"` | |
| livenessProbe | object | `{"failureThreshold":3,"httpGet":{"path":"/health"},"initialDelaySeconds":15,"periodSeconds":10}` | Liveness probe configuration |
| livenessProbe.failureThreshold | int | `3` | Number of consecutive probe failures after which Kubernetes considers the check failed and the container not alive |
| livenessProbe.httpGet | object | `{"path":"/health"}` | Configuration of the Kubelet http request on the server |
| livenessProbe.httpGet.path | string | `"/health"` | Path to access on the HTTP server |
| livenessProbe.initialDelaySeconds | int | `15` | Number of seconds after the container has started before liveness probe is initiated |
| livenessProbe.periodSeconds | int | `10` | How often (in seconds) to perform the liveness probe |
| readinessProbe | object | `{"failureThreshold":3,"httpGet":{"path":"/health"},"initialDelaySeconds":5,"periodSeconds":5}` | Readiness probe configuration |
| readinessProbe.failureThreshold | int | `3` | Number of consecutive probe failures after which Kubernetes considers the check failed and the container not ready |
| readinessProbe.httpGet | object | `{"path":"/health"}` | Configuration of the Kubelet http request on the server |
| readinessProbe.httpGet.path | string | `"/health"` | Path to access on the HTTP server |
| readinessProbe.initialDelaySeconds | int | `5` | Number of seconds after the container has started before readiness probe is initiated |
| readinessProbe.periodSeconds | int | `5` | How often (in seconds) to perform the readiness probe |
| service.port | int | `8080` | |
| service.type | string | `"ClusterIP"` | |
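
These rows are helm-docs renderings of the child chart's values.yaml; overriding any of them at install time is an ordinary values fragment, for example (the numbers below are illustrative, not recommendations):

image:
  tag: "v0.7.2"              # pin a specific vllm/vllm-openai release
readinessProbe:
  initialDelaySeconds: 15    # larger models can need longer before the first check
livenessProbe:
  failureThreshold: 5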
14 changes: 14 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/README.md.gotmpl
@@ -0,0 +1,14 @@
{{ template "chart.header" . }}
{{ template "chart.deprecationWarning" . }}

{{ template "chart.description" . }}

{{ template "chart.homepageLine" . }}

{{ template "chart.maintainersSection" . }}

{{ template "chart.sourcesSection" . }}

{{ template "chart.requirementsSection" . }}

{{ template "chart.valuesSection" . }}
33 changes: 33 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/templates/_helpers.tpl
@@ -0,0 +1,33 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "vllm.fullname" -}}
{{ .Values.service.name }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "vllm.chart" -}}
{{- printf "%s-%s" "vllm" .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "vllm.labels" -}}
helm.sh/chart: {{ include "vllm.chart" . }}
{{ include "vllm.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "vllm.selectorLabels" -}}
app.kubernetes.io/name: {{ include "vllm.fullname" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
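
For illustration only: vllm.fullname simply echoes .Values.service.name, which the child chart's values.yaml does not set, so the caller has to provide it. Assuming service.name is set to vllm and the release is installed by Helm under the hypothetical name castai-hosted-model, the vllm.labels helper above renders roughly as:

helm.sh/chart: vllm-0.0.1
app.kubernetes.io/name: vllm
app.kubernetes.io/instance: castai-hosted-model
app.kubernetes.io/version: "v0.0.1"
app.kubernetes.io/managed-by: Helm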
91 changes: 91 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/templates/deployment.yaml
@@ -0,0 +1,91 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "vllm.fullname" . }}
labels:
{{- include "vllm.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.replicaCount }}
selector:
matchLabels:
{{- include "vllm.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "vllm.labels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
containers:
- name: "vllm"
image: "{{ required "Required value 'image.repository' must be defined" .Values.image.repository }}:{{ required "Required value 'image.tag' must be defined" .Values.image.tag }}"
imagePullPolicy: IfNotPresent
          command: ["vllm", "serve"]
args:
- {{ .Values.model | quote }}
- "--trust-remote-code"
- "--gpu-memory-utilization=1"
- "--task=generate"
- "--dtype=half"
- "--kv-cache-dtype=fp8_e5m2"
- "--enable-chunked-prefill=True"
- "--max-num-batched-tokens=10000"
{{ if .Values.maxModelLen }}
- "--max-model-len={{ .Values.maxModelLen }}"
{{ end }}
{{ if .Values.tensorParallelSize }}
- "--tensor-parallel-size={{ .Values.tensorParallelSize }}"
{{ end }}
{{ if .Values.quantization }}
- "--quantization={{ .Values.quantization }}"
{{ end }}
env:
{{- if .Values.hfToken }}
- name: HUGGING_FACE_HUB_TOKEN
value: "{{ .Values.hfToken }}"
{{- else }}
{{- if .Values.secretName }}
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: {{ required "secretName must be provided" .Values.secretName }}
key: "HF_TOKEN"
{{- end }}
{{- end }}
ports:
- name: http
containerPort: {{ .Values.container.port }}
protocol: TCP
resources:
{{ toYaml .Values.resources | nindent 12 }}
livenessProbe:
httpGet:
              path: {{ .Values.livenessProbe.httpGet.path }}
port: http
initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.livenessProbe.periodSeconds }}
timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }}
successThreshold: {{ .Values.livenessProbe.successThreshold }}
failureThreshold: {{ .Values.livenessProbe.failureThreshold }}
readinessProbe:
httpGet:
              path: {{ .Values.readinessProbe.httpGet.path }}
port: http
initialDelaySeconds: {{ .Values.readinessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.readinessProbe.periodSeconds }}
timeoutSeconds: {{ .Values.readinessProbe.timeoutSeconds }}
successThreshold: {{ .Values.readinessProbe.successThreshold }}
failureThreshold: {{ .Values.readinessProbe.failureThreshold }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
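
A hedged sketch of the values this template consumes; the model id, token, and resource figures are placeholders rather than chart defaults:

model: "org/model-name"            # first positional argument to vllm serve
maxModelLen: 8192                  # optional, emits --max-model-len
tensorParallelSize: 1              # optional, emits --tensor-parallel-size
# quantization: awq                # optional, emits --quantization
hfToken: "hf_xxx"                  # injected directly as HUGGING_FACE_HUB_TOKEN
# secretName: existing-hf-secret   # alternative: reference a secret holding an HF_TOKEN key
replicaCount: 1
resources:
  limits:
    nvidia.com/gpu: 1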
10 changes: 10 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/templates/secret.yaml
@@ -0,0 +1,10 @@
{{- if .Values.hfToken }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "vllm.fullname" . }}
labels:
{{- include "vllm.labels" . | nindent 4 }}
data:
HF_TOKEN: {{ .Values.hfToken | b64enc | quote }}
{{- end }}
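
This Secret only renders when hfToken is set; when the deployment falls back to secretName instead, a pre-created secret along these lines (name and token are placeholders) satisfies the secretKeyRef:

apiVersion: v1
kind: Secret
metadata:
  name: existing-hf-secret   # pass this name via the secretName value
type: Opaque
stringData:
  HF_TOKEN: "hf_xxx"         # the key the deployment's secretKeyRef expects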
15 changes: 15 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/templates/service.yaml
@@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "vllm.fullname" . }}
labels:
{{- include "vllm.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: http
protocol: TCP
name: http
selector:
{{- include "vllm.selectorLabels" . | nindent 4 }}
36 changes: 36 additions & 0 deletions charts/castai-hosted-model/child-charts/vllm/values.yaml
@@ -0,0 +1,36 @@
image:
repository: "vllm/vllm-openai"
tag: "v0.7.2"

container:
port: 8080

service:
type: ClusterIP
port: 8080

# -- Readiness probe configuration
readinessProbe:
# -- Number of seconds after the container has started before readiness probe is initiated
initialDelaySeconds: 5
# -- How often (in seconds) to perform the readiness probe
periodSeconds: 5
  # -- Number of consecutive probe failures after which Kubernetes considers the check failed and the container not ready
failureThreshold: 3
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health

# -- Liveness probe configuration
livenessProbe:
# -- Number of seconds after the container has started before liveness probe is initiated
initialDelaySeconds: 15
  # -- Number of consecutive probe failures after which Kubernetes considers the check failed and the container not alive
failureThreshold: 3
# -- How often (in seconds) to perform the liveness probe
periodSeconds: 10
# -- Configuration of the Kubelet http request on the server
httpGet:
# -- Path to access on the HTTP server
path: /health
2 changes: 2 additions & 0 deletions charts/castai-hosted-model/values.yaml
@@ -1,2 +1,4 @@
 ollama:
   enabled: true
+vllm:
+  enabled: false
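
With both conditions wired up, switching the hosted model from ollama to vLLM is a values override along these lines (the model id is a placeholder, and service.name feeds the child chart's vllm.fullname helper):

ollama:
  enabled: false
vllm:
  enabled: true
  model: "org/model-name"
  service:
    name: vllm
  hfToken: "hf_xxx"   # or set secretName to reuse an existing secret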
