diff --git a/kubernetes_yaml/mixtral_serve/README.md b/kubernetes_yaml/mixtral_serve/README.md
new file mode 100644
index 00000000..6813db01
--- /dev/null
+++ b/kubernetes_yaml/mixtral_serve/README.md
@@ -0,0 +1,127 @@
+## Mixtral serving
+Mixtral is required in several places in the InstructLab process. The following describes how to serve Mixtral together with its LoRA adapters.
+
+### Secret
+Because we need to run oras inside a container to download the various artifacts, we must provide a .dockerconfigjson to the Kubernetes job with authentication for registry.redhat.io.
+It is suggested to use a registry service account; one can be created at https://access.redhat.com/terms-based-registry/accounts.
+
+Create a secret based on the service account.
+
+secret.yaml
+
+```
+apiVersion: v1
+kind: Secret
+metadata:
+  name: 7033380-ilab-pull-secret
+data:
+  .dockerconfigjson: sadfassdfsadfasdfasdfasdfasdfasdfasdf=
+type: kubernetes.io/dockerconfigjson
+```
+
+Create the secret:
+
+```
+oc create -f secret.yaml
+```
+
+### Kubernetes Job
+Depending on the name of your secret, the file `./mixtral_pull/pull_kube_job.yaml` will need to be modified so that `secretName` matches.
+
+```
+...redacted...
+      - name: docker-config
+        secret:
+          secretName: 7033380-ilab-pull-secret
+...redacted...
+```
+
+With the secretName now reflecting your secret, the job (and the PVC it writes to) can be launched:
+
+```
+kubectl create -f ./mixtral_pull
+```
+
+This creates a job with three containers that use oras to pull the Mixtral model and the skills and knowledge LoRA adapters onto the PVC.
+
+### Mixtral serving
+This step is awkward, but it is the only way discovered so far to ensure that a token is generated for working with the model. Using the RHOAI model serving UI, define a model to be served named mixtral. Ensure external access and token authentication are selected, as the token is the piece we have not yet been able to generate from the CLI alone.
+
+We will now use the PVC from the previous step to serve the model and replace the runtime defined in the UI:
+
+```
+kubectl apply -f ./mixtral_serve/runtime.yaml
+```
+
+Edit the inference service and replace the entire spec field with the one from ./mixtral_serve/inference.yaml:
+
+```
+oc edit inferenceservice mixtral
+```
+
+```
+spec:
+  predictor:
+    maxReplicas: 1
+    minReplicas: 1
+    model:
+      args:
+      - --dtype=bfloat16
+      - --tensor-parallel-size=4
+      - --enable-lora
+      - --max-lora-rank=64
+      - --lora-dtype=bfloat16
+      - --fully-sharded-loras
+      - --lora-modules
+      - skill-classifier-v3-clm=/mnt/skills
+      - text-classifier-knowledge-v3-clm=/mnt/knowledge
+      modelFormat:
+        name: vLLM
+      name: ""
+      resources:
+        limits:
+          cpu: "4"
+          memory: 40Gi
+          nvidia.com/gpu: "4"
+        requests:
+          cpu: "4"
+          memory: 40Gi
+          nvidia.com/gpu: "4"
+      runtime: mixtral
+    tolerations:
+    - effect: NoSchedule
+      key: nvidia.com/gpu
+      operator: Exists
+```
+
+Follow the log of the kserve-container and wait for the following log entries:
+
+```
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit)
+```
+
+### Testing
+To interact with the model you need the inference endpoint, shown in the RHOAI UI, and a token.
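+
+The endpoint can usually also be read from the InferenceService status on the CLI. The command below is a sketch and assumes the InferenceService is named mixtral in the labels namespace, matching ./mixtral_serve/inference.yaml:
+
+```
+# Print the external URL KServe reports for the (assumed) mixtral InferenceService in the labels namespace
+oc get inferenceservice mixtral -n labels -o jsonpath='{.status.url}'
+```
+
+The token can be pulled from the secret created for the model's serving service account: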
+
+```
+oc get secret -o yaml default-name-mixtral-sa | grep token: | awk -F: '{print $2}' | tr -d ' ' | base64 -d
+```
+
+Export that value as a variable named TOKEN:
+
+```
+export TOKEN=BLOBOFLETTERSANDNUMBERS
+```
+
+Using curl you can verify that the model is accepting connections:
+```
+curl -X POST "https://mixtral-labels.apps.hulk.octo-emerging.redhataicoe.com/v1/completions" -H "Authorization: Bearer $TOKEN" \
+  -H "Content-Type: application/json" -d '{"model": "mixtral", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0 }'
+
+{"id":"cmpl-ecd5bd72a947438b805e25134bbdf636","object":"text_completion","created":1730231625,"model":"mixtral","choices":[{"index":0,"text":" city that is known for its steep","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":5,"total_tokens":12,"completion_tokens":7}}
+```
diff --git a/kubernetes_yaml/mixtral_serve/mixtral_pull/pull_kube_job.yaml b/kubernetes_yaml/mixtral_serve/mixtral_pull/pull_kube_job.yaml
new file mode 100644
index 00000000..e04c6653
--- /dev/null
+++ b/kubernetes_yaml/mixtral_serve/mixtral_pull/pull_kube_job.yaml
@@ -0,0 +1,44 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: oras-copy-job
+spec:
+  template:
+    spec:
+      containers:
+      - name: oras-copy-knowledge
+        image: ghcr.io/oras-project/oras:v1.2.0
+        command: ["oras", "pull", "registry.redhat.io/rhelai1/knowledge-adapter-v3:1.2-1728663941", "--output", "/mnt/knowledge", "--registry-config", "/workspace/.docker"]
+        volumeMounts:
+        - name: docker-config
+          mountPath: /workspace/.docker
+          subPath: .dockerconfigjson # Mount the pull secret's .dockerconfigjson as the oras registry config
+        - name: model-pvc
+          mountPath: /mnt
+      - name: oras-copy-skills
+        image: ghcr.io/oras-project/oras:v1.2.0
+        command: ["oras", "pull", "registry.redhat.io/rhelai1/skills-adapter-v3:1.2-1728663941", "--output", "/mnt/skills", "--registry-config", "/workspace/.docker"]
+        volumeMounts:
+        - name: docker-config
+          mountPath: /workspace/.docker
+          subPath: .dockerconfigjson # Mount the pull secret's .dockerconfigjson as the oras registry config
+        - name: model-pvc
+          mountPath: /mnt
+      - name: oras-copy-model
+        image: ghcr.io/oras-project/oras:v1.2.0
+        command: ["oras", "pull", "registry.redhat.io/rhelai1/mixtral-8x7b-instruct-v0-1:1.2-1728663941", "--output", "/mnt/model", "--registry-config", "/workspace/.docker"]
+        volumeMounts:
+        - name: docker-config
+          mountPath: /workspace/.docker
+          subPath: .dockerconfigjson # Mount the pull secret's .dockerconfigjson as the oras registry config
+        - name: model-pvc
+          mountPath: /mnt
+      restartPolicy: Never
+      volumes:
+      - name: model-pvc
+        persistentVolumeClaim:
+          claimName: mixtral-serving-ilab
+      - name: docker-config
+        secret:
+          secretName: 7033380-ilab-pull-secret
+  backoffLimit: 4
diff --git a/kubernetes_yaml/mixtral_serve/mixtral_pull/pvc.yaml b/kubernetes_yaml/mixtral_serve/mixtral_pull/pvc.yaml
new file mode 100644
index 00000000..c7ad9fdd
--- /dev/null
+++ b/kubernetes_yaml/mixtral_serve/mixtral_pull/pvc.yaml
@@ -0,0 +1,11 @@
+## PVC to be used for model storage
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: mixtral-serving-ilab
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 500Gi
diff --git a/kubernetes_yaml/mixtral_serve/mixtral_serve.yaml b/kubernetes_yaml/mixtral_serve/mixtral_serve.yaml
deleted file mode 100644
index 378be810..00000000
--- a/kubernetes_yaml/mixtral_serve/mixtral_serve.yaml
+++ /dev/null
@@ -1,91 +0,0 @@
-apiVersion: v1
-items:
-- apiVersion: serving.kserve.io/v1alpha1
-  kind: ServingRuntime
-  metadata:
-    labels:
-      opendatahub.io/dashboard: "true"
-    name: mixtral-8x7b-vllm-servingruntime
-    namespace: mixtral-serve
-  spec:
-    builtInAdapter:
-      modelLoadingTimeoutMillis: 90000
-    containers:
-    - args:
-      - --model=/mnt/models/
-      - --download-dir=/models-cache
-      - --port=8080
-      image: quay.io/modh/vllm@sha256:2e7f97b69d6e0aa7366ee6a841a7e709829136a143608bee859b1fe700c36d31
-      name: kserve-container
-      ports:
-      - containerPort: 8080
-        name: http1
-        protocol: TCP
-      volumeMounts:
-      - mountPath: /home/vllm
-        name: home
-      - mountPath: /.cache
-        name: cache
-      - mountPath: /.config
-        name: config
-    multiModel: false
-    supportedModelFormats:
-    - autoSelect: true
-      name: pytorch
-    volumes:
-    - emptyDir: {}
-      name: home
-    - emptyDir: {}
-      name: cache
-    - emptyDir: {}
-      name: config
-- apiVersion: serving.kserve.io/v1beta1
-  kind: InferenceService
-  metadata:
-    annotations:
-      serving.kserve.io/deploymentMode: RawDeployment
-      serving.kserve.io/enable-prometheus-scraping: "true"
-    labels:
-      opendatahub.io/dashboard: "true"
-    name: mixtral-8x7b-isvc
-    namespace: mixtral-serve
-  spec:
-    predictor:
-      minReplicas: 1
-      model:
-        args:
-        - --dtype=bfloat16
-        - --tensor-parallel-size=4
-        - --max-model-len=4096
-        modelFormat:
-          name: pytorch
-        resources:
-          limits:
-            nvidia.com/gpu: "4"
-          requests:
-            cpu: "8"
-            memory: 40Gi
-            nvidia.com/gpu: "4"
-        runtime: mixtral-8x7b-vllm-servingruntime
-        storage:
-          key: aws-connection-odf
-          path: models/mixtral-8x7b-v0.1/
-        volumeMounts:
-        - mountPath: /dev/shm
-          name: shared-memory
-        - mountPath: /tmp
-          name: tmp
-        - mountPath: /home/vllm
-          name: home
-      serviceAccountName: default
-      volumes:
-      - emptyDir:
-          medium: Memory
-          sizeLimit: 16Gi
-        name: shared-memory
-      - emptyDir: {}
-        name: tmp
-      - emptyDir: {}
-        name: home
-kind: List
-metadata: {}
diff --git a/kubernetes_yaml/mixtral_serve/mixtral_serve/inference.yaml b/kubernetes_yaml/mixtral_serve/mixtral_serve/inference.yaml
new file mode 100644
index 00000000..8465a1f1
--- /dev/null
+++ b/kubernetes_yaml/mixtral_serve/mixtral_serve/inference.yaml
@@ -0,0 +1,51 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  annotations:
+    openshift.io/display-name: mixtral
+    security.opendatahub.io/enable-auth: "true"
+    serving.knative.openshift.io/enablePassthrough: "true"
+    sidecar.istio.io/inject: "true"
+    sidecar.istio.io/rewriteAppHTTPProbers: "true"
+  creationTimestamp: "2024-10-29T19:14:46Z"
+  finalizers:
+  - inferenceservice.finalizers
+  generation: 5
+  labels:
+    opendatahub.io/dashboard: "true"
+  name: mixtral
+  namespace: labels
+  resourceVersion: "8869282"
+  uid: 433d76da-6c52-4b47-a3cd-ba3765e7b5bf
+spec:
+  predictor:
+    maxReplicas: 1
+    minReplicas: 1
+    model:
+      args:
+      - --dtype=bfloat16
+      - --tensor-parallel-size=4
+      - --enable-lora
+      - --max-lora-rank=64
+      - --lora-dtype=bfloat16
+      - --fully-sharded-loras
+      - --lora-modules
+      - skill-classifier-v3-clm=/mnt/skills
+      - text-classifier-knowledge-v3-clm=/mnt/knowledge
+      modelFormat:
+        name: vLLM
+      name: ""
+      resources:
+        limits:
+          cpu: "4"
+          memory: 40Gi
+          nvidia.com/gpu: "4"
+        requests:
+          cpu: "4"
+          memory: 40Gi
+          nvidia.com/gpu: "4"
+      runtime: mixtral
+    tolerations:
+    - effect: NoSchedule
+      key: nvidia.com/gpu
+      operator: Exists
diff --git a/kubernetes_yaml/mixtral_serve/mixtral_serve/runtime.yaml b/kubernetes_yaml/mixtral_serve/mixtral_serve/runtime.yaml
new file mode 100644
index 00000000..fdb32e6b
--- /dev/null
+++ b/kubernetes_yaml/mixtral_serve/mixtral_serve/runtime.yaml
@@ -0,0 +1,57 @@
+apiVersion: serving.kserve.io/v1alpha1
+kind: ServingRuntime
+metadata:
+  annotations:
+    opendatahub.io/accelerator-name: migrated-gpu
+    opendatahub.io/apiProtocol: REST
+    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
+    opendatahub.io/template-display-name: vLLM ServingRuntime for KServe
+    opendatahub.io/template-name: vllm-runtime
+    openshift.io/display-name: mixtral
+  creationTimestamp: "2024-10-25T15:59:12Z"
+  generation: 3
+  labels:
+    opendatahub.io/dashboard: "true"
+  name: mixtral
+  namespace: labels
+spec:
+  annotations:
+    prometheus.io/path: /metrics
+    prometheus.io/port: "8080"
+  containers:
+  - args:
+    - --port=8080
+    - --model=/mnt/model
+    - --served-model-name={{.Name}}
+    - --distributed-executor-backend=mp
+    command:
+    - python
+    - -m
+    - vllm.entrypoints.openai.api_server
+    env:
+    - name: HF_HOME
+      value: /tmp/hf_home
+    image: quay.io/modh/vllm@sha256:3c56d4c2a5a9565e8b07ba17a6624290c4fb39ac9097b99b946326c09a8b40c8
+    name: kserve-container
+    ports:
+    - containerPort: 8080
+      protocol: TCP
+    volumeMounts:
+    - mountPath: /dev/shm
+      name: shm
+    - mountPath: /mnt
+      name: mixtral-serve
+  multiModel: false
+  storageHelper:
+    disabled: true
+  supportedModelFormats:
+  - autoSelect: true
+    name: vLLM
+  volumes:
+  - name: mixtral-serve
+    persistentVolumeClaim:
+      claimName: mixtral-serving-ilab
+  - emptyDir:
+      medium: Memory
+      sizeLimit: 2Gi
+    name: shm