Create Fake Node Deployments based on Configmap content
gshaibi committed Mar 28, 2024
1 parent 5a25960 commit 86b2c12
Showing 14 changed files with 516 additions and 294 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM golang:1.18.2 as common-builder
FROM golang:1.21 as common-builder
WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
COPY go.mod .
COPY go.sum .
4 changes: 4 additions & 0 deletions cmd/status-updater/main.go
@@ -2,10 +2,14 @@ package main

import (
    "github.com/run-ai/fake-gpu-operator/internal/common/app"
    "github.com/run-ai/fake-gpu-operator/internal/common/config"
    status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"
)

func main() {
    requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_NODE_DEPLOYMENTS_PATH"}
    config.ValidateConfig(requiredEnvVars)

    appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})
    appRunner.Run()
}
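
With this change the status-updater refuses to start unless FAKE_NODE_DEPLOYMENTS_PATH is set alongside the topology ConfigMap variables. The internal/common/config package itself is not part of this diff, so the following is only a minimal sketch, assuming ValidateConfig simply fails fast on missing environment variables:

```go
// Hypothetical sketch of an env-var validator with the same signature as
// config.ValidateConfig; the real internal/common/config implementation may differ.
package config

import (
    "log"
    "os"
)

// ValidateConfig terminates the process if any required environment variable is unset.
func ValidateConfig(requiredEnvVars []string) {
    for _, name := range requiredEnvVars {
        if _, ok := os.LookupEnv(name); !ok {
            log.Fatalf("missing required environment variable: %s", name)
        }
    }
}
```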
92 changes: 92 additions & 0 deletions deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl
@@ -0,0 +1,92 @@
{{- define "fake-gpu-operator.device-plugin.metadata" }}
metadata:
  {{- if .Values.environment.openshift }}
  annotations:
    openshift.io/scc: hostmount-anyuid
  {{- end }}
  labels:
    app: device-plugin
  name: device-plugin
  namespace: {{ .Release.Namespace }}
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podSelector" }}
selector:
  matchLabels:
    app: device-plugin
    component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podTemplate.metadata" }}
metadata:
  annotations:
    checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
  labels:
    app: device-plugin
    component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podTemplate.spec.common" }}
containers:
  - image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
    imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
    resources:
      {{- toYaml .Values.devicePlugin.resources | nindent 12 }}
    env:
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: spec.nodeName
      - name: TOPOLOGY_CM_NAME
        value: topology
      - name: TOPOLOGY_CM_NAMESPACE
        value: "{{ .Release.Namespace }}"
    name: nvidia-device-plugin-ctr
    securityContext:
      privileged: true
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
      - mountPath: /runai/bin
        name: runai-bin-directory
      - mountPath: /runai/shared
        name: runai-shared-directory
      - mountPath: /var/lib/kubelet/device-plugins
        name: device-plugin
dnsPolicy: ClusterFirst
restartPolicy: Always
serviceAccountName: nvidia-device-plugin
terminationGracePeriodSeconds: 30
tolerations:
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists
imagePullSecrets:
  - name: gcr-secret
volumes:
  - hostPath:
      path: /var/lib/kubelet/device-plugins
      type: ""
    name: device-plugin
  - hostPath:
      path: /var/lib/runai/bin
      type: DirectoryOrCreate
    name: runai-bin-directory
  - hostPath:
      path: /var/lib/runai/shared
      type: DirectoryOrCreate
    name: runai-shared-directory
{{- end }}

{{- define "fake-gpu-operator.device-plugin.deployment" }}
apiVersion: apps/v1
kind: Deployment
{{- include "fake-gpu-operator.device-plugin.metadata" .}}
spec:
  replicas: 1
  {{- include "fake-gpu-operator.device-plugin.podSelector" . | nindent 2 }}
  template:
    {{- include "fake-gpu-operator.device-plugin.podTemplate.metadata" . | nindent 4 }}
    spec:
      {{- include "fake-gpu-operator.device-plugin.podTemplate.spec.common" . | nindent 6 }}
{{- end }}
73 changes: 5 additions & 68 deletions deploy/fake-gpu-operator/templates/device-plugin/daemonset.yml
@@ -1,75 +1,12 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  {{- if .Values.environment.openshift }}
  annotations:
    openshift.io/scc: hostmount-anyuid
  {{- end }}
  labels:
    app: device-plugin
  name: device-plugin
{{- include "fake-gpu-operator.device-plugin.metadata" . }}
spec:
  selector:
    matchLabels:
      app: device-plugin
      component: device-plugin
  {{- include "fake-gpu-operator.device-plugin.podSelector" . | nindent 2 }}
  template:
    metadata:
      annotations:
        checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
      labels:
        app: device-plugin
        component: device-plugin
    {{- include "fake-gpu-operator.device-plugin.podTemplate.metadata" . | nindent 4 }}
    spec:
      containers:
        - image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
          imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
          resources:
            {{- toYaml .Values.devicePlugin.resources | nindent 12 }}
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: TOPOLOGY_CM_NAME
              value: topology
            - name: TOPOLOGY_CM_NAMESPACE
              value: "{{ .Release.Namespace }}"
          imagePullPolicy: Always
          name: nvidia-device-plugin-ctr
          securityContext:
            privileged: true
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /runai/bin
              name: runai-bin-directory
            - mountPath: /runai/shared
              name: runai-shared-directory
            - mountPath: /var/lib/kubelet/device-plugins
              name: device-plugin
      dnsPolicy: ClusterFirst
      {{- include "fake-gpu-operator.device-plugin.podTemplate.spec.common" . | nindent 6 }}
      nodeSelector:
        nvidia.com/gpu.deploy.device-plugin: "true"
      restartPolicy: Always
      serviceAccountName: nvidia-device-plugin
      terminationGracePeriodSeconds: 30
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      imagePullSecrets:
        - name: gcr-secret
      volumes:
        - hostPath:
            path: /var/lib/kubelet/device-plugins
            type: ""
          name: device-plugin
        - hostPath:
            path: /var/lib/runai/bin
            type: DirectoryOrCreate
          name: runai-bin-directory
        - hostPath:
            path: /var/lib/runai/shared
            type: DirectoryOrCreate
          name: runai-shared-directory

77 changes: 77 additions & 0 deletions deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
@@ -0,0 +1,77 @@
{{- define "fake-gpu-operator.status-exporter.metadata" }}
metadata:
  labels:
    app: nvidia-dcgm-exporter
    component: status-exporter
    app.kubernetes.io/name: nvidia-container-toolkit
  name: nvidia-dcgm-exporter
  namespace: {{ .Release.Namespace }}
{{- end }}

{{- define "fake-gpu-operator.status-exporter.podSelector" }}
selector:
  matchLabels:
    app: nvidia-dcgm-exporter
{{- end }}

{{- define "fake-gpu-operator.status-exporter.podTemplate.metadata" }}
metadata:
  creationTimestamp: null
  labels:
    app: nvidia-dcgm-exporter
    app.kubernetes.io/name: nvidia-container-toolkit
{{- end }}

{{- define "fake-gpu-operator.status-exporter.podTemplate.spec.common" }}
containers:
  - image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
    imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
    resources:
      {{- toYaml .Values.statusExporter.resources | nindent 8 }}
    name: nvidia-dcgm-exporter
    env:
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: spec.nodeName
      - name: TOPOLOGY_CM_NAME
        value: topology
      - name: TOPOLOGY_CM_NAMESPACE
        value: "{{ .Release.Namespace }}"
      - name: TOPOLOGY_MAX_EXPORT_INTERVAL
        value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
    ports:
      - containerPort: 9400
        name: http
    volumeMounts:
      - mountPath: /runai/proc
        name: runai-proc-directory
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
serviceAccountName: status-exporter
tolerations:
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists
imagePullSecrets:
  - name: gcr-secret
volumes:
  - name: runai-proc-directory
    hostPath:
      path: /var/lib/runai/proc
      type: DirectoryOrCreate
{{- end }}

{{- define "fake-gpu-operator.status-exporter.deployment" }}
apiVersion: apps/v1
kind: Deployment
{{- include "fake-gpu-operator.status-exporter.metadata" .}}
spec:
  replicas: 1
  {{- include "fake-gpu-operator.status-exporter.podSelector" . | nindent 2 }}
  template:
    {{- include "fake-gpu-operator.status-exporter.podTemplate.metadata" . | nindent 4 }}
    spec:
      {{- include "fake-gpu-operator.status-exporter.podTemplate.spec.common" . | nindent 6 }}
{{- end }}
65 changes: 5 additions & 60 deletions deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml
@@ -1,66 +1,11 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: nvidia-dcgm-exporter
    component: status-exporter
    # this label would make the deployment pod to mimic the container-toolkit, on top of mimicking the dcgm-exporter.
    app.kubernetes.io/name: nvidia-container-toolkit
  name: nvidia-dcgm-exporter

{{- include "fake-gpu-operator.status-exporter.metadata" . }}
spec:
  selector:
    matchLabels:
      app: nvidia-dcgm-exporter
  {{- include "fake-gpu-operator.status-exporter.podSelector" . | nindent 2 }}
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: nvidia-dcgm-exporter
        app.kubernetes.io/name: nvidia-container-toolkit
    {{- include "fake-gpu-operator.status-exporter.podTemplate.metadata" . | nindent 4 }}
    spec:
      containers:
        - image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
          imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
          resources:
            {{- toYaml .Values.statusExporter.resources | nindent 12 }}
          name: nvidia-dcgm-exporter
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: TOPOLOGY_CM_NAME
              value: topology
            - name: TOPOLOGY_CM_NAMESPACE
              value: "{{ .Release.Namespace }}"
            - name: TOPOLOGY_MAX_EXPORT_INTERVAL
              value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
          ports:
            - containerPort: 9400
              name: http
          volumeMounts:
            - mountPath: /runai/proc
              name: runai-proc-directory
      {{- include "fake-gpu-operator.status-exporter.podTemplate.spec.common" . | nindent 6 }}
      nodeSelector:
        nvidia.com/gpu.deploy.dcgm-exporter: "true"
      restartPolicy: Always
      schedulerName: default-scheduler
      serviceAccount: status-exporter
      serviceAccountName: status-exporter
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      imagePullSecrets:
        - name: gcr-secret
      volumes:
        - name: runai-proc-directory
          hostPath:
            path: /var/lib/runai/proc
            type: DirectoryOrCreate
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
        nvidia.com/gpu.deploy.dcgm-exporter: "true"
@@ -0,0 +1,9 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: fake-node-deployments
data:
  status-exporter.yaml: |
    {{- include "fake-gpu-operator.status-exporter.deployment" . | nindent 8 }}
  device-plugin.yaml: |
    {{- include "fake-gpu-operator.device-plugin.deployment" . | nindent 8 }}
@@ -29,7 +29,16 @@ spec:
              value: topology
            - name: TOPOLOGY_CM_NAMESPACE
              value: "{{ .Release.Namespace }}"
            - name: FAKE_NODE_DEPLOYMENTS_PATH
              value: /var/lib/status-updater/fake-node-deployments
          volumeMounts:
            - name: fake-node-deployments
              mountPath: /var/lib/status-updater/fake-node-deployments
      restartPolicy: Always
      serviceAccountName: status-updater
      imagePullSecrets:
        - name: gcr-secret
      volumes:
        - name: fake-node-deployments
          configMap:
            name: fake-node-deployments
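
Taken together, the chart renders the device-plugin and status-exporter Deployment definitions into the fake-node-deployments ConfigMap and mounts it into the status-updater at FAKE_NODE_DEPLOYMENTS_PATH. The status-updater code that consumes these manifests is in files not expanded above, so the following is only a rough sketch of how the mounted manifests could be read and stamped out per fake node; the function name createFakeNodeDeployments and the node-name suffix are illustrative assumptions, not the actual internal/status-updater implementation:

```go
// Illustrative sketch only: read the Deployment manifests mounted from the
// fake-node-deployments ConfigMap and create one Deployment per fake node.
package main

import (
    "context"
    "fmt"
    "os"
    "path/filepath"

    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "sigs.k8s.io/yaml"
)

func createFakeNodeDeployments(ctx context.Context, client kubernetes.Interface, nodeName string) error {
    dir := os.Getenv("FAKE_NODE_DEPLOYMENTS_PATH")
    entries, err := os.ReadDir(dir)
    if err != nil {
        return fmt.Errorf("reading deployment templates: %w", err)
    }
    for _, entry := range entries {
        // ConfigMap volumes also contain hidden bookkeeping entries; only parse the YAML files.
        if entry.IsDir() || filepath.Ext(entry.Name()) != ".yaml" {
            continue
        }
        data, err := os.ReadFile(filepath.Join(dir, entry.Name()))
        if err != nil {
            return err
        }
        var dep appsv1.Deployment
        if err := yaml.Unmarshal(data, &dep); err != nil {
            return fmt.Errorf("decoding %s: %w", entry.Name(), err)
        }
        // Suffix the name so every fake node gets its own copy of the component.
        dep.Name = fmt.Sprintf("%s-%s", dep.Name, nodeName)
        if _, err := client.AppsV1().Deployments(dep.Namespace).Create(ctx, &dep, metav1.CreateOptions{}); err != nil {
            return err
        }
    }
    return nil
}
```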