Create Fake Node Deployments based on Configmap content
gshaibi committed Mar 28, 2024
1 parent 5a25960 commit 86b2c12
Showing 14 changed files with 516 additions and 294 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM golang:1.18.2 as common-builder
FROM golang:1.21 as common-builder
WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
COPY go.mod .
COPY go.sum .
4 changes: 4 additions & 0 deletions cmd/status-updater/main.go
@@ -2,10 +2,14 @@ package main

import (
    "github.com/run-ai/fake-gpu-operator/internal/common/app"
    "github.com/run-ai/fake-gpu-operator/internal/common/config"
    status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"
)

func main() {
    requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_NODE_DEPLOYMENTS_PATH"}
    config.ValidateConfig(requiredEnvVars)

    appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})
    appRunner.Run()
}
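
With this change the status-updater refuses to start unless FAKE_NODE_DEPLOYMENTS_PATH is set alongside the topology ConfigMap variables. The internal/common/config package itself is not part of this diff, so the following is only a minimal sketch, assuming ValidateConfig simply fails fast on missing environment variables:

```go
// Hypothetical sketch of an env-var validator with the same signature as
// config.ValidateConfig; the real internal/common/config implementation may differ.
package config

import (
    "log"
    "os"
)

// ValidateConfig terminates the process if any required environment variable is unset.
func ValidateConfig(requiredEnvVars []string) {
    for _, name := range requiredEnvVars {
        if _, ok := os.LookupEnv(name); !ok {
            log.Fatalf("missing required environment variable: %s", name)
        }
    }
}
```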
92 changes: 92 additions & 0 deletions deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl
@@ -0,0 +1,92 @@
{{- define "fake-gpu-operator.device-plugin.metadata" }}
metadata:
  {{- if .Values.environment.openshift }}
  annotations:
    openshift.io/scc: hostmount-anyuid
  {{- end }}
  labels:
    app: device-plugin
  name: device-plugin
  namespace: {{ .Release.Namespace }}
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podSelector" }}
selector:
  matchLabels:
    app: device-plugin
    component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podTemplate.metadata" }}
metadata:
  annotations:
    checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
  labels:
    app: device-plugin
    component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podTemplate.spec.common" }}
containers:
  - image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
    imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
    resources:
      {{- toYaml .Values.devicePlugin.resources | nindent 12 }}
    env:
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: spec.nodeName
      - name: TOPOLOGY_CM_NAME
        value: topology
      - name: TOPOLOGY_CM_NAMESPACE
        value: "{{ .Release.Namespace }}"
    name: nvidia-device-plugin-ctr
    securityContext:
      privileged: true
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
      - mountPath: /runai/bin
        name: runai-bin-directory
      - mountPath: /runai/shared
        name: runai-shared-directory
      - mountPath: /var/lib/kubelet/device-plugins
        name: device-plugin
dnsPolicy: ClusterFirst
restartPolicy: Always
serviceAccountName: nvidia-device-plugin
terminationGracePeriodSeconds: 30
tolerations:
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists
imagePullSecrets:
  - name: gcr-secret
volumes:
  - hostPath:
      path: /var/lib/kubelet/device-plugins
      type: ""
    name: device-plugin
  - hostPath:
      path: /var/lib/runai/bin
      type: DirectoryOrCreate
    name: runai-bin-directory
  - hostPath:
      path: /var/lib/runai/shared
      type: DirectoryOrCreate
    name: runai-shared-directory
{{- end }}

{{- define "fake-gpu-operator.device-plugin.deployment" }}
apiVersion: apps/v1
kind: Deployment
{{- include "fake-gpu-operator.device-plugin.metadata" .}}
spec:
  replicas: 1
  {{- include "fake-gpu-operator.device-plugin.podSelector" . | nindent 2 }}
  template:
    {{- include "fake-gpu-operator.device-plugin.podTemplate.metadata" . | nindent 4 }}
    spec:
      {{- include "fake-gpu-operator.device-plugin.podTemplate.spec.common" . | nindent 6 }}
{{- end }}
73 changes: 5 additions & 68 deletions deploy/fake-gpu-operator/templates/device-plugin/daemonset.yml
@@ -1,75 +1,12 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  {{- if .Values.environment.openshift }}
  annotations:
    openshift.io/scc: hostmount-anyuid
  {{- end }}
  labels:
    app: device-plugin
  name: device-plugin
{{- include "fake-gpu-operator.device-plugin.metadata" . }}
spec:
  selector:
    matchLabels:
      app: device-plugin
      component: device-plugin
  {{- include "fake-gpu-operator.device-plugin.podSelector" . | nindent 2 }}
  template:
    metadata:
      annotations:
        checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
      labels:
        app: device-plugin
        component: device-plugin
    {{- include "fake-gpu-operator.device-plugin.podTemplate.metadata" . | nindent 4 }}
    spec:
      containers:
        - image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
          imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
          resources:
            {{- toYaml .Values.devicePlugin.resources | nindent 12 }}
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: TOPOLOGY_CM_NAME
              value: topology
            - name: TOPOLOGY_CM_NAMESPACE
              value: "{{ .Release.Namespace }}"
          imagePullPolicy: Always
          name: nvidia-device-plugin-ctr
          securityContext:
            privileged: true
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /runai/bin
              name: runai-bin-directory
            - mountPath: /runai/shared
              name: runai-shared-directory
            - mountPath: /var/lib/kubelet/device-plugins
              name: device-plugin
      dnsPolicy: ClusterFirst
      {{- include "fake-gpu-operator.device-plugin.podTemplate.spec.common" . | nindent 6 }}
      nodeSelector:
        nvidia.com/gpu.deploy.device-plugin: "true"
      restartPolicy: Always
      serviceAccountName: nvidia-device-plugin
      terminationGracePeriodSeconds: 30
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      imagePullSecrets:
        - name: gcr-secret
      volumes:
        - hostPath:
            path: /var/lib/kubelet/device-plugins
            type: ""
          name: device-plugin
        - hostPath:
            path: /var/lib/runai/bin
            type: DirectoryOrCreate
          name: runai-bin-directory
        - hostPath:
            path: /var/lib/runai/shared
            type: DirectoryOrCreate
          name: runai-shared-directory

77 changes: 77 additions & 0 deletions deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
@@ -0,0 +1,77 @@
{{- define "fake-gpu-operator.status-exporter.metadata" }}
metadata:
  labels:
    app: nvidia-dcgm-exporter
    component: status-exporter
    app.kubernetes.io/name: nvidia-container-toolkit
  name: nvidia-dcgm-exporter
  namespace: {{ .Release.Namespace }}
{{- end }}

{{- define "fake-gpu-operator.status-exporter.podSelector" }}
selector:
  matchLabels:
    app: nvidia-dcgm-exporter
{{- end }}

{{- define "fake-gpu-operator.status-exporter.podTemplate.metadata" }}
metadata:
  creationTimestamp: null
  labels:
    app: nvidia-dcgm-exporter
    app.kubernetes.io/name: nvidia-container-toolkit
{{- end }}

{{- define "fake-gpu-operator.status-exporter.podTemplate.spec.common" }}
containers:
  - image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
    imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
    resources:
      {{- toYaml .Values.statusExporter.resources | nindent 8 }}
    name: nvidia-dcgm-exporter
    env:
      - name: NODE_NAME
        valueFrom:
          fieldRef:
            fieldPath: spec.nodeName
      - name: TOPOLOGY_CM_NAME
        value: topology
      - name: TOPOLOGY_CM_NAMESPACE
        value: "{{ .Release.Namespace }}"
      - name: TOPOLOGY_MAX_EXPORT_INTERVAL
        value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
    ports:
      - containerPort: 9400
        name: http
    volumeMounts:
      - mountPath: /runai/proc
        name: runai-proc-directory
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
serviceAccountName: status-exporter
tolerations:
  - effect: NoSchedule
    key: nvidia.com/gpu
    operator: Exists
imagePullSecrets:
  - name: gcr-secret
volumes:
  - name: runai-proc-directory
    hostPath:
      path: /var/lib/runai/proc
      type: DirectoryOrCreate
{{- end }}

{{- define "fake-gpu-operator.status-exporter.deployment" }}
apiVersion: apps/v1
kind: Deployment
{{- include "fake-gpu-operator.status-exporter.metadata" .}}
spec:
  replicas: 1
  {{- include "fake-gpu-operator.status-exporter.podSelector" . | nindent 2 }}
  template:
    {{- include "fake-gpu-operator.status-exporter.podTemplate.metadata" . | nindent 4 }}
    spec:
      {{- include "fake-gpu-operator.status-exporter.podTemplate.spec.common" . | nindent 6 }}
{{- end }}
65 changes: 5 additions & 60 deletions deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml
@@ -1,66 +1,11 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: nvidia-dcgm-exporter
    component: status-exporter
    # this label would make the deployment pod to mimic the container-toolkit, on top of mimicking the dcgm-exporter.
    app.kubernetes.io/name: nvidia-container-toolkit
  name: nvidia-dcgm-exporter

{{- include "fake-gpu-operator.status-exporter.metadata" . }}
spec:
  selector:
    matchLabels:
      app: nvidia-dcgm-exporter
  {{- include "fake-gpu-operator.status-exporter.podSelector" . | nindent 2 }}
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: nvidia-dcgm-exporter
        app.kubernetes.io/name: nvidia-container-toolkit
    {{- include "fake-gpu-operator.status-exporter.podTemplate.metadata" . | nindent 4 }}
    spec:
      containers:
        - image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
          imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
          resources:
            {{- toYaml .Values.statusExporter.resources | nindent 12 }}
          name: nvidia-dcgm-exporter
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: TOPOLOGY_CM_NAME
              value: topology
            - name: TOPOLOGY_CM_NAMESPACE
              value: "{{ .Release.Namespace }}"
            - name: TOPOLOGY_MAX_EXPORT_INTERVAL
              value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
          ports:
            - containerPort: 9400
              name: http
          volumeMounts:
            - mountPath: /runai/proc
              name: runai-proc-directory
      {{- include "fake-gpu-operator.status-exporter.podTemplate.spec.common" . | nindent 6 }}
      nodeSelector:
        nvidia.com/gpu.deploy.dcgm-exporter: "true"
      restartPolicy: Always
      schedulerName: default-scheduler
      serviceAccount: status-exporter
      serviceAccountName: status-exporter
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      imagePullSecrets:
        - name: gcr-secret
      volumes:
        - name: runai-proc-directory
          hostPath:
            path: /var/lib/runai/proc
            type: DirectoryOrCreate
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
        nvidia.com/gpu.deploy.dcgm-exporter: "true"
@@ -0,0 +1,9 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: fake-node-deployments
data:
  status-exporter.yaml: |
    {{- include "fake-gpu-operator.status-exporter.deployment" . | nindent 8 }}
  device-plugin.yaml: |
    {{- include "fake-gpu-operator.device-plugin.deployment" . | nindent 8 }}
@@ -29,7 +29,16 @@ spec:
              value: topology
            - name: TOPOLOGY_CM_NAMESPACE
              value: "{{ .Release.Namespace }}"
            - name: FAKE_NODE_DEPLOYMENTS_PATH
              value: /var/lib/status-updater/fake-node-deployments
          volumeMounts:
            - name: fake-node-deployments
              mountPath: /var/lib/status-updater/fake-node-deployments
      restartPolicy: Always
      serviceAccountName: status-updater
      imagePullSecrets:
        - name: gcr-secret
      volumes:
        - name: fake-node-deployments
          configMap:
            name: fake-node-deployments
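
Taken together, the chart renders the device-plugin and status-exporter Deployment definitions into the fake-node-deployments ConfigMap and mounts it into the status-updater at FAKE_NODE_DEPLOYMENTS_PATH. The status-updater code that consumes these manifests is in files not expanded above, so the following is only a rough sketch of how the mounted manifests could be read and stamped out per fake node; the function name createFakeNodeDeployments and the node-name suffix are illustrative assumptions, not the actual internal/status-updater implementation:

```go
// Illustrative sketch only: read the Deployment manifests mounted from the
// fake-node-deployments ConfigMap and create one Deployment per fake node.
package main

import (
    "context"
    "fmt"
    "os"
    "path/filepath"

    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "sigs.k8s.io/yaml"
)

func createFakeNodeDeployments(ctx context.Context, client kubernetes.Interface, nodeName string) error {
    dir := os.Getenv("FAKE_NODE_DEPLOYMENTS_PATH")
    entries, err := os.ReadDir(dir)
    if err != nil {
        return fmt.Errorf("reading deployment templates: %w", err)
    }
    for _, entry := range entries {
        // ConfigMap volumes also contain hidden bookkeeping entries; only parse the YAML files.
        if entry.IsDir() || filepath.Ext(entry.Name()) != ".yaml" {
            continue
        }
        data, err := os.ReadFile(filepath.Join(dir, entry.Name()))
        if err != nil {
            return err
        }
        var dep appsv1.Deployment
        if err := yaml.Unmarshal(data, &dep); err != nil {
            return fmt.Errorf("decoding %s: %w", entry.Name(), err)
        }
        // Suffix the name so every fake node gets its own copy of the component.
        dep.Name = fmt.Sprintf("%s-%s", dep.Name, nodeName)
        if _, err := client.AppsV1().Deployments(dep.Namespace).Create(ctx, &dep, metav1.CreateOptions{}); err != nil {
            return err
        }
    }
    return nil
}
```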