Skip to content

Commit

Permalink
Refactor to use deployment templates
Browse files Browse the repository at this point in the history
  • Loading branch information
gshaibi committed Mar 28, 2024
1 parent 86b2c12 commit c26b451
Show file tree
Hide file tree
Showing 15 changed files with 172 additions and 140 deletions.
2 changes: 1 addition & 1 deletion cmd/status-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
)

func main() {
requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_NODE_DEPLOYMENTS_PATH"}
requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_GPU_OPERATOR_NAMESPACE"}
config.ValidateConfig(requiredEnvVars)

appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})
Expand Down
59 changes: 22 additions & 37 deletions deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -1,32 +1,30 @@
{{- define "fake-gpu-operator.device-plugin.metadata" }}
metadata:
{{- if .Values.environment.openshift }}
annotations:
openshift.io/scc: hostmount-anyuid
{{- end }}
labels:
app: device-plugin
name: device-plugin
namespace: {{ .Release.Namespace }}
{{- end }}
{{- define "fake-gpu-operator.device-plugin.common.metadata.labels" -}}
app: device-plugin
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.metadata.annotations" -}}
openshift.io/scc: hostmount-anyuid
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.podSelector" }}
selector:
matchLabels:
app: device-plugin
component: device-plugin
{{- define "fake-gpu-operator.device-plugin.common.metadata.name" -}}
device-plugin
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.podSelector" }}
matchLabels:
app: device-plugin
component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podTemplate.metadata" }}
metadata:
annotations:
checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: device-plugin
component: device-plugin
{{- define "fake-gpu-operator.device-plugin.common.podTemplate.metadata" }}
annotations:
checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: device-plugin
component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.podTemplate.spec.common" }}
{{- define "fake-gpu-operator.device-plugin.common.podTemplate.spec" }}
containers:
- image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
Expand Down Expand Up @@ -77,16 +75,3 @@ volumes:
type: DirectoryOrCreate
name: runai-shared-directory
{{- end }}

{{- define "fake-gpu-operator.device-plugin.deployment" }}
apiVersion: apps/v1
kind: Deployment
{{- include "fake-gpu-operator.device-plugin.metadata" .}}
spec:
replicas: 1
{{- include "fake-gpu-operator.device-plugin.podSelector" . | nindent 2 }}
template:
{{- include "fake-gpu-operator.device-plugin.podTemplate.metadata" . | nindent 4 }}
spec:
{{- include "fake-gpu-operator.device-plugin.podTemplate.spec.common" . | nindent 6 }}
{{- end }}
14 changes: 9 additions & 5 deletions deploy/fake-gpu-operator/templates/device-plugin/daemonset.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
apiVersion: apps/v1
kind: DaemonSet
{{- include "fake-gpu-operator.device-plugin.metadata" . }}
metadata:
name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
spec:
{{- include "fake-gpu-operator.device-plugin.podSelector" . | nindent 2 }}
selector:
{{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
template:
{{- include "fake-gpu-operator.device-plugin.podTemplate.metadata" . | nindent 4 }}
metadata:
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.device-plugin.podTemplate.spec.common" . | nindent 6 }}
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
nodeSelector:
nvidia.com/gpu.deploy.device-plugin: "true"

Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Deployment variant of the device-plugin workload. It reuses the same
# "fake-gpu-operator.device-plugin.common.*" helper templates (name, labels,
# pod selector, pod template metadata and spec) as the device-plugin DaemonSet.
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
# Marker label; same key as constants.FakeNodeDeploymentTemplateLabel in
# internal/common/constants/constants.go.
run.ai/fake-node-deployment-template: "true"
spec:
# Zero replicas: this object itself schedules no pods.
# NOTE(review): presumably status-updater uses it as a template to create
# per-fake-node Deployments - confirm against the status-updater code.
replicas: 0
selector:
{{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
template:
metadata:
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
56 changes: 20 additions & 36 deletions deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -1,28 +1,25 @@
{{- define "fake-gpu-operator.status-exporter.metadata" }}
metadata:
labels:
app: nvidia-dcgm-exporter
component: status-exporter
app.kubernetes.io/name: nvidia-container-toolkit
name: nvidia-dcgm-exporter
namespace: {{ .Release.Namespace }}
{{- end }}
{{- define "fake-gpu-operator.status-exporter.common.metadata.labels" -}}
app: nvidia-dcgm-exporter
component: status-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.metadata.name" -}}
nvidia-dcgm-exporter
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.podSelector" }}
selector:
matchLabels:
app: nvidia-dcgm-exporter
{{- end }}
{{- define "fake-gpu-operator.status-exporter.common.podSelector" -}}
matchLabels:
app: nvidia-dcgm-exporter
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.podTemplate.metadata" }}
metadata:
creationTimestamp: null
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- end }}
{{- define "fake-gpu-operator.status-exporter.common.podTemplate.metadata" -}}
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.podTemplate.spec.common" }}
{{- define "fake-gpu-operator.status-exporter.common.podTemplate.spec" -}}
containers:
- image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
Expand Down Expand Up @@ -61,17 +58,4 @@ volumes:
hostPath:
path: /var/lib/runai/proc
type: DirectoryOrCreate
{{- end }}

{{- define "fake-gpu-operator.status-exporter.deployment" }}
apiVersion: apps/v1
kind: Deployment
{{- include "fake-gpu-operator.status-exporter.metadata" .}}
spec:
replicas: 1
{{- include "fake-gpu-operator.status-exporter.podSelector" . | nindent 2 }}
template:
{{- include "fake-gpu-operator.status-exporter.podTemplate.metadata" . | nindent 4 }}
spec:
{{- include "fake-gpu-operator.status-exporter.podTemplate.spec.common" . | nindent 6 }}
{{- end }}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
apiVersion: apps/v1
kind: DaemonSet
{{- include "fake-gpu-operator.status-exporter.metadata" . }}
metadata:
name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
spec:
{{- include "fake-gpu-operator.status-exporter.podSelector" . | nindent 2 }}
selector:
{{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
template:
{{- include "fake-gpu-operator.status-exporter.podTemplate.metadata" . | nindent 4 }}
metadata:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.status-exporter.podTemplate.spec.common" . | nindent 6 }}
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
nodeSelector:
nvidia.com/gpu.deploy.dcgm-exporter: "true"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Deployment variant of the status-exporter workload. It reuses the same
# "fake-gpu-operator.status-exporter.common.*" helper templates (name, labels,
# pod selector, pod template metadata and spec) as the status-exporter DaemonSet.
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
# Marker label; same key as constants.FakeNodeDeploymentTemplateLabel in
# internal/common/constants/constants.go.
run.ai/fake-node-deployment-template: "true"
spec:
# Zero replicas: this object itself schedules no pods.
# NOTE(review): presumably status-updater uses it as a template to create
# per-fake-node Deployments - confirm against the status-updater code.
replicas: 0
selector:
{{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
template:
metadata:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ rules:
- get
- list
- watch
- patch
- apiGroups:
- ""
resources:
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,12 @@ spec:
env:
- name: TOPOLOGY_CM_NAME
value: topology
# GuyTodo: Remove and replace with FAKE_GPU_OPERATOR_NAMESPACE
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: FAKE_NODE_DEPLOYMENTS_PATH
value: /var/lib/status-updater/fake-node-deployments
volumeMounts:
- name: fake-node-deployments
mountPath: /var/lib/status-updater/fake-node-deployments
- name: FAKE_GPU_OPERATOR_NAMESPACE
value: "{{ .Release.Namespace }}"
restartPolicy: Always
serviceAccountName: status-updater
imagePullSecrets:
- name: gcr-secret
volumes:
- name: fake-node-deployments
configMap:
name: fake-node-deployments
16 changes: 16 additions & 0 deletions deploy/fake-gpu-operator/templates/status-updater/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Namespaced Role that grants the verbs listed below on Deployments in the
# "apps" API group. A RoleBinding of the same name (rolebinding.yaml) binds it
# to the status-updater ServiceAccount.
# No metadata.namespace is set in this manifest.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: fake-status-updater
rules:
- apiGroups:
- apps
resources:
- deployments
verbs:
- update
- list
- get
- watch
- create
- delete
12 changes: 12 additions & 0 deletions deploy/fake-gpu-operator/templates/status-updater/rolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the fake-status-updater Role to the status-updater ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: fake-status-updater
roleRef:
kind: Role
apiGroup: rbac.authorization.k8s.io
name: fake-status-updater
subjects:
- kind: ServiceAccount
name: status-updater
# The subject ServiceAccount is looked up in the Helm release namespace.
namespace: "{{ .Release.Namespace }}"
8 changes: 5 additions & 3 deletions internal/common/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ const (
MigMappingAnnotation = "run.ai/mig-mapping"
KwokNodeAnnotation = "kwok.x-k8s.io/node"

GpuGroupLabel = "runai-gpu-group"
GpuProductLabel = "nvidia.com/gpu.product"
MigConfigStateLabel = "nvidia.com/mig.config.state"
GpuGroupLabel = "runai-gpu-group"
GpuProductLabel = "nvidia.com/gpu.product"
MigConfigStateLabel = "nvidia.com/mig.config.state"
FakeNodeDeploymentTemplateLabel = "run.ai/fake-node-deployment-template"

ReservationNs = "runai-reservation"

Expand All @@ -21,4 +22,5 @@ const (
EnvNodeName = "NODE_NAME"
EnvTopologyCmName = "TOPOLOGY_CM_NAME"
EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE"
EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE"
)
Loading

0 comments on commit c26b451

Please sign in to comment.