diff --git a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl index 076ba02..28a779d 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl +++ b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl @@ -17,6 +17,8 @@ matchLabels: labels: app: nvidia-dcgm-exporter app.kubernetes.io/name: nvidia-container-toolkit +annotations: + checksum/hostpath-init-configmap: {{ include (print $.Template.BasePath "/status-exporter/hostpath-init-configmap.yaml") . | sha256sum }} {{- end -}} {{- define "fake-gpu-operator.status-exporter.common.podTemplate.spec" -}} @@ -41,8 +43,8 @@ containers: - containerPort: 9400 name: http volumeMounts: - - mountPath: /runai/proc - name: runai-proc-directory + - mountPath: /runai + name: runai-data restartPolicy: Always schedulerName: default-scheduler serviceAccount: status-exporter @@ -54,8 +56,11 @@ tolerations: imagePullSecrets: - name: gcr-secret volumes: - - name: runai-proc-directory + - name: runai-data hostPath: - path: /var/lib/runai/proc + path: /var/lib/runai type: DirectoryOrCreate + - name: hostpath-init-script + configMap: + name: hostpath-init {{- end -}} diff --git a/deploy/fake-gpu-operator/templates/status-exporter/clusterrole.yaml b/deploy/fake-gpu-operator/templates/status-exporter/clusterrole.yaml index f11a061..34f913d 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/clusterrole.yaml +++ b/deploy/fake-gpu-operator/templates/status-exporter/clusterrole.yaml @@ -30,10 +30,11 @@ rules: {{- if .Values.environment.openshift }} - apiGroups: - security.openshift.io - resourceNames: - - hostaccess resources: - securitycontextconstraints + resourceNames: + # Required for chmod and chcon to runai host directory + - privileged verbs: - use -{{ end }} \ No newline at end of file +{{ end }} diff --git a/deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml b/deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml index 916f6b6..d98419e 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml +++ b/deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml @@ -12,5 +12,18 @@ spec: {{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }} spec: {{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }} + initContainers: + - name: hostpath-init + image: "ubuntu:24.04" + command: ["/bin/bash", "/hostpath-init/init.sh"] + volumeMounts: + - name: runai-data + mountPath: /runai + - name: hostpath-init-script + mountPath: /hostpath-init + securityContext: + seccompProfile: + type: RuntimeDefault + privileged: true nodeSelector: - nvidia.com/gpu.deploy.dcgm-exporter: "true" \ No newline at end of file + nvidia.com/gpu.deploy.dcgm-exporter: "true" diff --git a/deploy/fake-gpu-operator/templates/status-exporter/hostpath-init-configmap.yaml b/deploy/fake-gpu-operator/templates/status-exporter/hostpath-init-configmap.yaml new file mode 100644 index 0000000..acebeda --- /dev/null +++ b/deploy/fake-gpu-operator/templates/status-exporter/hostpath-init-configmap.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: hostpath-init +data: + init.sh: | + #!/bin/bash + + set -e + + RUNAI_DIR=/runai + + # Allow containers to read/write to RUNAI_DIR + chmod 777 $RUNAI_DIR + chcon -Rt svirt_sandbox_file_t $RUNAI_DIR + diff --git a/internal/status-exporter/export/fs/exporter.go b/internal/status-exporter/export/fs/exporter.go index ae280de..d2bcb01 100644 --- a/internal/status-exporter/export/fs/exporter.go +++ b/internal/status-exporter/export/fs/exporter.go @@ -40,8 +40,14 @@ func (e *FsExporter) Run(stopCh <-chan struct{}) { } func (e *FsExporter) export(nodeTopology *topology.NodeTopology) { - if err := os.RemoveAll("/runai/proc/pod"); err != nil { - log.Printf("Failed deleting runai/proc/pod directory: %s", err.Error()) + exportPods(nodeTopology) + exportEvents() +} + +func exportPods(nodeTopology *topology.NodeTopology) { + podProcDir := "/runai/proc/pod" + if err := os.RemoveAll(podProcDir); err != nil { + log.Printf("Failed deleting %s directory: %s", podProcDir, err.Error()) } for gpuIdx, gpu := range nodeTopology.Gpus { @@ -53,7 +59,7 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) { for podUuid, gpuUsageStatus := range gpu.Status.PodGpuUsageStatus { log.Printf("Exporting pod %s gpu stats to filesystem", podUuid) - path := fmt.Sprintf("/runai/proc/pod/%s/metrics/gpu/%d", podUuid, gpuIdx) + path := fmt.Sprintf("%s/%s/metrics/gpu/%d", podProcDir, podUuid, gpuIdx) if err := os.MkdirAll(path, 0755); err != nil { log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error()) } @@ -69,6 +75,15 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) { } } +func exportEvents() { + // For now, only creating the directory without exporting any events. + // In the future, we might want to export events to the filesystem as well. + eventsDir := "/runai/proc/events" + if err := os.MkdirAll(eventsDir, 0755); err != nil { + log.Printf("Failed creating directory for events: %s", err.Error()) + } +} + func writeFile(path string, content []byte) error { if err := os.WriteFile(path, content, 0644); err != nil { return fmt.Errorf("failed writing file %s: %w", path, err)