Skip to content

Commit

Permalink
RUN-23425 Runai directory initialization bugfix (#97)
Browse files Browse the repository at this point in the history
  • Loading branch information
gshaibi authored Nov 17, 2024
1 parent 832f4d4 commit 20b6bf3
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 11 deletions.
13 changes: 9 additions & 4 deletions deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ matchLabels:
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
annotations:
checksum/hostpath-init-configmap: {{ include (print $.Template.BasePath "/status-exporter/hostpath-init-configmap.yaml") . | sha256sum }}
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podTemplate.spec" -}}
Expand All @@ -41,8 +43,8 @@ containers:
- containerPort: 9400
name: http
volumeMounts:
- mountPath: /runai/proc
name: runai-proc-directory
- mountPath: /runai
name: runai-data
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
Expand All @@ -54,8 +56,11 @@ tolerations:
imagePullSecrets:
- name: gcr-secret
volumes:
- name: runai-proc-directory
- name: runai-data
hostPath:
path: /var/lib/runai/proc
path: /var/lib/runai
type: DirectoryOrCreate
- name: hostpath-init-script
configMap:
name: hostpath-init
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ rules:
{{- if .Values.environment.openshift }}
- apiGroups:
- security.openshift.io
resourceNames:
- hostaccess
resources:
- securitycontextconstraints
resourceNames:
# Required for chmod and chcon to runai host directory
- privileged
verbs:
- use
{{ end }}
{{ end }}
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,18 @@ spec:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
initContainers:
- name: hostpath-init
image: "ubuntu:24.04"
command: ["/bin/bash", "/hostpath-init/init.sh"]
volumeMounts:
- name: runai-data
mountPath: /runai
- name: hostpath-init-script
mountPath: /hostpath-init
securityContext:
seccompProfile:
type: RuntimeDefault
privileged: true
nodeSelector:
nvidia.com/gpu.deploy.dcgm-exporter: "true"
nvidia.com/gpu.deploy.dcgm-exporter: "true"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ConfigMap that carries init.sh, the script executed by the "hostpath-init"
# init container (invoked as `/bin/bash /hostpath-init/init.sh`) to prepare the
# /runai host directory before the status-exporter container starts.
apiVersion: v1
kind: ConfigMap
metadata:
name: hostpath-init
data:
init.sh: |
#!/bin/bash
set -e
RUNAI_DIR=/runai
# Allow containers to read/write to RUNAI_DIR
chmod 777 $RUNAI_DIR
chcon -Rt svirt_sandbox_file_t $RUNAI_DIR
21 changes: 18 additions & 3 deletions internal/status-exporter/export/fs/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,14 @@ func (e *FsExporter) Run(stopCh <-chan struct{}) {
}

func (e *FsExporter) export(nodeTopology *topology.NodeTopology) {
if err := os.RemoveAll("/runai/proc/pod"); err != nil {
log.Printf("Failed deleting runai/proc/pod directory: %s", err.Error())
exportPods(nodeTopology)
exportEvents()
}

func exportPods(nodeTopology *topology.NodeTopology) {
podProcDir := "/runai/proc/pod"
if err := os.RemoveAll(podProcDir); err != nil {
log.Printf("Failed deleting %s directory: %s", podProcDir, err.Error())
}

for gpuIdx, gpu := range nodeTopology.Gpus {
Expand All @@ -53,7 +59,7 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) {
for podUuid, gpuUsageStatus := range gpu.Status.PodGpuUsageStatus {
log.Printf("Exporting pod %s gpu stats to filesystem", podUuid)

path := fmt.Sprintf("/runai/proc/pod/%s/metrics/gpu/%d", podUuid, gpuIdx)
path := fmt.Sprintf("%s/%s/metrics/gpu/%d", podProcDir, podUuid, gpuIdx)
if err := os.MkdirAll(path, 0755); err != nil {
log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error())
}
Expand All @@ -69,6 +75,15 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) {
}
}

// exportEvents makes sure the events directory exists on the filesystem.
//
// No events are written yet: the directory is only created as a placeholder,
// leaving room to export events to the filesystem in the future. A failure to
// create the directory is logged and otherwise ignored.
func exportEvents() {
	const eventsDir = "/runai/proc/events"

	err := os.MkdirAll(eventsDir, 0755)
	if err == nil {
		return
	}
	log.Printf("Failed creating directory for events: %s", err.Error())
}

func writeFile(path string, content []byte) error {
if err := os.WriteFile(path, content, 0644); err != nil {
return fmt.Errorf("failed writing file %s: %w", path, err)
Expand Down

0 comments on commit 20b6bf3

Please sign in to comment.