Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RUN-23425 Runai directory initialization bugfix #97

Merged
merged 2 commits into from
Nov 17, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ matchLabels:
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
annotations:
checksum/hostpath-init-configmap: {{ include (print $.Template.BasePath "/status-exporter/hostpath-init-configmap.yaml") . | sha256sum }}
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podTemplate.spec" -}}
Expand All @@ -41,8 +43,8 @@ containers:
- containerPort: 9400
name: http
volumeMounts:
- mountPath: /runai/proc
name: runai-proc-directory
- mountPath: /runai
name: runai-data
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
Expand All @@ -54,8 +56,11 @@ tolerations:
imagePullSecrets:
- name: gcr-secret
volumes:
- name: runai-proc-directory
- name: runai-data
hostPath:
path: /var/lib/runai/proc
path: /var/lib/runai
type: DirectoryOrCreate
- name: hostpath-init-script
configMap:
name: hostpath-init
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ rules:
{{- if .Values.environment.openshift }}
- apiGroups:
- security.openshift.io
resourceNames:
- hostaccess
resources:
- securitycontextconstraints
resourceNames:
# Required for chmod and chcon to runai host directory
- privileged
verbs:
- use
{{ end }}
{{ end }}
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,18 @@ spec:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
initContainers:
- name: hostpath-init
image: "ubuntu:24.04"
command: ["/bin/bash", "/hostpath-init/init.sh"]
volumeMounts:
- name: runai-data
mountPath: /runai
- name: hostpath-init-script
mountPath: /hostpath-init
securityContext:
seccompProfile:
type: RuntimeDefault
privileged: true
nodeSelector:
nvidia.com/gpu.deploy.dcgm-exporter: "true"
nvidia.com/gpu.deploy.dcgm-exporter: "true"
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: hostpath-init
data:
init.sh: |
#!/bin/bash

set -e

RUNAI_DIR=/runai

# Print the current date, and user running the script (GuyRemove)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this should be removed?

date
whoami

# Allow containers to read/write to RUNAI_DIR
chmod 777 $RUNAI_DIR
chcon -Rt svirt_sandbox_file_t $RUNAI_DIR

21 changes: 18 additions & 3 deletions internal/status-exporter/export/fs/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,14 @@ func (e *FsExporter) Run(stopCh <-chan struct{}) {
}

func (e *FsExporter) export(nodeTopology *topology.NodeTopology) {
if err := os.RemoveAll("/runai/proc/pod"); err != nil {
log.Printf("Failed deleting runai/proc/pod directory: %s", err.Error())
exportPods(nodeTopology)
exportEvents()
}

func exportPods(nodeTopology *topology.NodeTopology) {
podProcDir := "/runai/proc/pod"
if err := os.RemoveAll(podProcDir); err != nil {
log.Printf("Failed deleting %s directory: %s", podProcDir, err.Error())
}

for gpuIdx, gpu := range nodeTopology.Gpus {
Expand All @@ -53,7 +59,7 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) {
for podUuid, gpuUsageStatus := range gpu.Status.PodGpuUsageStatus {
log.Printf("Exporting pod %s gpu stats to filesystem", podUuid)

path := fmt.Sprintf("/runai/proc/pod/%s/metrics/gpu/%d", podUuid, gpuIdx)
path := fmt.Sprintf("%s/%s/metrics/gpu/%d", podProcDir, podUuid, gpuIdx)
if err := os.MkdirAll(path, 0755); err != nil {
log.Printf("Failed creating directory for pod %s: %s", podUuid, err.Error())
}
Expand All @@ -69,6 +75,15 @@ func (e *FsExporter) export(nodeTopology *topology.NodeTopology) {
}
}

// exportEvents makes sure the events directory exists on the filesystem.
//
// At present no event data is written; only the directory itself is
// created (with mode 0755, including any missing parents). Exporting actual
// events to the filesystem may be added later. A failure to create the
// directory is logged and otherwise ignored.
func exportEvents() {
	const dir = "/runai/proc/events"

	err := os.MkdirAll(dir, 0755)
	if err == nil {
		return
	}
	log.Printf("Failed creating directory for events: %s", err.Error())
}

func writeFile(path string, content []byte) error {
if err := os.WriteFile(path, content, 0644); err != nil {
return fmt.Errorf("failed writing file %s: %w", path, err)
Expand Down
Loading