From 3fad3f19d7d181a67e6bba561df1a188521474d8 Mon Sep 17 00:00:00 2001
From: Kristi Nikolla
Date: Tue, 8 Oct 2024 12:42:41 -0400
Subject: [PATCH] Updated job resources and added mount on /dev/shm

/dev/shm being too small was causing a crash in the DataLoader.
---
Reviewer notes (below the `---` separator, so not part of the commit message):

* Purpose: mounts a memory-backed emptyDir named `shm` at /dev/shm and adds
  CPU/memory requests and limits to the container.
* NOTE(review): this patch was restored from a copy whose newlines and
  indentation had been collapsed. The nesting below follows the Kubernetes
  Job schema, but the exact column positions (and the flush list style) are
  an assumption -- confirm against k8s/base/job.yaml before applying.
* NOTE(review): the `shm` volume sets no `sizeLimit`. Per the Kubernetes
  emptyDir documentation, data written to a memory-backed emptyDir counts
  against the memory limit of the writing container (64Gi here); consider an
  explicit `sizeLimit` in a follow-up change.

 k8s/base/job.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/k8s/base/job.yaml b/k8s/base/job.yaml
index 4e9a49a..fb6e446 100644
--- a/k8s/base/job.yaml
+++ b/k8s/base/job.yaml
@@ -12,9 +12,21 @@ spec:
         volumeMounts:
         - mountPath: /storage/unet3d_data
           name: unet3d-data
+        - mountPath: /dev/shm
+          name: shm
+        resources:
+          requests:
+            memory: "16Gi"
+            cpu: "500m"
+          limits:
+            memory: "64Gi"
+            cpu: "4"
       restartPolicy: Never
       volumes:
       - name: unet3d-data
         persistentVolumeClaim:
           claimName: mlperf-storage-data
+      - name: shm
+        emptyDir:
+          medium: Memory
   backoffLimit: 4