From 1c8d57085af0c4b6f6609b4df4b35f30788b311e Mon Sep 17 00:00:00 2001 From: gshaibi <gshaibi@gmail.com> Date: Sun, 24 Nov 2024 17:15:57 +0200 Subject: [PATCH] Samples --- .../metrics/active-frac-gpu-pod.yaml | 83 ++++ .../metrics/active-whole-gpu-pod.yaml | 84 ++++ design/samples/runai/2.17/fractional_pod.yaml | 228 ++++++++++ .../2.17/fractional_pod_reservation.yaml | 124 ++++++ .../<2.9/fraction/metrics/1_workload.ini | 76 ++++ .../runai/<2.9/fraction/node/1_workload.yaml | 347 +++++++++++++++ .../<2.9/mig/metrics/4 Active Workloads.ini | 86 ++++ .../runai/<2.9/mig/metrics/Active 1g.5gb.ini | 44 ++ .../mig/metrics/Active 2*1g.5gb + 2g.10gb.ini | 72 ++++ .../<2.9/mig/metrics/Empty mig disabled.ini | 145 +++++++ .../<2.9/mig/metrics/Emtpy MIG enabled.ini | 44 ++ .../mig/metrics/Idle 1g.5gb allocated.ini | 44 ++ .../runai/<2.9/mig/node/3_instances.yaml | 395 +++++++++++++++++ .../runai/<2.9/mig/node/4_instances.yaml | 396 ++++++++++++++++++ design/samples/runai/<2.9/mig/pod/1g-5gb.yaml | 174 ++++++++ .../samples/runai/>=2.9/mig/node/7g-40gb.yaml | 385 +++++++++++++++++ .../samples/runai/>=2.9/mig/pod/7g-5gb.yaml | 169 ++++++++ 17 files changed, 2896 insertions(+) create mode 100644 design/samples/gpu-operator/metrics/active-frac-gpu-pod.yaml create mode 100644 design/samples/gpu-operator/metrics/active-whole-gpu-pod.yaml create mode 100644 design/samples/runai/2.17/fractional_pod.yaml create mode 100644 design/samples/runai/2.17/fractional_pod_reservation.yaml create mode 100644 design/samples/runai/<2.9/fraction/metrics/1_workload.ini create mode 100644 design/samples/runai/<2.9/fraction/node/1_workload.yaml create mode 100644 design/samples/runai/<2.9/mig/metrics/4 Active Workloads.ini create mode 100644 design/samples/runai/<2.9/mig/metrics/Active 1g.5gb.ini create mode 100644 design/samples/runai/<2.9/mig/metrics/Active 2*1g.5gb + 2g.10gb.ini create mode 100644 design/samples/runai/<2.9/mig/metrics/Empty mig disabled.ini create mode 100644 
design/samples/runai/<2.9/mig/metrics/Emtpy MIG enabled.ini create mode 100644 design/samples/runai/<2.9/mig/metrics/Idle 1g.5gb allocated.ini create mode 100644 design/samples/runai/<2.9/mig/node/3_instances.yaml create mode 100644 design/samples/runai/<2.9/mig/node/4_instances.yaml create mode 100644 design/samples/runai/<2.9/mig/pod/1g-5gb.yaml create mode 100644 design/samples/runai/>=2.9/mig/node/7g-40gb.yaml create mode 100644 design/samples/runai/>=2.9/mig/pod/7g-5gb.yaml diff --git a/design/samples/gpu-operator/metrics/active-frac-gpu-pod.yaml b/design/samples/gpu-operator/metrics/active-frac-gpu-pod.yaml new file mode 100644 index 0000000..791c5cb --- /dev/null +++ b/design/samples/gpu-operator/metrics/active-frac-gpu-pod.yaml @@ -0,0 +1,83 @@ +################ Active Fractional GPU Pod ################ + +###### From DCGM Exporter Directly ###### + +{ +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 300 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5000 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). 
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). +# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 44 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 28.822000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 484260738 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. 
+# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %). +# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %). +# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %). +# TYPE DCGM_FI_DEV_ENC_UTIL gauge +DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %). 
+# TYPE DCGM_FI_DEV_DEC_UTIL gauge +DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",err_code="0",err_msg="No Error",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). +# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 14914 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 2 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. 
+# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status +# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge +DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active. +# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.999983 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active. +# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data. 
+# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.466046 +# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5529374 +# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
+# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 21260577 +} + +###### From Prometheus ###### + +{ +DCGM_FI_DEV_FB_FREE{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} +8422 +DCGM_FI_DEV_FB_USED{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} +6494 +DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", 
namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} +100 +} + +###### Note ###### +ip-172-20-10-26 is the hostname of the node where the GPU is installed. diff --git a/design/samples/gpu-operator/metrics/active-whole-gpu-pod.yaml b/design/samples/gpu-operator/metrics/active-whole-gpu-pod.yaml new file mode 100644 index 0000000..3609937 --- /dev/null +++ b/design/samples/gpu-operator/metrics/active-whole-gpu-pod.yaml @@ -0,0 +1,84 @@ +################ Active Whole GPU Pod ################ + +###### From DCGM Exporter Directly ###### + +{ +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 585 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 5000 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). 
+# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 29 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 28.167000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 213382036 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. +# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %). +# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %). 
+# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %). +# TYPE DCGM_FI_DEV_ENC_UTIL gauge +DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %). +# TYPE DCGM_FI_DEV_DEC_UTIL gauge +DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",err_code="0",err_msg="No Error",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). +# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 14796 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). 
+# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 120 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. +# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status +# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge +DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active. +# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0.000408 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active. 
+# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data. +# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 0.000008 +# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 548328 +# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
+# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="burn-whole",namespace="runai-pa",pod="burn-whole-0-0"} 898157 +} + +###### From Prometheus ###### + +{ + +DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="burn-whole", exported_namespace="runai-pa", exported_pod="burn-whole-0-0", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} +100 +DCGM_FI_DEV_FB_FREE{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="burn-whole", exported_namespace="runai-pa", exported_pod="burn-whole-0-0", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} +1494 +DCGM_FI_DEV_FB_USED{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="burn-whole", exported_namespace="runai-pa", exported_pod="burn-whole-0-0", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} +13422 +} + +###### Note ###### +ip-172-20-10-26 is the name of the 
node where the GPU is installed. diff --git a/design/samples/runai/2.17/fractional_pod.yaml b/design/samples/runai/2.17/fractional_pod.yaml new file mode 100644 index 0000000..b2481bd --- /dev/null +++ b/design/samples/runai/2.17/fractional_pod.yaml @@ -0,0 +1,228 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + clusterId: d69dff42-4134-41d9-90fc-1c39505cb774 + cni.projectcalico.org/containerID: 4357ed00f685ddcfafb9b551fbad81f6ad9a39d74f0158fe22b6e937dad415df + cni.projectcalico.org/podIP: 100.122.249.152/32 + cni.projectcalico.org/podIPs: 100.122.249.152/32 + gpu-fraction: "0.5" + pod-group-name: pg-frac-1-0-2237ca39-cac0-4601-b658-8a3c5f406a4f + received-resource-type: Fraction + runai-allocated-gpu-memory: "7680" + runai-allocated-gpus: "0.5" + runai-allocated-mig-gpus: "0" + runai-calculated-status: Running + runai-job-id: 2237ca39-cac0-4601-b658-8a3c5f406a4f + runai-node: i-0b498db53280b86a6 + runai/shared-gpu-configmap: frac-1-ns26p7c-runai-sh-gpu + user: test@run.ai + workloadId: 027397ab-4c3c-45f7-87d0-8b3bae4ded65 + creationTimestamp: "2024-03-31T09:03:22Z" + generateName: frac-1- + labels: + app: runaijob + controller-uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f + createdBy: RunaiJob + project: pa + release: frac-1 + run.ai/top-owner-uid: 027397ab-4c3c-45f7-87d0-8b3bae4ded65 + runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b + runai/pod-index: 0-0 + workloadKind: TrainingWorkload + workloadName: frac-1 + name: frac-1-0-0 + namespace: runai-pa + ownerReferences: + - apiVersion: run.ai/v1 + blockOwnerDeletion: true + controller: true + kind: RunaiJob + name: frac-1 + uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f + resourceVersion: "10748" + uid: a801b3c7-b9be-4830-821c-2456cad2234f +spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: runai/node-pool + operator: DoesNotExist + containers: + - env: + - name: RUNAI_JOB_NAME + value: frac-1 + - name: RUNAI_PROJECT + 
value: pa + - name: WANDB_NOTES + value: https://shaibi-real.runailabs.com/trainings?columnFilter=[{"term":"frac-1","name":"name"}]&clusterId=d69dff42-4134-41d9-90fc-1c39505cb774 + - name: POD_INDEX + value: "0" + - name: RUNAI_GPU_MEMORY_REQUEST + value: "0.50" + - name: RUNAI_GPU_MEMORY_LIMIT + value: "0.50" + - name: NVIDIA_VISIBLE_DEVICES + valueFrom: + configMapKeyRef: + key: RUNAI-VISIBLE-DEVICES + name: frac-1-ns26p7c-runai-sh-gpu-0 + - name: RUNAI_NUM_OF_GPUS + valueFrom: + configMapKeyRef: + key: RUNAI_NUM_OF_GPUS + name: frac-1-ns26p7c-runai-sh-gpu-0 + - name: jobUUID + value: 2237ca39-cac0-4601-b658-8a3c5f406a4f + - name: JOB_UUID + value: 2237ca39-cac0-4601-b658-8a3c5f406a4f + - name: jobName + value: frac-1 + - name: JOB_NAME + value: frac-1 + - name: reporterGatewayURL + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 + - name: REPORTER_GATEWAY_URL + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 + - name: podUUID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + - name: POD_UUID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + - name: NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + envFrom: + - configMapRef: + name: frac-1-ns26p7c-runai-sh-gpu-0-evar + optional: false + image: gshaibi/gpu-burn + imagePullPolicy: IfNotPresent + name: frac-1 + resources: + requests: + cpu: 100m + memory: 100M + securityContext: + allowPrivilegeEscalation: false + capabilities: {} + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-dfphn + readOnly: true + - mountPath: /etc/ld.so.preload + name: frac-1-ns26p7c-runai-sh-gpu-0-vol + readOnly: true + subPath: ld.so.preload-key + - mountPath: /etc/runai.d/memory + name: frac-1-ns26p7c-runai-sh-gpu-0-vol + readOnly: true + subPath: config + - 
mountPath: /etc/runai.d/pod_uuid + name: frac-1-ns26p7c-runai-sh-gpu-0-vol + readOnly: true + subPath: pod-uuid + - mountPath: /runai/shared + name: runai-shared-directory + readOnly: true + - mountPath: /etc/runai.d/route + name: frac-1-ns26p7c-runai-sh-gpu-0-vol + readOnly: true + subPath: route + dnsPolicy: ClusterFirst + enableServiceLinks: true + nodeName: i-0b498db53280b86a6 + preemptionPolicy: PreemptLowerPriority + priority: 0 + restartPolicy: Never + schedulerName: runai-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: kube-api-access-dfphn + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace + - configMap: + defaultMode: 420 + name: frac-1-ns26p7c-runai-sh-gpu-0 + name: frac-1-ns26p7c-runai-sh-gpu-0-vol + - hostPath: + path: /var/lib/runai/shared + type: DirectoryOrCreate + name: runai-shared-directory +status: + conditions: + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:27Z" + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:51Z" + status: "True" + type: Ready + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:51Z" + status: "True" + type: ContainersReady + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:27Z" + status: "True" + type: PodScheduled + containerStatuses: + - containerID: containerd://4205608c75216bfe3d3a71ea7301f8bc041acba92673e033fc87be6d91867dc6 + image: docker.io/gshaibi/gpu-burn:latest + imageID: 
docker.io/gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 + lastState: {} + name: frac-1 + ready: true + restartCount: 0 + started: true + state: + running: + startedAt: "2024-03-31T09:03:51Z" + hostIP: 172.20.62.77 + phase: Running + podIP: 100.122.249.152 + podIPs: + - ip: 100.122.249.152 + qosClass: Burstable + startTime: "2024-03-31T09:03:27Z" diff --git a/design/samples/runai/2.17/fractional_pod_reservation.yaml b/design/samples/runai/2.17/fractional_pod_reservation.yaml new file mode 100644 index 0000000..708a102 --- /dev/null +++ b/design/samples/runai/2.17/fractional_pod_reservation.yaml @@ -0,0 +1,124 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + cni.projectcalico.org/containerID: 75affaf027829643896b3de5699d15fedb291f4f7efac6f00b0d0bbe9a2dd65a + cni.projectcalico.org/podIP: 100.122.249.151/32 + cni.projectcalico.org/podIPs: 100.122.249.151/32 + pod-group-name: pg-runai-reservation-gpu-i-0b498db53280b86a6-fzdhl-3b47e794-97f0-4824-b7d5-bb44c122039e + run.ai/reserve_for_gpu_index: GPU-8983c66a-23df-e63b-4c2f-afcae9ec79b3 + runai-job-id: 3b47e794-97f0-4824-b7d5-bb44c122039e + creationTimestamp: "2024-03-31T09:03:25Z" + labels: + app: runai-reservation + app.runai.resource.reservation: runai-reservation-gpu + runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b + name: runai-reservation-gpu-i-0b498db53280b86a6-fzdhl + namespace: runai-reservation + resourceVersion: "10625" + uid: 3b47e794-97f0-4824-b7d5-bb44c122039e +spec: + containers: + - env: + - name: POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + image: gcr.io/run-ai-prod/resource-reservation:v3.5.0 + imagePullPolicy: IfNotPresent + name: runai-reservation + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + 
volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-fnjgk + readOnly: true + dnsPolicy: ClusterFirst + enableServiceLinks: true + imagePullSecrets: + - name: runai-reg-creds + nodeName: i-0b498db53280b86a6 + preemptionPolicy: PreemptLowerPriority + priority: 0 + restartPolicy: Always + schedulerName: runai-scheduler + securityContext: {} + serviceAccount: runai-reservation-engine + serviceAccountName: runai-reservation-engine + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: kube-api-access-fnjgk + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace +status: + conditions: + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:25Z" + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:27Z" + status: "True" + type: Ready + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:27Z" + status: "True" + type: ContainersReady + - lastProbeTime: null + lastTransitionTime: "2024-03-31T09:03:25Z" + status: "True" + type: PodScheduled + containerStatuses: + - containerID: containerd://1063439dc8e82d20ef89a97ad9567d40d59d0d270ac5b8d4cab7f49a474e4398 + image: gcr.io/run-ai-prod/resource-reservation:v3.5.0 + imageID: gcr.io/run-ai-prod/resource-reservation@sha256:add1db641829508bbd1e74a7e757348159bc99b67844fc656acc1e795872d0a6 + lastState: {} + name: runai-reservation + ready: true + restartCount: 0 + started: true + state: + running: + startedAt: "2024-03-31T09:03:27Z" + hostIP: 172.20.62.77 + phase: Running 
+ podIP: 100.122.249.151 + podIPs: + - ip: 100.122.249.151 + qosClass: BestEffort + startTime: "2024-03-31T09:03:25Z" diff --git a/design/samples/runai/<2.9/fraction/metrics/1_workload.ini b/design/samples/runai/<2.9/fraction/metrics/1_workload.ini new file mode 100644 index 0000000..1b2c49c --- /dev/null +++ b/design/samples/runai/<2.9/fraction/metrics/1_workload.ini @@ -0,0 +1,76 @@ +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 585 +DCGM_FI_DEV_SM_CLOCK{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 300 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 5000 +DCGM_FI_DEV_MEM_CLOCK{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 405 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). 
+# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 55 +DCGM_FI_DEV_GPU_TEMP{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 52 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 28.722000 +DCGM_FI_DEV_POWER_USAGE{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 17.035000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 9022652 +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 8855497 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. 
+# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0 +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %). +# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0 +DCGM_FI_DEV_GPU_UTIL{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %). +# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %). 
+# TYPE DCGM_FI_DEV_ENC_UTIL gauge +DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0 +DCGM_FI_DEV_ENC_UTIL{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %). +# TYPE DCGM_FI_DEV_DEC_UTIL gauge +DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0 +DCGM_FI_DEV_DEC_UTIL{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0 +DCGM_FI_DEV_XID_ERRORS{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). 
+# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 14831 +DCGM_FI_DEV_FB_FREE{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 14955 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 128 +DCGM_FI_DEV_FB_USED{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 5 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. +# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). 
+# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0.000039 +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). +# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0.000000 +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). 
+# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 0.000002 +DCGM_FI_PROF_DRAM_ACTIVE{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 0.000001 +# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 79221 +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 10888 +# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
+# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-7a2e92f2-be71-d37e-a744-541600ffc722",device="nvidia0",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-qa-mig-worker-gpu-1-kvltw"} 420785 +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="1",UUID="GPU-5f587dd4-6505-0873-c6db-8324dbfe5ba9",device="nvidia1",modelName="Tesla T4",Hostname="nvidia-dcgm-exporter-s228n",DCGM_FI_DRIVER_VERSION="525.60.13",container="",namespace="",pod=""} 10888 \ No newline at end of file diff --git a/design/samples/runai/<2.9/fraction/node/1_workload.yaml b/design/samples/runai/<2.9/fraction/node/1_workload.yaml new file mode 100644 index 0000000..6fd2abf --- /dev/null +++ b/design/samples/runai/<2.9/fraction/node/1_workload.yaml @@ -0,0 +1,347 @@ +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. 
+# +apiVersion: v1 +kind: Node +metadata: + annotations: + flannel.alpha.coreos.com/backend-data: '{"VNI":1,"VtepMAC":"76:25:16:25:5b:52"}' + flannel.alpha.coreos.com/backend-type: vxlan + flannel.alpha.coreos.com/kube-subnet-manager: "true" + flannel.alpha.coreos.com/public-ip: 10.51.0.4 + kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock + nfd.node.kubernetes.io/extended-resources: "" + nfd.node.kubernetes.io/feature-labels: cpu-cpuid.AESNI,cpu-cpuid.AVX,cpu-cpuid.AVX2,cpu-cpuid.FMA3,cpu-cpuid.HYPERVISOR,cpu-cpuid.IBPB,cpu-cpuid.STIBP,cpu-hardware_multithreading,kernel-config.NO_HZ,kernel-config.NO_HZ_IDLE,kernel-version.full,kernel-version.major,kernel-version.minor,kernel-version.revision,nvidia.com/cuda.driver.major,nvidia.com/cuda.driver.minor,nvidia.com/cuda.driver.rev,nvidia.com/cuda.runtime.major,nvidia.com/cuda.runtime.minor,nvidia.com/gfd.timestamp,nvidia.com/gpu.compute.major,nvidia.com/gpu.compute.minor,nvidia.com/gpu.count,nvidia.com/gpu.family,nvidia.com/gpu.machine,nvidia.com/gpu.memory,nvidia.com/gpu.product,nvidia.com/gpu.replicas,nvidia.com/mig.capable,nvidia.com/mig.strategy,nvidia.com/run.ai-swap.enabled,pci-10de.present,pci-1af4.present,system-os_release.ID,system-os_release.VERSION_ID,system-os_release.VERSION_ID.major,system-os_release.VERSION_ID.minor + nfd.node.kubernetes.io/worker.version: v0.10.1 + node.alpha.kubernetes.io/ttl: "0" + volumes.kubernetes.io/controller-managed-attach-detach: "true" + creationTimestamp: "2022-12-26T14:13:03Z" + labels: + beta.kubernetes.io/arch: amd64 + beta.kubernetes.io/os: linux + feature.node.kubernetes.io/cpu-cpuid.AESNI: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX2: "true" + feature.node.kubernetes.io/cpu-cpuid.FMA3: "true" + feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR: "true" + feature.node.kubernetes.io/cpu-cpuid.IBPB: "true" + feature.node.kubernetes.io/cpu-cpuid.STIBP: "true" + 
feature.node.kubernetes.io/cpu-hardware_multithreading: "true" + feature.node.kubernetes.io/kernel-config.NO_HZ: "true" + feature.node.kubernetes.io/kernel-config.NO_HZ_IDLE: "true" + feature.node.kubernetes.io/kernel-version.full: 5.4.0-1042-gcp + feature.node.kubernetes.io/kernel-version.major: "5" + feature.node.kubernetes.io/kernel-version.minor: "4" + feature.node.kubernetes.io/kernel-version.revision: "0" + feature.node.kubernetes.io/pci-10de.present: "true" + feature.node.kubernetes.io/pci-1af4.present: "true" + feature.node.kubernetes.io/system-os_release.ID: ubuntu + feature.node.kubernetes.io/system-os_release.VERSION_ID: "20.04" + feature.node.kubernetes.io/system-os_release.VERSION_ID.major: "20" + feature.node.kubernetes.io/system-os_release.VERSION_ID.minor: "04" + kubernetes.io/arch: amd64 + kubernetes.io/hostname: qa-mig-worker-gpu-1 + kubernetes.io/os: linux + nvidia.com/cuda.driver.major: "525" + nvidia.com/cuda.driver.minor: "60" + nvidia.com/cuda.driver.rev: "13" + nvidia.com/cuda.runtime.major: "12" + nvidia.com/cuda.runtime.minor: "0" + nvidia.com/gfd.timestamp: "1674110256" + nvidia.com/gpu.compute.major: "7" + nvidia.com/gpu.compute.minor: "5" + nvidia.com/gpu.count: "2" + nvidia.com/gpu.deploy.container-toolkit: "true" + nvidia.com/gpu.deploy.dcgm: "true" + nvidia.com/gpu.deploy.dcgm-exporter: "true" + nvidia.com/gpu.deploy.device-plugin: "true" + nvidia.com/gpu.deploy.driver: "true" + nvidia.com/gpu.deploy.gpu-feature-discovery: "true" + nvidia.com/gpu.deploy.node-status-exporter: "true" + nvidia.com/gpu.deploy.nvsm: "" + nvidia.com/gpu.deploy.operator-validator: "true" + nvidia.com/gpu.family: turing + nvidia.com/gpu.machine: Google-Compute-Engine + nvidia.com/gpu.memory: "15360" + nvidia.com/gpu.present: "true" + nvidia.com/gpu.product: Tesla-T4 + nvidia.com/gpu.replicas: "1" + nvidia.com/mig.capable: "false" + nvidia.com/mig.strategy: mixed + nvidia.com/run.ai-swap.enabled: "false" + name: qa-mig-worker-gpu-1 + resourceVersion: 
"3084398" + uid: 018b97a5-7073-4701-8576-646aef94752d +spec: + podCIDR: 10.244.1.0/24 + podCIDRs: + - 10.244.1.0/24 +status: + addresses: + - address: 10.51.0.4 + type: InternalIP + - address: qa-mig-worker-gpu-1 + type: Hostname + allocatable: + cpu: "32" + ephemeral-storage: "93492209510" + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 123560520Ki + nvidia.com/gpu: "2" + pods: "110" + capacity: + cpu: "32" + ephemeral-storage: 101445540Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 123662920Ki + nvidia.com/gpu: "2" + pods: "110" + conditions: + - lastHeartbeatTime: "2023-01-19T06:32:28Z" + lastTransitionTime: "2023-01-19T06:32:28Z" + message: Flannel is running on this node + reason: FlannelIsUp + status: "False" + type: NetworkUnavailable + - lastHeartbeatTime: "2023-01-19T06:49:40Z" + lastTransitionTime: "2022-12-28T08:31:06Z" + message: kubelet has sufficient memory available + reason: KubeletHasSufficientMemory + status: "False" + type: MemoryPressure + - lastHeartbeatTime: "2023-01-19T06:49:40Z" + lastTransitionTime: "2022-12-28T08:31:06Z" + message: kubelet has no disk pressure + reason: KubeletHasNoDiskPressure + status: "False" + type: DiskPressure + - lastHeartbeatTime: "2023-01-19T06:49:40Z" + lastTransitionTime: "2022-12-28T08:31:06Z" + message: kubelet has sufficient PID available + reason: KubeletHasSufficientPID + status: "False" + type: PIDPressure + - lastHeartbeatTime: "2023-01-19T06:49:40Z" + lastTransitionTime: "2023-01-19T06:31:33Z" + message: kubelet is posting ready status. 
AppArmor enabled + reason: KubeletReady + status: "True" + type: Ready + daemonEndpoints: + kubeletEndpoint: + Port: 10250 + images: + - names: + - gcr.io/run-ai-demo/quickstart@sha256:7837847d3a186bb2daa03f1781542212cb7c66575ea8b0aaf3fd886f0043c405 + - gcr.io/run-ai-demo/quickstart:latest + sizeBytes: 12072476783 + - names: + - gcr.io/run-ai-demo/quickstart-hpo@sha256:e410172eea2db1d140092714db75e365e4eef574d5d22729ed239e197e7d4594 + - gcr.io/run-ai-demo/quickstart-hpo:latest + sizeBytes: 4038802056 + - names: + - runai/example-tf-keras-mnist-acgan@sha256:4395e5204646a9a24021e552e978d59ae305da54d8c873c4ebc0698d33bf3aa2 + - runai/example-tf-keras-mnist-acgan:latest + sizeBytes: 3652196155 + - names: + - runai/example-tf-keras-builtin@sha256:f2578dc65651fe8dbfcff45b952a85633ffeb4bb948f957f3dfa96f1265498b2 + - runai/example-tf-keras-builtin:latest + sizeBytes: 3459613901 + - names: + - runai/example-pytorch-builtin@sha256:378341b8de16eae018d0d1966bc6c6f7e47ac81c31727d7c6a7c518b8bf407d8 + - runai/example-pytorch-builtin:latest + sizeBytes: 3326431187 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:c1418a43dd54946ad0d08864c7da5591e067fd0e21791ab48a2dec8bdde14774 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.1.3-3.1.2-ubuntu20.04 + sizeBytes: 1989114969 + - names: + - gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 + - gshaibi/gpu-burn:latest + sizeBytes: 1619723452 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:952ec6be586dcff7c4b2936a20b7704c55b91be2b0ddb6d121ce72f5a833e804 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.0.4-3.0.0-ubuntu20.04 + sizeBytes: 1222518599 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:d62020d74399d3b7e99ba0e5f8b8715325ad966493218dba2b83c28e261ead48 + - gcr.io/run-ai-lab/mig-provisioner:galb + sizeBytes: 1188454365 + - names: + - gcr.io/run-ai-lab/runai-scheduler@sha256:258baff8c969b1950c70a142545ecb5d150daf5abd257c92959e0e18c4acc04e + - gcr.io/run-ai-lab/runai-scheduler:erez + sizeBytes: 
1183999183 + - names: + - nvcr.io/nvidia/driver@sha256:0ee0c585fa720f177734b3295a073f402d75986c1fe018ae68bd73fe9c21b8d8 + - nvcr.io/nvidia/driver:525.60.13-ubuntu20.04 + sizeBytes: 1111261102 + - names: + - gcr.io/run-ai-test-1/agent@sha256:9fad66ecc81185f67dce9bc5adc809b99438ebe3faba1a193de2fafda3cebc14 + - gcr.io/run-ai-test-1/agent:0.0.0-1004326.master-191381 + sizeBytes: 1087873401 + - names: + - gcr.io/run-ai-prod/agent@sha256:853a80f941ba666ab0f34a0790811ce9d78ce93562b7efdc535a72bcb485faf7 + - gcr.io/run-ai-prod/agent:2.8.12-rc.0 + sizeBytes: 1079477044 + - names: + - gcr.io/run-ai-prod/agent@sha256:36c305603419ffa30755eb8d250493b2d388bd2d12e3cbd969763e1e80ac6d18 + - gcr.io/run-ai-prod/agent:2.8.9-rc.1 + sizeBytes: 1079177946 + - names: + - gcr.io/run-ai-prod/agent@sha256:d21a6b1438597e82f85f392b37b5fe1ad63a1a46d0044462fb2d50ef7aec9323 + - gcr.io/run-ai-prod/agent:2.8.8 + sizeBytes: 1079071238 + - names: + - gcr.io/run-ai-prod/agent@sha256:446b3eafe54640bb2c81e35f1c1a3d286fd081cdd30fb4d908ffe6719528fbce + - gcr.io/run-ai-prod/agent:2.7.0 + sizeBytes: 1078148837 + - names: + - gcr.io/run-ai-prod/agent@sha256:9236f3ab5f37d1cc9365b9a5b3bc0e59602cb29bc366bd8e5143bf4c26e18974 + - gcr.io/run-ai-prod/agent:2.7.0-rc.13 + sizeBytes: 1078145358 + - names: + - gcr.io/run-ai-prod/agent@sha256:25bb759919e8c60f693788806d2a7d6f5d13f18300d7547bf4b8b8cafe16caa3 + - gcr.io/run-ai-prod/agent:2.7.1-rc.1 + sizeBytes: 1078139486 + - names: + - gcr.io/run-ai-prod/agent@sha256:f5bb90de8420f178ac3c3c0bd376fbf4d2608e23d626ce0555d1cd1a5deca28a + - gcr.io/run-ai-prod/agent:2.7.0-rc.8 + sizeBytes: 1078129526 + - names: + - gcr.io/run-ai-prod/agent@sha256:cf062c46dcbb829bc80f64b54f2c0e3bb3f43952af9ad1b510236444dd802f2f + - gcr.io/run-ai-prod/agent:v2.6.0-rc.5 + sizeBytes: 1077024287 + - names: + - gcr.io/run-ai-test-1/pod-grouper@sha256:25c27e0825504e580981706baca4ce5d82ae362a2aac0db7babbc15c910f16e8 + - gcr.io/run-ai-test-1/pod-grouper:0.0.0-1004326.master-191381 + sizeBytes: 1041696207 
+ - names: + - nvcr.io/nvidia/driver@sha256:c24371b1793eab4f2f035bc0584015de3a7be3031587d6cd948069c9127542f6 + - nvcr.io/nvidia/driver:515.65.01-ubuntu20.04 + sizeBytes: 1026833848 + - names: + - gcr.io/run-ai-test-1/nodepool-controller@sha256:ccd526e359bacfd83589ab711d8251c8a1d01e4d4dab95486bcc46684154e1ba + - gcr.io/run-ai-test-1/nodepool-controller:0.0.0-1004326.master-191381 + sizeBytes: 990822532 + - names: + - gcr.io/run-ai-prod/nodepool-controller@sha256:621a08552c775e34949f62b0f822f882af3b9e5d6d2e3a228531389f1167ea21 + - gcr.io/run-ai-prod/nodepool-controller:2.8.0-rc.4 + sizeBytes: 988355646 + - names: + - nvcr.io/nvidia/driver@sha256:d944da4ec30065b98c170f924c75fe1222e06998f5dff726a1867fa1c1f9b801 + - nvcr.io/nvidia/driver:510.47.03-ubuntu20.04 + sizeBytes: 932659896 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:7507d27b8d4736531504abf4f011c0b9586fbb7dd9436e26c3b5cd1e262369db + - nvcr.io/nvidia/k8s/dcgm-exporter:2.3.4-2.6.4-ubuntu20.04 + sizeBytes: 572817930 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:6240c5912aabed789c672f3179b4a65e45511d10fa8c41a5de0d91644a792b14 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.5.1 + sizeBytes: 562256065 + - names: + - bitnami/fluentd@sha256:11bb83687b44a9fb7a4f773e2ecf120e2b4523613a5544999723f0b9cd8fe2ed + - bitnami/fluentd:1.12.0-debian-10-r0 + sizeBytes: 512854333 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:18c9ea88ae06d479e6657b8a4126a8ee3f4300a40c16ddc29fb7ab3763d46005 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1 + sizeBytes: 478600391 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:e3f16c26b9340ed46aed248cc4d18353ba3a65886bf7a2f0cea25ff41b2553da + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.4.2 + sizeBytes: 459618644 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:9c17d3a907eb77eb8f7b4f3faf52d8352e4252af92003f828083f80d629bd2c3 + - nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8 + sizeBytes: 
444556116 + - names: + - calico/node@sha256:349c10be37e64a310d25869128d482b17bfeb4166bc80bd9a2ed095203a77ddb + - calico/node:v3.15.5 + sizeBytes: 437164545 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:6fe4200960b2b49d6dac1c91e596f61dacb6b3dcff878c84eb74c5136fedd5b6 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.0 + sizeBytes: 432799839 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:bec9f026d9b3d9404c78d6091817a359015c6a7aa411735b34138c1518853b5d + - nvcr.io/nvidia/gpu-feature-discovery:v0.7.0-ubi8 + sizeBytes: 415829527 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:5b16056257acc51b517d9cdb1da3218693cefc214af93789e6e214fd2b4cacf1 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.3.0 + sizeBytes: 413807613 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:d1c61116647bb9388eb3c4e31848dd6038458b7ba33c0eb3b659d96739eceb73 + - nvcr.io/nvidia/k8s-device-plugin:v0.12.3-ubi8 + sizeBytes: 408778270 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:37821ea7829def707f819ac21122cea62efdbbd640679b6004e552fb9a1e17a3 + - nvcr.io/nvidia/gpu-feature-discovery:v0.6.2-ubi8 + sizeBytes: 380437918 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:24d804e8f005d7aeca8343aa13e5f92295d8642a4c47cb24e3ac86a22543bc37 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.10.1 + sizeBytes: 368369724 + - names: + - gcr.io/run-ai-prod/researcher-service@sha256:7cc6c19900277272e4ebccec918e023cc15d0ba2d20ab51b9c65db33f9cf0587 + - gcr.io/run-ai-prod/researcher-service:2.8.8 + sizeBytes: 367572145 + - names: + - gcr.io/run-ai-prod/researcher-service@sha256:91d3fa61f9d3594455e5b6e491a780deb86cb707107e8c5f6998f663e4ab84a0 + - gcr.io/run-ai-prod/researcher-service:2.8.9-rc.1 + sizeBytes: 367571729 + - names: + - gcr.io/run-ai-prod/operator@sha256:073f85f69b7c3dcbb954b630501380f408c7862f4e85f7ae4b6a51f27b7759fc + - gcr.io/run-ai-prod/operator:2.5.5 + sizeBytes: 363172473 + - names: + - 
bitnami/fluentd@sha256:44309f5c8ccd5c65e18923ed6936aaeee8e930038177eb17bb712872cd18592b + - bitnami/fluentd:1.14.6-debian-11-r5 + sizeBytes: 354777953 + - names: + - gcr.io/run-ai-prod/researcher-service@sha256:de4f5ee6536699256e04d1897eb0bb36396132555236b6a7fb9bf55a8e0b54f9 + - gcr.io/run-ai-prod/researcher-service:2.7.1-rc.1 + sizeBytes: 350391480 + - names: + - gcr.io/run-ai-prod/researcher-service@sha256:8401b01cabe20b39edc358ea3b02322021f133b5d7795478dcf526617a8eccdf + - gcr.io/run-ai-prod/researcher-service:2.7.0 + sizeBytes: 350350686 + - names: + - gcr.io/run-ai-prod/researcher-service@sha256:495f7ba24b078072dee03e3d079730090a6a98d06bf7310f1ac3eb5f99a30147 + - gcr.io/run-ai-prod/researcher-service:2.7.0-rc.13 + sizeBytes: 350347074 + - names: + - gcr.io/run-ai-prod/researcher-service@sha256:d1a19f29248335500c6db22977324f4dcb70500d4eb8402fddb183bbd0ef5caa + - gcr.io/run-ai-prod/researcher-service:2.7.0-rc.8 + sizeBytes: 350347074 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:d7e0397249cd5099046506f32841535ea4f329f7b7583a6ddd9f75ff0f53385e + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.9.1 + sizeBytes: 346600785 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:aa95c16106280e36a5e32d2fe4c66e8b70f5a114860c6f4ed5b1a4085c63601b + - nvcr.io/nvidia/k8s-device-plugin:v0.11.0-ubi8 + sizeBytes: 316060543 + - names: + - grafana/grafana@sha256:8c65b333a3d369a095d752da2e784b484d4246659e9a4ac6a09b74353f95d1c9 + - grafana/grafana:9.3.1 + sizeBytes: 314580571 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:c2e3d0fe41a0d227dbb70caec03c780cc76317515e5ab3875f31d50c63f41c66 + - nvcr.io/nvidia/gpu-feature-discovery:v0.5.0 + sizeBytes: 308759280 + nodeInfo: + architecture: amd64 + bootID: 654bb2c4-19d4-4d7c-bada-7d07b3da1adb + containerRuntimeVersion: docker://20.10.13 + kernelVersion: 5.4.0-1042-gcp + kubeProxyVersion: v1.23.4 + kubeletVersion: v1.23.4 + machineID: f551d6896536dedbb04180ef1a399ef4 + operatingSystem: linux + 
osImage: Ubuntu 20.04.2 LTS + systemUUID: 5e13faf9-9279-cdbb-3620-5fe3a1ecafb1 diff --git a/design/samples/runai/<2.9/mig/metrics/4 Active Workloads.ini b/design/samples/runai/<2.9/mig/metrics/4 Active Workloads.ini new file mode 100644 index 0000000..565dc94 --- /dev/null +++ b/design/samples/runai/<2.9/mig/metrics/4 Active Workloads.ini @@ -0,0 +1,86 @@ +4 Active Workloads + +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). 
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 36 +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 36 +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 36 +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 36 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). 
+# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 35 +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 35 +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 35 +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 35 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). 
+# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 112.252000 +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 106.463000 +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 106.463000 +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 112.252000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). 
+# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 6890956 +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 6890956 +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 6890956 +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 6890956 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. 
+# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. 
+# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). 
+# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4857 +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 480 +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4857 +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 9843 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). 
+# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 6 +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4384 +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 6 +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 13 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. 
+# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). 
+# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.408493 +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). 
+# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000048 +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). 
+# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.250816 +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-473571f2-fb37-ef7e-50fe-9885b74a23cd",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-tl46n",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 \ No newline at end of file diff --git a/design/samples/runai/<2.9/mig/metrics/Active 1g.5gb.ini b/design/samples/runai/<2.9/mig/metrics/Active 1g.5gb.ini new file mode 100644 index 0000000..3871ca3 --- /dev/null +++ b/design/samples/runai/<2.9/mig/metrics/Active 1g.5gb.ini @@ -0,0 +1,44 @@ +Active 1g.5gb + +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 39 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). +# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 37 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 115.294000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 27151028 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. 
+# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). +# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 480 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4384 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. +# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). 
+# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.999892 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). +# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000118 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). +# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-km55s",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.622646 \ No newline at end of file diff --git a/design/samples/runai/<2.9/mig/metrics/Active 2*1g.5gb + 2g.10gb.ini b/design/samples/runai/<2.9/mig/metrics/Active 2*1g.5gb + 2g.10gb.ini new file mode 100644 index 0000000..660113c --- /dev/null +++ b/design/samples/runai/<2.9/mig/metrics/Active 2*1g.5gb + 2g.10gb.ini @@ -0,0 +1,72 @@ +Active 2*1g.5gb + 2g.10gb + +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). 
+# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 39 +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 39 +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 39 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). +# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 37 +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 37 +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 37 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). 
+# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 116.043000 +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 116.043000 +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 116.163000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 57585599 +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 57585599 +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 57585599 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe 
retries. +# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). 
+# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4766 +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 480 +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 9702 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 98 +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4384 +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 153 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. 
+# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). 
+# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000062 +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.999917 +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000036 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). 
+# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000118 +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). 
+# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="7",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000053 +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="9",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.647986 +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="2g.10gb",GPU_I_ID="5",Hostname="nvidia-dcgm-exporter-q28k4",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000001 \ No newline at end of file diff --git a/design/samples/runai/<2.9/mig/metrics/Empty mig disabled.ini b/design/samples/runai/<2.9/mig/metrics/Empty mig disabled.ini new file mode 100644 index 0000000..4200b2a --- /dev/null +++ b/design/samples/runai/<2.9/mig/metrics/Empty mig disabled.ini @@ -0,0 +1,145 @@ +Empty mig disabled + +-------------------------------------------------------------------------------- +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 210 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 28 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). +# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 29 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 45.999000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 424405577 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. 
+# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %). +# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %). +# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %). +# TYPE DCGM_FI_DEV_ENC_UTIL gauge +DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %). +# TYPE DCGM_FI_DEV_DEC_UTIL gauge +DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. 
+# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). +# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 40384 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. 
+# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Number of remapped rows for uncorrectable errors +# TYPE DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS counter +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Number of remapped rows for correctable errors +# TYPE DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS counter +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_ROW_REMAP_FAILURE Whether remapping of rows has failed +# TYPE DCGM_FI_DEV_ROW_REMAP_FAILURE gauge +DCGM_FI_DEV_ROW_REMAP_FAILURE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). +# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). 
+# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). +# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 16503 +# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 11775 + + +-------------------------------------------------------------------------------- + +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 210 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 28 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). +# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 29 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 45.999000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). 
+# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 424405577 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. +# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %). +# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %). +# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %). +# TYPE DCGM_FI_DEV_ENC_UTIL gauge +DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %). 
+# TYPE DCGM_FI_DEV_DEC_UTIL gauge +DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). +# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 40384 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. 
+# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Number of remapped rows for uncorrectable errors +# TYPE DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS counter +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Number of remapped rows for correctable errors +# TYPE DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS counter +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_ROW_REMAP_FAILURE Whether remapping of rows has failed +# TYPE DCGM_FI_DEV_ROW_REMAP_FAILURE gauge +DCGM_FI_DEV_ROW_REMAP_FAILURE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). +# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). 
+# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). +# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 16503 +# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-drqwq",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 11775 \ No newline at end of file diff --git a/design/samples/runai/<2.9/mig/metrics/Emtpy MIG enabled.ini b/design/samples/runai/<2.9/mig/metrics/Emtpy MIG enabled.ini new file mode 100644 index 0000000..650c144 --- /dev/null +++ b/design/samples/runai/<2.9/mig/metrics/Emtpy MIG enabled.ini @@ -0,0 +1,44 @@ +Emtpy MIG enabled + +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 30 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). +# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 32 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 75.960000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). 
+# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 8981778 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. +# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. +# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). +# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000033 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). 
+# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). +# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000005 +# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 563898 +# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
+# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",Hostname="nvidia-dcgm-exporter-wwvv9",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1161041 \ No newline at end of file diff --git a/design/samples/runai/<2.9/mig/metrics/Idle 1g.5gb allocated.ini b/design/samples/runai/<2.9/mig/metrics/Idle 1g.5gb allocated.ini new file mode 100644 index 0000000..264a8fa --- /dev/null +++ b/design/samples/runai/<2.9/mig/metrics/Idle 1g.5gb allocated.ini @@ -0,0 +1,44 @@ +Idle 1g.5gb allocated + +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). +# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1410 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 1215 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 30 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). 
+# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 32 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 75.690000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4617218 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. +# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. +# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). 
+# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 4857 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 6 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. +# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %). +# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %). 
+# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %). +# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-7099682b-a20d-2bcc-74ab-b9060454d81e",device="nvidia0",modelName="NVIDIA A100-SXM4-40GB",GPU_I_PROFILE="1g.5gb",GPU_I_ID="8",Hostname="nvidia-dcgm-exporter-m48cc",DCGM_FI_DRIVER_VERSION="520.56.06",container="",namespace="",pod=""} 0.000000 \ No newline at end of file diff --git a/design/samples/runai/<2.9/mig/node/3_instances.yaml b/design/samples/runai/<2.9/mig/node/3_instances.yaml new file mode 100644 index 0000000..06378e0 --- /dev/null +++ b/design/samples/runai/<2.9/mig/node/3_instances.yaml @@ -0,0 +1,395 @@ +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. 
+# +apiVersion: v1 +kind: Node +metadata: + annotations: + flannel.alpha.coreos.com/backend-data: '{"VNI":1,"VtepMAC":"7a:8b:3e:b1:ae:06"}' + flannel.alpha.coreos.com/backend-type: vxlan + flannel.alpha.coreos.com/kube-subnet-manager: "true" + flannel.alpha.coreos.com/public-ip: 10.51.0.6 + kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock + nfd.node.kubernetes.io/extended-resources: "" + nfd.node.kubernetes.io/feature-labels: cpu-cpuid.ADX,cpu-cpuid.AESNI,cpu-cpuid.AVX,cpu-cpuid.AVX2,cpu-cpuid.AVX512BW,cpu-cpuid.AVX512CD,cpu-cpuid.AVX512DQ,cpu-cpuid.AVX512F,cpu-cpuid.AVX512VL,cpu-cpuid.AVX512VNNI,cpu-cpuid.FMA3,cpu-cpuid.HLE,cpu-cpuid.HYPERVISOR,cpu-cpuid.IBPB,cpu-cpuid.MPX,cpu-cpuid.RTM,cpu-cpuid.STIBP,cpu-hardware_multithreading,kernel-config.NO_HZ,kernel-config.NO_HZ_IDLE,kernel-version.full,kernel-version.major,kernel-version.minor,kernel-version.revision,nvidia.com/cuda.driver.major,nvidia.com/cuda.driver.minor,nvidia.com/cuda.driver.rev,nvidia.com/cuda.runtime.major,nvidia.com/cuda.runtime.minor,nvidia.com/gfd.timestamp,nvidia.com/gpu.compute.major,nvidia.com/gpu.compute.minor,nvidia.com/gpu.count,nvidia.com/gpu.family,nvidia.com/gpu.machine,nvidia.com/gpu.memory,nvidia.com/gpu.product,nvidia.com/gpu.replicas,nvidia.com/mig-1g.5gb.count,nvidia.com/mig-1g.5gb.engines.copy,nvidia.com/mig-1g.5gb.engines.decoder,nvidia.com/mig-1g.5gb.engines.encoder,nvidia.com/mig-1g.5gb.engines.jpeg,nvidia.com/mig-1g.5gb.engines.ofa,nvidia.com/mig-1g.5gb.memory,nvidia.com/mig-1g.5gb.multiprocessors,nvidia.com/mig-1g.5gb.product,nvidia.com/mig-1g.5gb.replicas,nvidia.com/mig-1g.5gb.slices.ci,nvidia.com/mig-1g.5gb.slices.gi,nvidia.com/mig-2g.10gb.count,nvidia.com/mig-2g.10gb.engines.copy,nvidia.com/mig-2g.10gb.engines.decoder,nvidia.com/mig-2g.10gb.engines.encoder,nvidia.com/mig-2g.10gb.engines.jpeg,nvidia.com/mig-2g.10gb.engines.ofa,nvidia.com/mig-2g.10gb.memory,nvidia.com/mig-2g.10gb.multiprocessors,nvidia.com/mig-2g.10gb.product,nvidia.com/mig-2g.10gb.replicas,nvi
dia.com/mig-2g.10gb.slices.ci,nvidia.com/mig-2g.10gb.slices.gi,nvidia.com/mig.capable,nvidia.com/mig.strategy,nvidia.com/run.ai-swap.enabled,pci-10de.present,pci-1af4.present,system-os_release.ID,system-os_release.VERSION_ID,system-os_release.VERSION_ID.major,system-os_release.VERSION_ID.minor + nfd.node.kubernetes.io/worker.version: v0.10.1 + node.alpha.kubernetes.io/ttl: "0" + run.ai/mig-mapping: ewogICIwIjogewogICAgIjAiOiAiTUlHLTI5OGVmOGRhLWUzNWYtNTYyNC1hOTM1LTFlYjU1ZmM1OGQwZiIsCiAgICAiNCI6ICJNSUctZGI3ZDViNTMtMTYzMy01NjllLWE0ZWMtNDAxYzVkN2U0YTRkIiwKICAgICI2IjogIk1JRy1hNzA5NmU5Yi1jNTc3LTVhMTktYjUzMC1hNjliYzI2YWViZTUiCiAgfQp9 + run.ai/mig.config: |- + version: v1 + mig-configs: + selected: + - devices: [0] + mig-enabled: true + mig-devices: + 0: 2g.10gb + 4: 1g.5gb + 6: 1g.5gb + volumes.kubernetes.io/controller-managed-attach-detach: "true" + creationTimestamp: "2022-12-26T14:13:08Z" + labels: + beta.kubernetes.io/arch: amd64 + beta.kubernetes.io/os: linux + feature.node.kubernetes.io/cpu-cpuid.ADX: "true" + feature.node.kubernetes.io/cpu-cpuid.AESNI: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX2: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512BW: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512CD: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512DQ: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512F: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512VL: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512VNNI: "true" + feature.node.kubernetes.io/cpu-cpuid.FMA3: "true" + feature.node.kubernetes.io/cpu-cpuid.HLE: "true" + feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR: "true" + feature.node.kubernetes.io/cpu-cpuid.IBPB: "true" + feature.node.kubernetes.io/cpu-cpuid.MPX: "true" + feature.node.kubernetes.io/cpu-cpuid.RTM: "true" + feature.node.kubernetes.io/cpu-cpuid.STIBP: "true" + feature.node.kubernetes.io/cpu-hardware_multithreading: "true" + feature.node.kubernetes.io/kernel-config.NO_HZ: 
"true" + feature.node.kubernetes.io/kernel-config.NO_HZ_IDLE: "true" + feature.node.kubernetes.io/kernel-version.full: 5.15.0-1021-gcp + feature.node.kubernetes.io/kernel-version.major: "5" + feature.node.kubernetes.io/kernel-version.minor: "15" + feature.node.kubernetes.io/kernel-version.revision: "0" + feature.node.kubernetes.io/pci-10de.present: "true" + feature.node.kubernetes.io/pci-1af4.present: "true" + feature.node.kubernetes.io/system-os_release.ID: ubuntu + feature.node.kubernetes.io/system-os_release.VERSION_ID: "20.04" + feature.node.kubernetes.io/system-os_release.VERSION_ID.major: "20" + feature.node.kubernetes.io/system-os_release.VERSION_ID.minor: "04" + kubernetes.io/arch: amd64 + kubernetes.io/hostname: qa-mig-worker-gpu-a100 + kubernetes.io/os: linux + node-role.kubernetes.io/runai-dynamic-mig: "true" + node-role.kubernetes.io/runai-mig-enabled: "true" + nvidia.com/cuda.driver.major: "520" + nvidia.com/cuda.driver.minor: "56" + nvidia.com/cuda.driver.rev: "06" + nvidia.com/cuda.runtime.major: "11" + nvidia.com/cuda.runtime.minor: "8" + nvidia.com/gfd.timestamp: "1674042893" + nvidia.com/gpu.compute.major: "8" + nvidia.com/gpu.compute.minor: "0" + nvidia.com/gpu.count: "1" + nvidia.com/gpu.deploy.container-toolkit: "true" + nvidia.com/gpu.deploy.dcgm: "true" + nvidia.com/gpu.deploy.dcgm-exporter: "true" + nvidia.com/gpu.deploy.device-plugin: "true" + nvidia.com/gpu.deploy.driver: pre-installed + nvidia.com/gpu.deploy.gpu-feature-discovery: "true" + nvidia.com/gpu.deploy.mig-manager: "true" + nvidia.com/gpu.deploy.node-status-exporter: "true" + nvidia.com/gpu.deploy.operator-validator: "true" + nvidia.com/gpu.family: ampere + nvidia.com/gpu.machine: Google-Compute-Engine + nvidia.com/gpu.memory: "40960" + nvidia.com/gpu.present: "true" + nvidia.com/gpu.product: NVIDIA-A100-SXM4-40GB + nvidia.com/gpu.replicas: "0" + nvidia.com/mig-1g.5gb.count: "2" + nvidia.com/mig-1g.5gb.engines.copy: "1" + nvidia.com/mig-1g.5gb.engines.decoder: "0" + 
nvidia.com/mig-1g.5gb.engines.encoder: "0" + nvidia.com/mig-1g.5gb.engines.jpeg: "0" + nvidia.com/mig-1g.5gb.engines.ofa: "0" + nvidia.com/mig-1g.5gb.memory: "4864" + nvidia.com/mig-1g.5gb.multiprocessors: "14" + nvidia.com/mig-1g.5gb.product: NVIDIA-A100-SXM4-40GB-MIG-1g.5gb + nvidia.com/mig-1g.5gb.replicas: "1" + nvidia.com/mig-1g.5gb.slices.ci: "1" + nvidia.com/mig-1g.5gb.slices.gi: "1" + nvidia.com/mig-2g.10gb.count: "1" + nvidia.com/mig-2g.10gb.engines.copy: "2" + nvidia.com/mig-2g.10gb.engines.decoder: "1" + nvidia.com/mig-2g.10gb.engines.encoder: "0" + nvidia.com/mig-2g.10gb.engines.jpeg: "0" + nvidia.com/mig-2g.10gb.engines.ofa: "0" + nvidia.com/mig-2g.10gb.memory: "9856" + nvidia.com/mig-2g.10gb.multiprocessors: "28" + nvidia.com/mig-2g.10gb.product: NVIDIA-A100-SXM4-40GB-MIG-2g.10gb + nvidia.com/mig-2g.10gb.replicas: "1" + nvidia.com/mig-2g.10gb.slices.ci: "2" + nvidia.com/mig-2g.10gb.slices.gi: "2" + nvidia.com/mig.capable: "true" + nvidia.com/mig.config.state: success + nvidia.com/mig.strategy: mixed + nvidia.com/run.ai-swap.enabled: "false" + name: qa-mig-worker-gpu-a100 + resourceVersion: "2491211" + uid: 04dc2370-8f7d-43f8-a346-c8b5201fdd5a +spec: + podCIDR: 10.244.2.0/24 + podCIDRs: + - 10.244.2.0/24 +status: + addresses: + - address: 10.51.0.6 + type: InternalIP + - address: qa-mig-worker-gpu-a100 + type: Hostname + allocatable: + cpu: "12" + ephemeral-storage: "93478772582" + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 87425380Ki + nvidia.com/gpu: "0" + nvidia.com/mig-1g.5gb: "2" + nvidia.com/mig-2g.10gb: "1" + nvidia.com/mig-3g.20gb: "0" + pods: "110" + capacity: + cpu: "12" + ephemeral-storage: 101430960Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 87527780Ki + nvidia.com/gpu: "0" + nvidia.com/mig-1g.5gb: "2" + nvidia.com/mig-2g.10gb: "1" + nvidia.com/mig-3g.20gb: "0" + pods: "110" + conditions: + - lastHeartbeatTime: "2023-01-18T11:53:42Z" + lastTransitionTime: "2023-01-18T11:53:42Z" + message: Flannel is running on this node + 
reason: FlannelIsUp + status: "False" + type: NetworkUnavailable + - lastHeartbeatTime: "2023-01-18T14:56:13Z" + lastTransitionTime: "2023-01-18T11:52:57Z" + message: kubelet has sufficient memory available + reason: KubeletHasSufficientMemory + status: "False" + type: MemoryPressure + - lastHeartbeatTime: "2023-01-18T14:56:13Z" + lastTransitionTime: "2023-01-18T11:52:57Z" + message: kubelet has no disk pressure + reason: KubeletHasNoDiskPressure + status: "False" + type: DiskPressure + - lastHeartbeatTime: "2023-01-18T14:56:13Z" + lastTransitionTime: "2023-01-18T11:52:57Z" + message: kubelet has sufficient PID available + reason: KubeletHasSufficientPID + status: "False" + type: PIDPressure + - lastHeartbeatTime: "2023-01-18T14:56:13Z" + lastTransitionTime: "2023-01-18T11:53:07Z" + message: kubelet is posting ready status. AppArmor enabled + reason: KubeletReady + status: "True" + type: Ready + daemonEndpoints: + kubeletEndpoint: + Port: 10250 + images: + - names: + - gcr.io/run-ai-demo/quickstart@sha256:7837847d3a186bb2daa03f1781542212cb7c66575ea8b0aaf3fd886f0043c405 + - gcr.io/run-ai-demo/quickstart:latest + sizeBytes: 12072476783 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:c1418a43dd54946ad0d08864c7da5591e067fd0e21791ab48a2dec8bdde14774 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.1.3-3.1.2-ubuntu20.04 + sizeBytes: 1989114969 + - names: + - gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 + - gshaibi/gpu-burn:latest + sizeBytes: 1619723452 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:952ec6be586dcff7c4b2936a20b7704c55b91be2b0ddb6d121ce72f5a833e804 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.0.4-3.0.0-ubuntu20.04 + sizeBytes: 1222518599 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:9cfc874aeb83bb4f6c0b6a1a85f910a05d4d041be95e67e4501767f490b5149e + sizeBytes: 1188522487 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:2e6a769e727ce22c6393a7af9cfdb4d579d6ee6e4d332b10d029b64e4eabfea5 + sizeBytes: 
1188506111 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:3c0cb03a8e440cb02fb0b0e0e311c8353e8f7cdeeb464f3c7aaaffa096aaf5c5 + - gcr.io/run-ai-lab/mig-provisioner:galb1 + sizeBytes: 1188467321 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:423a9d4c654bf69c8b740cc74aa6c3586886775c9e47b12b3a072cee4b2b3d1c + - gcr.io/run-ai-lab/mig-parted:galb-debug2 + sizeBytes: 1147224275 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:76fe9435c418fbd60b636f114b6410eab08889d0516396f0d19cc8c8335f9473 + - gcr.io/run-ai-lab/mig-parted:galb-debug1 + sizeBytes: 1147224275 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:72929a39c30f8d0e67684ae9c019ed4f9603592aa361df5aaa4ae81ff7901262 + - gcr.io/run-ai-lab/mig-parted:galb-debug3 + sizeBytes: 1147201491 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:6b53498374496f3e6ee04bc3dcb402bd8d751645c8285acb6265c7dd0dd361af + - gcr.io/run-ai-lab/mig-parted:galb-debug4 + sizeBytes: 1140576119 + - names: + - gcr.io/run-ai-lab/agent@sha256:9b67cd4e2a720f5a9674046c4b7d2c1c5b5a0df0e5e4dcb956046d76080cec04 + - gcr.io/run-ai-lab/agent:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 1087796268 + - names: + - gcr.io/run-ai-test-1/agent@sha256:0dd22f998c8edd4412018bc77fc57cd6e066af5dd0efcbf64381203cb93bf26a + - gcr.io/run-ai-test-1/agent:0.0.0-1003867.master-655b6a + sizeBytes: 1087759929 + - names: + - gcr.io/run-ai-test-1/agent@sha256:55ad532e88542cf30803c4e28f17509e944fa912830cc368d096483e173ee6dd + - gcr.io/run-ai-test-1/agent:master-354d48 + sizeBytes: 1087526865 + - names: + - gcr.io/run-ai-prod/agent@sha256:0441dc0f08b60c3dd42676be1cd1056bc7ab8fb5c1a910e6d4fae2ded6afe449 + - gcr.io/run-ai-prod/agent:2.8.11-rc.0 + sizeBytes: 1079256002 + - names: + - gcr.io/run-ai-prod/agent@sha256:36c305603419ffa30755eb8d250493b2d388bd2d12e3cbd969763e1e80ac6d18 + - gcr.io/run-ai-prod/agent:2.8.9-rc.1 + sizeBytes: 1079177946 + - names: + - gcr.io/run-ai-prod/agent@sha256:d21a6b1438597e82f85f392b37b5fe1ad63a1a46d0044462fb2d50ef7aec9323 + - 
gcr.io/run-ai-prod/agent:2.8.8 + sizeBytes: 1079071238 + - names: + - gcr.io/run-ai-prod/agent@sha256:94a7563a637825df8de459390f8afce407e7072d5985d4297427472bd23c8a34 + - gcr.io/run-ai-prod/agent:2.7.14-rc.2 + sizeBytes: 1078449294 + - names: + - gcr.io/run-ai-prod/agent@sha256:35b8aff4f1228635dcc071f77ffad465e5f9c5658424a142a0ee20a886adc5dc + - gcr.io/run-ai-prod/agent:2.7.13 + sizeBytes: 1078418535 + - names: + - gcr.io/run-ai-prod/agent@sha256:2476a44f91ff04cfb85eab22e68072be96fbe82f3dff40da51e086caa1cc81ed + - gcr.io/run-ai-prod/agent:2.7.15-rc.0 + sizeBytes: 1078388027 + - names: + - gcr.io/run-ai-prod/agent@sha256:9236f3ab5f37d1cc9365b9a5b3bc0e59602cb29bc366bd8e5143bf4c26e18974 + - gcr.io/run-ai-prod/agent:2.7.0-rc.13 + sizeBytes: 1078145358 + - names: + - gcr.io/run-ai-lab/pod-grouper@sha256:25870b435c498aeae9d254eadcf94fa1197ebcd557b59a751547404e9c5990a5 + - gcr.io/run-ai-lab/pod-grouper:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 1041712203 + - names: + - gcr.io/run-ai-test-1/pod-grouper@sha256:896bec2645fd93f62c54bffd563cff57f90a7c46b0951a7644b93b769c45ddb5 + - gcr.io/run-ai-test-1/pod-grouper:0.0.0-1003867.master-655b6a + sizeBytes: 1041712203 + - names: + - gcr.io/run-ai-test-1/pod-grouper@sha256:00cf9d524ce5b2a0d27f6957288e3f37c90809b56a0bb28ddc9f29121aaa2ce1 + - gcr.io/run-ai-test-1/pod-grouper:master-354d48 + sizeBytes: 1041631818 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:ba4bdd418faf1e74c293627886f819e065a8ab7a42df2255a42b289518c7c9f6 + sizeBytes: 1027507788 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:871a685f05a7a7bfc0edbc00bb8fb31a76bb7695783c05b0581a0d93debc251f + sizeBytes: 1027398188 + - names: + - nvcr.io/nvidia/driver@sha256:c24371b1793eab4f2f035bc0584015de3a7be3031587d6cd948069c9127542f6 + - nvcr.io/nvidia/driver:515.65.01-ubuntu20.04 + sizeBytes: 1026833848 + - names: + - gcr.io/run-ai-lab/nodepool-controller@sha256:78020b9c166f6fc4bf23a9787f7d9b06637bf4d2002991e1e697d364260ae4fb + - 
gcr.io/run-ai-lab/nodepool-controller:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 990809924 + - names: + - gcr.io/run-ai-test-1/nodepool-controller@sha256:38248d3444eed59c66cd87a409ed0b366581324ae0649d4d7cf5cbe35315afc3 + - gcr.io/run-ai-test-1/nodepool-controller:0.0.0-1003867.master-655b6a + sizeBytes: 990808388 + - names: + - gcr.io/run-ai-test-1/nodepool-controller@sha256:3a70b8b1c56dcc2502ffa9ff2149be2d0f767069750234cbfc368357b93ca7ce + - gcr.io/run-ai-test-1/nodepool-controller:master-354d48 + sizeBytes: 990793619 + - names: + - nvcr.io/nvidia/driver@sha256:d944da4ec30065b98c170f924c75fe1222e06998f5dff726a1867fa1c1f9b801 + - nvcr.io/nvidia/driver:510.47.03-ubuntu20.04 + sizeBytes: 932659896 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:a076495d71c76dfc5718908fd39e29575dea5435e3776dbeceb350f5897799e2 + - gcr.io/run-ai-lab/mig-parted:785d641d6-devel + - gcr.io/run-ai-lab/mig-parted:dc76960b1-devel + sizeBytes: 894154074 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:933e78dde8ebea4679159e8d48f793ba4c1725a28dfacfa5f9c21cb4a02d2deb + sizeBytes: 775214730 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:7507d27b8d4736531504abf4f011c0b9586fbb7dd9436e26c3b5cd1e262369db + - nvcr.io/nvidia/k8s/dcgm-exporter:2.3.4-2.6.4-ubuntu20.04 + sizeBytes: 572817930 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:6240c5912aabed789c672f3179b4a65e45511d10fa8c41a5de0d91644a792b14 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.5.1 + sizeBytes: 562256065 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:3096538afb0203f8568639ed12fd8ad3fb8c16bbd9c4130e58791e7512f65799 + - nvcr.io/nvidia/k8s/dcgm-exporter:2.3.1-2.6.1-ubuntu20.04 + sizeBytes: 542091179 + - names: + - bitnami/fluentd@sha256:11bb83687b44a9fb7a4f773e2ecf120e2b4523613a5544999723f0b9cd8fe2ed + - bitnami/fluentd:1.12.0-debian-10-r0 + sizeBytes: 512854333 + - names: + - 
nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:18c9ea88ae06d479e6657b8a4126a8ee3f4300a40c16ddc29fb7ab3763d46005 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1 + sizeBytes: 478600391 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:e3f16c26b9340ed46aed248cc4d18353ba3a65886bf7a2f0cea25ff41b2553da + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.4.2 + sizeBytes: 459618644 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:9c17d3a907eb77eb8f7b4f3faf52d8352e4252af92003f828083f80d629bd2c3 + - nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8 + sizeBytes: 444556116 + - names: + - calico/node@sha256:349c10be37e64a310d25869128d482b17bfeb4166bc80bd9a2ed095203a77ddb + - calico/node:v3.15.5 + sizeBytes: 437164545 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:6fe4200960b2b49d6dac1c91e596f61dacb6b3dcff878c84eb74c5136fedd5b6 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.0 + sizeBytes: 432799839 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:bec9f026d9b3d9404c78d6091817a359015c6a7aa411735b34138c1518853b5d + - nvcr.io/nvidia/gpu-feature-discovery:v0.7.0-ubi8 + sizeBytes: 415829527 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:5b16056257acc51b517d9cdb1da3218693cefc214af93789e6e214fd2b4cacf1 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.3.0 + sizeBytes: 413807613 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:d1c61116647bb9388eb3c4e31848dd6038458b7ba33c0eb3b659d96739eceb73 + - nvcr.io/nvidia/k8s-device-plugin:v0.12.3-ubi8 + sizeBytes: 408778270 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:37821ea7829def707f819ac21122cea62efdbbd640679b6004e552fb9a1e17a3 + - nvcr.io/nvidia/gpu-feature-discovery:v0.6.2-ubi8 + sizeBytes: 380437918 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:24e6507b389d7b062889e31018880efb8b3a622365e0f059d7e168062e4b840a + sizeBytes: 373870396 + - names: + - 
gcr.io/run-ai-lab/mig-parted@sha256:3cfd802e9519a260d3f2ca2faa72607c0e10b67d611f7823301f37de02512136 + sizeBytes: 373870388 + - names: + - gcr.io/run-ai-test-1/researcher-service@sha256:f327f0d977558f7a971e3eb1a65b04d9ed21a98ccd396a505db544bb3e9768c3 + - gcr.io/run-ai-test-1/researcher-service:0.0.0-1004326.master-191381 + sizeBytes: 370938582 + - names: + - gcr.io/run-ai-lab/researcher-service@sha256:1f400688723cec93c50de3e9d69988244452133a89daf20e96d9c0197c649a20 + - gcr.io/run-ai-lab/researcher-service:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 370922618 + nodeInfo: + architecture: amd64 + bootID: e824c82c-283f-4604-9d70-911e060e69c4 + containerRuntimeVersion: docker://20.10.13 + kernelVersion: 5.15.0-1021-gcp + kubeProxyVersion: v1.23.4 + kubeletVersion: v1.23.4 + machineID: f551d6896536dedbb04180ef1a399ef4 + operatingSystem: linux + osImage: Ubuntu 20.04.2 LTS + systemUUID: 097f5539-c22c-72cd-6ef0-3b58e43fc6c2 diff --git a/design/samples/runai/<2.9/mig/node/4_instances.yaml b/design/samples/runai/<2.9/mig/node/4_instances.yaml new file mode 100644 index 0000000..b9be423 --- /dev/null +++ b/design/samples/runai/<2.9/mig/node/4_instances.yaml @@ -0,0 +1,396 @@ +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. 
+# +apiVersion: v1 +kind: Node +metadata: + annotations: + flannel.alpha.coreos.com/backend-data: '{"VNI":1,"VtepMAC":"b6:1c:2d:1d:f3:34"}' + flannel.alpha.coreos.com/backend-type: vxlan + flannel.alpha.coreos.com/kube-subnet-manager: "true" + flannel.alpha.coreos.com/public-ip: 10.51.0.6 + kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock + nfd.node.kubernetes.io/extended-resources: "" + nfd.node.kubernetes.io/feature-labels: cpu-cpuid.ADX,cpu-cpuid.AESNI,cpu-cpuid.AVX,cpu-cpuid.AVX2,cpu-cpuid.AVX512BW,cpu-cpuid.AVX512CD,cpu-cpuid.AVX512DQ,cpu-cpuid.AVX512F,cpu-cpuid.AVX512VL,cpu-cpuid.AVX512VNNI,cpu-cpuid.FMA3,cpu-cpuid.HLE,cpu-cpuid.HYPERVISOR,cpu-cpuid.IBPB,cpu-cpuid.MPX,cpu-cpuid.RTM,cpu-cpuid.STIBP,cpu-hardware_multithreading,kernel-config.NO_HZ,kernel-config.NO_HZ_IDLE,kernel-version.full,kernel-version.major,kernel-version.minor,kernel-version.revision,nvidia.com/cuda.driver.major,nvidia.com/cuda.driver.minor,nvidia.com/cuda.driver.rev,nvidia.com/cuda.runtime.major,nvidia.com/cuda.runtime.minor,nvidia.com/gfd.timestamp,nvidia.com/gpu.compute.major,nvidia.com/gpu.compute.minor,nvidia.com/gpu.count,nvidia.com/gpu.family,nvidia.com/gpu.machine,nvidia.com/gpu.memory,nvidia.com/gpu.product,nvidia.com/gpu.replicas,nvidia.com/mig-1g.5gb.count,nvidia.com/mig-1g.5gb.engines.copy,nvidia.com/mig-1g.5gb.engines.decoder,nvidia.com/mig-1g.5gb.engines.encoder,nvidia.com/mig-1g.5gb.engines.jpeg,nvidia.com/mig-1g.5gb.engines.ofa,nvidia.com/mig-1g.5gb.memory,nvidia.com/mig-1g.5gb.multiprocessors,nvidia.com/mig-1g.5gb.product,nvidia.com/mig-1g.5gb.replicas,nvidia.com/mig-1g.5gb.slices.ci,nvidia.com/mig-1g.5gb.slices.gi,nvidia.com/mig-2g.10gb.count,nvidia.com/mig-2g.10gb.engines.copy,nvidia.com/mig-2g.10gb.engines.decoder,nvidia.com/mig-2g.10gb.engines.encoder,nvidia.com/mig-2g.10gb.engines.jpeg,nvidia.com/mig-2g.10gb.engines.ofa,nvidia.com/mig-2g.10gb.memory,nvidia.com/mig-2g.10gb.multiprocessors,nvidia.com/mig-2g.10gb.product,nvidia.com/mig-2g.10gb.replicas,nvi
dia.com/mig-2g.10gb.slices.ci,nvidia.com/mig-2g.10gb.slices.gi,nvidia.com/mig.capable,nvidia.com/mig.strategy,nvidia.com/run.ai-swap.enabled,pci-10de.present,pci-1af4.present,system-os_release.ID,system-os_release.VERSION_ID,system-os_release.VERSION_ID.major,system-os_release.VERSION_ID.minor + nfd.node.kubernetes.io/worker.version: v0.10.1 + node.alpha.kubernetes.io/ttl: "0" + run.ai/mig-mapping: ewogICIwIjogewogICAgIjAiOiAiTUlHLTJhNTMyNTNmLTgyYWQtNTY2ZS05MzJmLTE2NDViZWM3MmNjYiIsCiAgICAiNCI6ICJNSUctMTBhMDUwYTEtNGIyNC01YzZlLWEwNmEtNWQ5YmViM2IzOTg4IiwKICAgICI1IjogIk1JRy1iMTI1OGU3ZS02OTM1LTU2ZTQtYjlkZi05OTBlYzA2YmE5MDYiLAogICAgIjYiOiAiTUlHLTI4ODEwZDQ2LTAxODAtNTEzOS1hOTc1LTAyMGJkYzdmOWNiMSIKICB9Cn0= + run.ai/mig.config: |- + version: v1 + mig-configs: + selected: + - devices: [0] + mig-enabled: true + mig-devices: + 0: 2g.10gb + 4: 1g.5gb + 5: 1g.5gb + 6: 1g.5gb + volumes.kubernetes.io/controller-managed-attach-detach: "true" + creationTimestamp: "2022-12-26T14:13:08Z" + labels: + beta.kubernetes.io/arch: amd64 + beta.kubernetes.io/os: linux + feature.node.kubernetes.io/cpu-cpuid.ADX: "true" + feature.node.kubernetes.io/cpu-cpuid.AESNI: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX2: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512BW: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512CD: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512DQ: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512F: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512VL: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512VNNI: "true" + feature.node.kubernetes.io/cpu-cpuid.FMA3: "true" + feature.node.kubernetes.io/cpu-cpuid.HLE: "true" + feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR: "true" + feature.node.kubernetes.io/cpu-cpuid.IBPB: "true" + feature.node.kubernetes.io/cpu-cpuid.MPX: "true" + feature.node.kubernetes.io/cpu-cpuid.RTM: "true" + feature.node.kubernetes.io/cpu-cpuid.STIBP: "true" + 
feature.node.kubernetes.io/cpu-hardware_multithreading: "true" + feature.node.kubernetes.io/kernel-config.NO_HZ: "true" + feature.node.kubernetes.io/kernel-config.NO_HZ_IDLE: "true" + feature.node.kubernetes.io/kernel-version.full: 5.15.0-1021-gcp + feature.node.kubernetes.io/kernel-version.major: "5" + feature.node.kubernetes.io/kernel-version.minor: "15" + feature.node.kubernetes.io/kernel-version.revision: "0" + feature.node.kubernetes.io/pci-10de.present: "true" + feature.node.kubernetes.io/pci-1af4.present: "true" + feature.node.kubernetes.io/system-os_release.ID: ubuntu + feature.node.kubernetes.io/system-os_release.VERSION_ID: "20.04" + feature.node.kubernetes.io/system-os_release.VERSION_ID.major: "20" + feature.node.kubernetes.io/system-os_release.VERSION_ID.minor: "04" + kubernetes.io/arch: amd64 + kubernetes.io/hostname: qa-mig-worker-gpu-a100 + kubernetes.io/os: linux + node-role.kubernetes.io/runai-dynamic-mig: "true" + node-role.kubernetes.io/runai-mig-enabled: "true" + nvidia.com/cuda.driver.major: "520" + nvidia.com/cuda.driver.minor: "56" + nvidia.com/cuda.driver.rev: "06" + nvidia.com/cuda.runtime.major: "11" + nvidia.com/cuda.runtime.minor: "8" + nvidia.com/gfd.timestamp: "1674110138" + nvidia.com/gpu.compute.major: "8" + nvidia.com/gpu.compute.minor: "0" + nvidia.com/gpu.count: "1" + nvidia.com/gpu.deploy.container-toolkit: "true" + nvidia.com/gpu.deploy.dcgm: "true" + nvidia.com/gpu.deploy.dcgm-exporter: "true" + nvidia.com/gpu.deploy.device-plugin: "true" + nvidia.com/gpu.deploy.driver: pre-installed + nvidia.com/gpu.deploy.gpu-feature-discovery: "true" + nvidia.com/gpu.deploy.mig-manager: "true" + nvidia.com/gpu.deploy.node-status-exporter: "true" + nvidia.com/gpu.deploy.operator-validator: "true" + nvidia.com/gpu.family: ampere + nvidia.com/gpu.machine: Google-Compute-Engine + nvidia.com/gpu.memory: "40960" + nvidia.com/gpu.present: "true" + nvidia.com/gpu.product: NVIDIA-A100-SXM4-40GB + nvidia.com/gpu.replicas: "0" + 
nvidia.com/mig-1g.5gb.count: "3" + nvidia.com/mig-1g.5gb.engines.copy: "1" + nvidia.com/mig-1g.5gb.engines.decoder: "0" + nvidia.com/mig-1g.5gb.engines.encoder: "0" + nvidia.com/mig-1g.5gb.engines.jpeg: "0" + nvidia.com/mig-1g.5gb.engines.ofa: "0" + nvidia.com/mig-1g.5gb.memory: "4864" + nvidia.com/mig-1g.5gb.multiprocessors: "14" + nvidia.com/mig-1g.5gb.product: NVIDIA-A100-SXM4-40GB-MIG-1g.5gb + nvidia.com/mig-1g.5gb.replicas: "1" + nvidia.com/mig-1g.5gb.slices.ci: "1" + nvidia.com/mig-1g.5gb.slices.gi: "1" + nvidia.com/mig-2g.10gb.count: "1" + nvidia.com/mig-2g.10gb.engines.copy: "2" + nvidia.com/mig-2g.10gb.engines.decoder: "1" + nvidia.com/mig-2g.10gb.engines.encoder: "0" + nvidia.com/mig-2g.10gb.engines.jpeg: "0" + nvidia.com/mig-2g.10gb.engines.ofa: "0" + nvidia.com/mig-2g.10gb.memory: "9856" + nvidia.com/mig-2g.10gb.multiprocessors: "28" + nvidia.com/mig-2g.10gb.product: NVIDIA-A100-SXM4-40GB-MIG-2g.10gb + nvidia.com/mig-2g.10gb.replicas: "1" + nvidia.com/mig-2g.10gb.slices.ci: "2" + nvidia.com/mig-2g.10gb.slices.gi: "2" + nvidia.com/mig.capable: "true" + nvidia.com/mig.config.state: success + nvidia.com/mig.strategy: mixed + nvidia.com/run.ai-swap.enabled: "false" + name: qa-mig-worker-gpu-a100 + resourceVersion: "3060592" + uid: 04dc2370-8f7d-43f8-a346-c8b5201fdd5a +spec: + podCIDR: 10.244.2.0/24 + podCIDRs: + - 10.244.2.0/24 +status: + addresses: + - address: 10.51.0.6 + type: InternalIP + - address: qa-mig-worker-gpu-a100 + type: Hostname + allocatable: + cpu: "12" + ephemeral-storage: "93478772582" + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 87425380Ki + nvidia.com/gpu: "0" + nvidia.com/mig-1g.5gb: "3" + nvidia.com/mig-2g.10gb: "1" + nvidia.com/mig-3g.20gb: "0" + pods: "110" + capacity: + cpu: "12" + ephemeral-storage: 101430960Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 87527780Ki + nvidia.com/gpu: "0" + nvidia.com/mig-1g.5gb: "3" + nvidia.com/mig-2g.10gb: "1" + nvidia.com/mig-3g.20gb: "1" + pods: "110" + conditions: + - 
lastHeartbeatTime: "2023-01-19T06:34:21Z" + lastTransitionTime: "2023-01-19T06:34:21Z" + message: Flannel is running on this node + reason: FlannelIsUp + status: "False" + type: NetworkUnavailable + - lastHeartbeatTime: "2023-01-19T06:39:30Z" + lastTransitionTime: "2023-01-19T06:33:41Z" + message: kubelet has sufficient memory available + reason: KubeletHasSufficientMemory + status: "False" + type: MemoryPressure + - lastHeartbeatTime: "2023-01-19T06:39:30Z" + lastTransitionTime: "2023-01-19T06:33:41Z" + message: kubelet has no disk pressure + reason: KubeletHasNoDiskPressure + status: "False" + type: DiskPressure + - lastHeartbeatTime: "2023-01-19T06:39:30Z" + lastTransitionTime: "2023-01-19T06:33:41Z" + message: kubelet has sufficient PID available + reason: KubeletHasSufficientPID + status: "False" + type: PIDPressure + - lastHeartbeatTime: "2023-01-19T06:39:30Z" + lastTransitionTime: "2023-01-19T06:33:52Z" + message: kubelet is posting ready status. AppArmor enabled + reason: KubeletReady + status: "True" + type: Ready + daemonEndpoints: + kubeletEndpoint: + Port: 10250 + images: + - names: + - gcr.io/run-ai-demo/quickstart@sha256:7837847d3a186bb2daa03f1781542212cb7c66575ea8b0aaf3fd886f0043c405 + - gcr.io/run-ai-demo/quickstart:latest + sizeBytes: 12072476783 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:c1418a43dd54946ad0d08864c7da5591e067fd0e21791ab48a2dec8bdde14774 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.1.3-3.1.2-ubuntu20.04 + sizeBytes: 1989114969 + - names: + - gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 + - gshaibi/gpu-burn:latest + sizeBytes: 1619723452 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:952ec6be586dcff7c4b2936a20b7704c55b91be2b0ddb6d121ce72f5a833e804 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.0.4-3.0.0-ubuntu20.04 + sizeBytes: 1222518599 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:9cfc874aeb83bb4f6c0b6a1a85f910a05d4d041be95e67e4501767f490b5149e + sizeBytes: 1188522487 + - 
names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:2e6a769e727ce22c6393a7af9cfdb4d579d6ee6e4d332b10d029b64e4eabfea5 + sizeBytes: 1188506111 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:3c0cb03a8e440cb02fb0b0e0e311c8353e8f7cdeeb464f3c7aaaffa096aaf5c5 + - gcr.io/run-ai-lab/mig-provisioner:galb1 + sizeBytes: 1188467321 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:423a9d4c654bf69c8b740cc74aa6c3586886775c9e47b12b3a072cee4b2b3d1c + - gcr.io/run-ai-lab/mig-parted:galb-debug2 + sizeBytes: 1147224275 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:76fe9435c418fbd60b636f114b6410eab08889d0516396f0d19cc8c8335f9473 + - gcr.io/run-ai-lab/mig-parted:galb-debug1 + sizeBytes: 1147224275 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:72929a39c30f8d0e67684ae9c019ed4f9603592aa361df5aaa4ae81ff7901262 + - gcr.io/run-ai-lab/mig-parted:galb-debug3 + sizeBytes: 1147201491 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:6b53498374496f3e6ee04bc3dcb402bd8d751645c8285acb6265c7dd0dd361af + - gcr.io/run-ai-lab/mig-parted:galb-debug4 + sizeBytes: 1140576119 + - names: + - gcr.io/run-ai-lab/agent@sha256:9b67cd4e2a720f5a9674046c4b7d2c1c5b5a0df0e5e4dcb956046d76080cec04 + - gcr.io/run-ai-lab/agent:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 1087796268 + - names: + - gcr.io/run-ai-test-1/agent@sha256:0dd22f998c8edd4412018bc77fc57cd6e066af5dd0efcbf64381203cb93bf26a + - gcr.io/run-ai-test-1/agent:0.0.0-1003867.master-655b6a + sizeBytes: 1087759929 + - names: + - gcr.io/run-ai-test-1/agent@sha256:55ad532e88542cf30803c4e28f17509e944fa912830cc368d096483e173ee6dd + - gcr.io/run-ai-test-1/agent:master-354d48 + sizeBytes: 1087526865 + - names: + - gcr.io/run-ai-prod/agent@sha256:0441dc0f08b60c3dd42676be1cd1056bc7ab8fb5c1a910e6d4fae2ded6afe449 + - gcr.io/run-ai-prod/agent:2.8.11-rc.0 + sizeBytes: 1079256002 + - names: + - gcr.io/run-ai-prod/agent@sha256:36c305603419ffa30755eb8d250493b2d388bd2d12e3cbd969763e1e80ac6d18 + - gcr.io/run-ai-prod/agent:2.8.9-rc.1 + 
sizeBytes: 1079177946 + - names: + - gcr.io/run-ai-prod/agent@sha256:d21a6b1438597e82f85f392b37b5fe1ad63a1a46d0044462fb2d50ef7aec9323 + - gcr.io/run-ai-prod/agent:2.8.8 + sizeBytes: 1079071238 + - names: + - gcr.io/run-ai-prod/agent@sha256:94a7563a637825df8de459390f8afce407e7072d5985d4297427472bd23c8a34 + - gcr.io/run-ai-prod/agent:2.7.14-rc.2 + sizeBytes: 1078449294 + - names: + - gcr.io/run-ai-prod/agent@sha256:35b8aff4f1228635dcc071f77ffad465e5f9c5658424a142a0ee20a886adc5dc + - gcr.io/run-ai-prod/agent:2.7.13 + sizeBytes: 1078418535 + - names: + - gcr.io/run-ai-prod/agent@sha256:2476a44f91ff04cfb85eab22e68072be96fbe82f3dff40da51e086caa1cc81ed + - gcr.io/run-ai-prod/agent:2.7.15-rc.0 + sizeBytes: 1078388027 + - names: + - gcr.io/run-ai-prod/agent@sha256:9236f3ab5f37d1cc9365b9a5b3bc0e59602cb29bc366bd8e5143bf4c26e18974 + - gcr.io/run-ai-prod/agent:2.7.0-rc.13 + sizeBytes: 1078145358 + - names: + - gcr.io/run-ai-lab/pod-grouper@sha256:25870b435c498aeae9d254eadcf94fa1197ebcd557b59a751547404e9c5990a5 + - gcr.io/run-ai-lab/pod-grouper:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 1041712203 + - names: + - gcr.io/run-ai-test-1/pod-grouper@sha256:896bec2645fd93f62c54bffd563cff57f90a7c46b0951a7644b93b769c45ddb5 + - gcr.io/run-ai-test-1/pod-grouper:0.0.0-1003867.master-655b6a + sizeBytes: 1041712203 + - names: + - gcr.io/run-ai-test-1/pod-grouper@sha256:00cf9d524ce5b2a0d27f6957288e3f37c90809b56a0bb28ddc9f29121aaa2ce1 + - gcr.io/run-ai-test-1/pod-grouper:master-354d48 + sizeBytes: 1041631818 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:ba4bdd418faf1e74c293627886f819e065a8ab7a42df2255a42b289518c7c9f6 + sizeBytes: 1027507788 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:871a685f05a7a7bfc0edbc00bb8fb31a76bb7695783c05b0581a0d93debc251f + sizeBytes: 1027398188 + - names: + - nvcr.io/nvidia/driver@sha256:c24371b1793eab4f2f035bc0584015de3a7be3031587d6cd948069c9127542f6 + - nvcr.io/nvidia/driver:515.65.01-ubuntu20.04 + sizeBytes: 1026833848 + - 
names: + - gcr.io/run-ai-lab/nodepool-controller@sha256:78020b9c166f6fc4bf23a9787f7d9b06637bf4d2002991e1e697d364260ae4fb + - gcr.io/run-ai-lab/nodepool-controller:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 990809924 + - names: + - gcr.io/run-ai-test-1/nodepool-controller@sha256:38248d3444eed59c66cd87a409ed0b366581324ae0649d4d7cf5cbe35315afc3 + - gcr.io/run-ai-test-1/nodepool-controller:0.0.0-1003867.master-655b6a + sizeBytes: 990808388 + - names: + - gcr.io/run-ai-test-1/nodepool-controller@sha256:3a70b8b1c56dcc2502ffa9ff2149be2d0f767069750234cbfc368357b93ca7ce + - gcr.io/run-ai-test-1/nodepool-controller:master-354d48 + sizeBytes: 990793619 + - names: + - nvcr.io/nvidia/driver@sha256:d944da4ec30065b98c170f924c75fe1222e06998f5dff726a1867fa1c1f9b801 + - nvcr.io/nvidia/driver:510.47.03-ubuntu20.04 + sizeBytes: 932659896 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:a076495d71c76dfc5718908fd39e29575dea5435e3776dbeceb350f5897799e2 + - gcr.io/run-ai-lab/mig-parted:785d641d6-devel + - gcr.io/run-ai-lab/mig-parted:dc76960b1-devel + sizeBytes: 894154074 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:933e78dde8ebea4679159e8d48f793ba4c1725a28dfacfa5f9c21cb4a02d2deb + sizeBytes: 775214730 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:7507d27b8d4736531504abf4f011c0b9586fbb7dd9436e26c3b5cd1e262369db + - nvcr.io/nvidia/k8s/dcgm-exporter:2.3.4-2.6.4-ubuntu20.04 + sizeBytes: 572817930 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:6240c5912aabed789c672f3179b4a65e45511d10fa8c41a5de0d91644a792b14 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.5.1 + sizeBytes: 562256065 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:3096538afb0203f8568639ed12fd8ad3fb8c16bbd9c4130e58791e7512f65799 + - nvcr.io/nvidia/k8s/dcgm-exporter:2.3.1-2.6.1-ubuntu20.04 + sizeBytes: 542091179 + - names: + - bitnami/fluentd@sha256:11bb83687b44a9fb7a4f773e2ecf120e2b4523613a5544999723f0b9cd8fe2ed + - bitnami/fluentd:1.12.0-debian-10-r0 + 
sizeBytes: 512854333 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:18c9ea88ae06d479e6657b8a4126a8ee3f4300a40c16ddc29fb7ab3763d46005 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1 + sizeBytes: 478600391 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:e3f16c26b9340ed46aed248cc4d18353ba3a65886bf7a2f0cea25ff41b2553da + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.4.2 + sizeBytes: 459618644 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:9c17d3a907eb77eb8f7b4f3faf52d8352e4252af92003f828083f80d629bd2c3 + - nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8 + sizeBytes: 444556116 + - names: + - calico/node@sha256:349c10be37e64a310d25869128d482b17bfeb4166bc80bd9a2ed095203a77ddb + - calico/node:v3.15.5 + sizeBytes: 437164545 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:6fe4200960b2b49d6dac1c91e596f61dacb6b3dcff878c84eb74c5136fedd5b6 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.0 + sizeBytes: 432799839 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:bec9f026d9b3d9404c78d6091817a359015c6a7aa411735b34138c1518853b5d + - nvcr.io/nvidia/gpu-feature-discovery:v0.7.0-ubi8 + sizeBytes: 415829527 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:5b16056257acc51b517d9cdb1da3218693cefc214af93789e6e214fd2b4cacf1 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.3.0 + sizeBytes: 413807613 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:d1c61116647bb9388eb3c4e31848dd6038458b7ba33c0eb3b659d96739eceb73 + - nvcr.io/nvidia/k8s-device-plugin:v0.12.3-ubi8 + sizeBytes: 408778270 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:37821ea7829def707f819ac21122cea62efdbbd640679b6004e552fb9a1e17a3 + - nvcr.io/nvidia/gpu-feature-discovery:v0.6.2-ubi8 + sizeBytes: 380437918 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:24e6507b389d7b062889e31018880efb8b3a622365e0f059d7e168062e4b840a + sizeBytes: 373870396 + - names: + - 
gcr.io/run-ai-lab/mig-parted@sha256:3cfd802e9519a260d3f2ca2faa72607c0e10b67d611f7823301f37de02512136 + sizeBytes: 373870388 + - names: + - gcr.io/run-ai-test-1/researcher-service@sha256:f327f0d977558f7a971e3eb1a65b04d9ed21a98ccd396a505db544bb3e9768c3 + - gcr.io/run-ai-test-1/researcher-service:0.0.0-1004326.master-191381 + sizeBytes: 370938582 + - names: + - gcr.io/run-ai-lab/researcher-service@sha256:1f400688723cec93c50de3e9d69988244452133a89daf20e96d9c0197c649a20 + - gcr.io/run-ai-lab/researcher-service:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 370922618 + nodeInfo: + architecture: amd64 + bootID: 21138297-a932-4324-989f-b0cd8598f612 + containerRuntimeVersion: docker://20.10.13 + kernelVersion: 5.15.0-1021-gcp + kubeProxyVersion: v1.23.4 + kubeletVersion: v1.23.4 + machineID: f551d6896536dedbb04180ef1a399ef4 + operatingSystem: linux + osImage: Ubuntu 20.04.2 LTS + systemUUID: 097f5539-c22c-72cd-6ef0-3b58e43fc6c2 diff --git a/design/samples/runai/<2.9/mig/pod/1g-5gb.yaml b/design/samples/runai/<2.9/mig/pod/1g-5gb.yaml new file mode 100644 index 0000000..993a970 --- /dev/null +++ b/design/samples/runai/<2.9/mig/pod/1g-5gb.yaml @@ -0,0 +1,174 @@ +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. 
+# +apiVersion: v1 +kind: Pod +metadata: + annotations: + nvidia.com/mig-1g.5gb: "1" + pod-group-name: pg-job-2-0-a369bc70-fa23-4181-a163-c749f2e49ba1 + received-resource-type: MigInstance + runai-allocated-gpu-memory: "0" + runai-allocated-gpus: "0" + runai-allocated-mig-gpus: "0.14285714285714285" + runai-calculated-status: Running + runai-cli-command: runai submit -i gshaibi/gpu-burn --mig-profile 1g.5gb --cpu + 0.5 + runai-job-id: a369bc70-fa23-4181-a163-c749f2e49ba1 + runai-mig-device: '{"name":"nvidia.com/mig-1g.5gb","position":6,"gpuamount":1,"instanceid":"MIG-28810d46-0180-5139-a975-020bdc7f9cb1","gpuindex":0,"gputype":"A100-40GB"}' + user: guyshaibi + creationTimestamp: "2023-01-19T07:56:47Z" + generateName: job-2- + labels: + app: runaijob + controller-uid: a369bc70-fa23-4181-a163-c749f2e49ba1 + createdBy: RunaiJob + project: team-a + release: job-2 + runai-pod-job-mutated: "true" + runai/pod-index: 0-0 + runai/queue: team-a + name: job-2-0-0 + namespace: runai-team-a + ownerReferences: + - apiVersion: run.ai/v1 + blockOwnerDeletion: true + controller: true + kind: RunaiJob + name: job-2 + uid: a369bc70-fa23-4181-a163-c749f2e49ba1 + resourceVersion: "3247064" + uid: cb9aaa99-97df-4c10-bfa7-c1ed024c672b +spec: + containers: + - env: + - name: reporterGatewayURL + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 + - name: REPORTER_GATEWAY_URL + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 + - name: podUUID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + - name: POD_UUID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + - name: NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: POD_INDEX + value: "0" + - name: jobUUID + value: a369bc70-fa23-4181-a163-c749f2e49ba1 + - name: JOB_UUID + value: a369bc70-fa23-4181-a163-c749f2e49ba1 + - name: jobName + value: job-2 + - name: JOB_NAME + value: job-2 + - name: NVIDIA_VISIBLE_DEVICES + valueFrom: + 
configMapKeyRef: + key: RUNAI-VISIBLE-DEVICES + name: job-2-7zvpqdb-runai-sh-gpu + - name: RUNAI_NUM_OF_GPUS + valueFrom: + configMapKeyRef: + key: RUNAI_NUM_OF_GPUS + name: job-2-7zvpqdb-runai-sh-gpu + image: gshaibi/gpu-burn + imagePullPolicy: Always + name: job-2 + resources: + requests: + cpu: 500m + securityContext: + allowPrivilegeEscalation: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-d4m7c + readOnly: true + dnsPolicy: ClusterFirst + enableServiceLinks: true + nodeName: qa-mig-worker-gpu-a100 + preemptionPolicy: PreemptLowerPriority + priority: 0 + restartPolicy: Never + schedulerName: runai-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: kube-api-access-d4m7c + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace +status: + conditions: + - lastProbeTime: null + lastTransitionTime: "2023-01-19T07:56:50Z" + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: "2023-01-19T07:56:57Z" + status: "True" + type: Ready + - lastProbeTime: null + lastTransitionTime: "2023-01-19T07:56:57Z" + status: "True" + type: ContainersReady + - lastProbeTime: null + lastTransitionTime: "2023-01-19T07:56:49Z" + status: "True" + type: PodScheduled + containerStatuses: + - containerID: docker://3c3ceaec786d1c7da5950dc026a1be636244998bb007664c6aa874e457c11e5a + image: 
gshaibi/gpu-burn:latest + imageID: docker-pullable://gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 + lastState: {} + name: job-2 + ready: true + restartCount: 0 + started: true + state: + running: + startedAt: "2023-01-19T07:56:55Z" + hostIP: 10.51.0.6 + phase: Running + podIP: 10.244.2.31 + podIPs: + - ip: 10.244.2.31 + qosClass: Burstable + startTime: "2023-01-19T07:56:50Z" diff --git a/design/samples/runai/>=2.9/mig/node/7g-40gb.yaml b/design/samples/runai/>=2.9/mig/node/7g-40gb.yaml new file mode 100644 index 0000000..f092719 --- /dev/null +++ b/design/samples/runai/>=2.9/mig/node/7g-40gb.yaml @@ -0,0 +1,385 @@ +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. +# +apiVersion: v1 +kind: Node +metadata: + annotations: + flannel.alpha.coreos.com/backend-data: '{"VNI":1,"VtepMAC":"b6:1c:2d:1d:f3:34"}' + flannel.alpha.coreos.com/backend-type: vxlan + flannel.alpha.coreos.com/kube-subnet-manager: "true" + flannel.alpha.coreos.com/public-ip: 10.51.0.6 + kubeadm.alpha.kubernetes.io/cri-socket: /var/run/dockershim.sock + nfd.node.kubernetes.io/extended-resources: "" + nfd.node.kubernetes.io/feature-labels: 
cpu-cpuid.ADX,cpu-cpuid.AESNI,cpu-cpuid.AVX,cpu-cpuid.AVX2,cpu-cpuid.AVX512BW,cpu-cpuid.AVX512CD,cpu-cpuid.AVX512DQ,cpu-cpuid.AVX512F,cpu-cpuid.AVX512VL,cpu-cpuid.AVX512VNNI,cpu-cpuid.FMA3,cpu-cpuid.HLE,cpu-cpuid.HYPERVISOR,cpu-cpuid.IBPB,cpu-cpuid.MPX,cpu-cpuid.RTM,cpu-cpuid.STIBP,cpu-hardware_multithreading,kernel-config.NO_HZ,kernel-config.NO_HZ_IDLE,kernel-version.full,kernel-version.major,kernel-version.minor,kernel-version.revision,nvidia.com/cuda.driver.major,nvidia.com/cuda.driver.minor,nvidia.com/cuda.driver.rev,nvidia.com/cuda.runtime.major,nvidia.com/cuda.runtime.minor,nvidia.com/gfd.timestamp,nvidia.com/gpu.compute.major,nvidia.com/gpu.compute.minor,nvidia.com/gpu.count,nvidia.com/gpu.family,nvidia.com/gpu.machine,nvidia.com/gpu.memory,nvidia.com/gpu.product,nvidia.com/gpu.replicas,nvidia.com/mig-7g.40gb.count,nvidia.com/mig-7g.40gb.engines.copy,nvidia.com/mig-7g.40gb.engines.decoder,nvidia.com/mig-7g.40gb.engines.encoder,nvidia.com/mig-7g.40gb.engines.jpeg,nvidia.com/mig-7g.40gb.engines.ofa,nvidia.com/mig-7g.40gb.memory,nvidia.com/mig-7g.40gb.multiprocessors,nvidia.com/mig-7g.40gb.product,nvidia.com/mig-7g.40gb.replicas,nvidia.com/mig-7g.40gb.slices.ci,nvidia.com/mig-7g.40gb.slices.gi,nvidia.com/mig.capable,nvidia.com/mig.strategy,nvidia.com/run.ai-swap.enabled,pci-10de.present,pci-1af4.present,system-os_release.ID,system-os_release.VERSION_ID,system-os_release.VERSION_ID.major,system-os_release.VERSION_ID.minor + nfd.node.kubernetes.io/worker.version: v0.10.1 + node.alpha.kubernetes.io/ttl: "0" + run.ai/mig-mapping: ewogICIwIjogWwogICAgewogICAgICAicG9zaXRpb24iOiAwLAogICAgICAiZGV2aWNlX3V1aWQiOiAiTUlHLTZkOWJlOWRiLWQzNzYtNTk5MS04YzRjLTE4YWI0NDg5MzNmZiIsCiAgICAgICJncHVfaW5zdGFuY2VfaWQiOiAwCiAgICB9CiAgXQp9 + run.ai/mig.config: |- + version: v1 + mig-configs: + selected: + - devices: [0] + mig-enabled: true + mig-devices: + - name: 7g.40gb + position: 0 + size: 8 + volumes.kubernetes.io/controller-managed-attach-detach: "true" + creationTimestamp: 
"2022-12-26T14:13:08Z" + labels: + beta.kubernetes.io/arch: amd64 + beta.kubernetes.io/os: linux + feature.node.kubernetes.io/cpu-cpuid.ADX: "true" + feature.node.kubernetes.io/cpu-cpuid.AESNI: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX2: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512BW: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512CD: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512DQ: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512F: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512VL: "true" + feature.node.kubernetes.io/cpu-cpuid.AVX512VNNI: "true" + feature.node.kubernetes.io/cpu-cpuid.FMA3: "true" + feature.node.kubernetes.io/cpu-cpuid.HLE: "true" + feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR: "true" + feature.node.kubernetes.io/cpu-cpuid.IBPB: "true" + feature.node.kubernetes.io/cpu-cpuid.MPX: "true" + feature.node.kubernetes.io/cpu-cpuid.RTM: "true" + feature.node.kubernetes.io/cpu-cpuid.STIBP: "true" + feature.node.kubernetes.io/cpu-hardware_multithreading: "true" + feature.node.kubernetes.io/kernel-config.NO_HZ: "true" + feature.node.kubernetes.io/kernel-config.NO_HZ_IDLE: "true" + feature.node.kubernetes.io/kernel-version.full: 5.15.0-1021-gcp + feature.node.kubernetes.io/kernel-version.major: "5" + feature.node.kubernetes.io/kernel-version.minor: "15" + feature.node.kubernetes.io/kernel-version.revision: "0" + feature.node.kubernetes.io/pci-10de.present: "true" + feature.node.kubernetes.io/pci-1af4.present: "true" + feature.node.kubernetes.io/system-os_release.ID: ubuntu + feature.node.kubernetes.io/system-os_release.VERSION_ID: "20.04" + feature.node.kubernetes.io/system-os_release.VERSION_ID.major: "20" + feature.node.kubernetes.io/system-os_release.VERSION_ID.minor: "04" + kubernetes.io/arch: amd64 + kubernetes.io/hostname: qa-mig-worker-gpu-a100 + kubernetes.io/os: linux + node-role.kubernetes.io/runai-dynamic-mig: "true" + 
node-role.kubernetes.io/runai-mig-enabled: "true" + nvidia.com/cuda.driver.major: "520" + nvidia.com/cuda.driver.minor: "56" + nvidia.com/cuda.driver.rev: "06" + nvidia.com/cuda.runtime.major: "11" + nvidia.com/cuda.runtime.minor: "8" + nvidia.com/gfd.timestamp: "1674110138" + nvidia.com/gpu.compute.major: "8" + nvidia.com/gpu.compute.minor: "0" + nvidia.com/gpu.count: "1" + nvidia.com/gpu.deploy.container-toolkit: "true" + nvidia.com/gpu.deploy.dcgm: "true" + nvidia.com/gpu.deploy.dcgm-exporter: "true" + nvidia.com/gpu.deploy.device-plugin: "true" + nvidia.com/gpu.deploy.driver: pre-installed + nvidia.com/gpu.deploy.gpu-feature-discovery: "true" + nvidia.com/gpu.deploy.mig-manager: "true" + nvidia.com/gpu.deploy.node-status-exporter: "true" + nvidia.com/gpu.deploy.operator-validator: "true" + nvidia.com/gpu.family: ampere + nvidia.com/gpu.machine: Google-Compute-Engine + nvidia.com/gpu.memory: "40960" + nvidia.com/gpu.present: "true" + nvidia.com/gpu.product: NVIDIA-A100-SXM4-40GB + nvidia.com/gpu.replicas: "0" + nvidia.com/mig-7g.40gb.count: "1" + nvidia.com/mig-7g.40gb.engines.copy: "7" + nvidia.com/mig-7g.40gb.engines.decoder: "5" + nvidia.com/mig-7g.40gb.engines.encoder: "0" + nvidia.com/mig-7g.40gb.engines.jpeg: "1" + nvidia.com/mig-7g.40gb.engines.ofa: "1" + nvidia.com/mig-7g.40gb.memory: "40192" + nvidia.com/mig-7g.40gb.multiprocessors: "98" + nvidia.com/mig-7g.40gb.product: NVIDIA-A100-SXM4-40GB-MIG-7g.40gb + nvidia.com/mig-7g.40gb.replicas: "1" + nvidia.com/mig-7g.40gb.slices.ci: "7" + nvidia.com/mig-7g.40gb.slices.gi: "7" + nvidia.com/mig.capable: "true" + nvidia.com/mig.config.state: success + nvidia.com/mig.strategy: mixed + nvidia.com/run.ai-swap.enabled: "false" + name: qa-mig-worker-gpu-a100 + resourceVersion: "3336915" + uid: 04dc2370-8f7d-43f8-a346-c8b5201fdd5a +spec: + podCIDR: 10.244.2.0/24 + podCIDRs: + - 10.244.2.0/24 +status: + addresses: + - address: 10.51.0.6 + type: InternalIP + - address: qa-mig-worker-gpu-a100 + type: Hostname + 
allocatable: + cpu: "12" + ephemeral-storage: "93478772582" + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 87425380Ki + nvidia.com/gpu: "0" + nvidia.com/mig-1g.5gb: "0" + nvidia.com/mig-2g.10gb: "0" + nvidia.com/mig-3g.20gb: "0" + nvidia.com/mig-7g.40gb: "1" + pods: "110" + capacity: + cpu: "12" + ephemeral-storage: 101430960Ki + hugepages-1Gi: "0" + hugepages-2Mi: "0" + memory: 87527780Ki + nvidia.com/gpu: "0" + nvidia.com/mig-1g.5gb: "0" + nvidia.com/mig-2g.10gb: "0" + nvidia.com/mig-3g.20gb: "0" + nvidia.com/mig-7g.40gb: "1" + pods: "110" + conditions: + - lastHeartbeatTime: "2023-01-19T06:34:21Z" + lastTransitionTime: "2023-01-19T06:34:21Z" + message: Flannel is running on this node + reason: FlannelIsUp + status: "False" + type: NetworkUnavailable + - lastHeartbeatTime: "2023-01-19T11:33:29Z" + lastTransitionTime: "2023-01-19T06:33:41Z" + message: kubelet has sufficient memory available + reason: KubeletHasSufficientMemory + status: "False" + type: MemoryPressure + - lastHeartbeatTime: "2023-01-19T11:33:29Z" + lastTransitionTime: "2023-01-19T06:33:41Z" + message: kubelet has no disk pressure + reason: KubeletHasNoDiskPressure + status: "False" + type: DiskPressure + - lastHeartbeatTime: "2023-01-19T11:33:29Z" + lastTransitionTime: "2023-01-19T06:33:41Z" + message: kubelet has sufficient PID available + reason: KubeletHasSufficientPID + status: "False" + type: PIDPressure + - lastHeartbeatTime: "2023-01-19T11:33:29Z" + lastTransitionTime: "2023-01-19T06:33:52Z" + message: kubelet is posting ready status. 
AppArmor enabled + reason: KubeletReady + status: "True" + type: Ready + daemonEndpoints: + kubeletEndpoint: + Port: 10250 + images: + - names: + - gcr.io/run-ai-demo/quickstart@sha256:7837847d3a186bb2daa03f1781542212cb7c66575ea8b0aaf3fd886f0043c405 + - gcr.io/run-ai-demo/quickstart:latest + sizeBytes: 12072476783 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:c1418a43dd54946ad0d08864c7da5591e067fd0e21791ab48a2dec8bdde14774 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.1.3-3.1.2-ubuntu20.04 + sizeBytes: 1989114969 + - names: + - gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 + - gshaibi/gpu-burn:latest + sizeBytes: 1619723452 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:952ec6be586dcff7c4b2936a20b7704c55b91be2b0ddb6d121ce72f5a833e804 + - nvcr.io/nvidia/k8s/dcgm-exporter:3.0.4-3.0.0-ubuntu20.04 + sizeBytes: 1222518599 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:9cfc874aeb83bb4f6c0b6a1a85f910a05d4d041be95e67e4501767f490b5149e + sizeBytes: 1188522487 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:2e6a769e727ce22c6393a7af9cfdb4d579d6ee6e4d332b10d029b64e4eabfea5 + sizeBytes: 1188506111 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:3c0cb03a8e440cb02fb0b0e0e311c8353e8f7cdeeb464f3c7aaaffa096aaf5c5 + - gcr.io/run-ai-lab/mig-provisioner:galb1 + sizeBytes: 1188467321 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:423a9d4c654bf69c8b740cc74aa6c3586886775c9e47b12b3a072cee4b2b3d1c + - gcr.io/run-ai-lab/mig-parted:galb-debug2 + sizeBytes: 1147224275 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:76fe9435c418fbd60b636f114b6410eab08889d0516396f0d19cc8c8335f9473 + - gcr.io/run-ai-lab/mig-parted:galb-debug1 + sizeBytes: 1147224275 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:72929a39c30f8d0e67684ae9c019ed4f9603592aa361df5aaa4ae81ff7901262 + - gcr.io/run-ai-lab/mig-parted:galb-debug3 + sizeBytes: 1147201491 + - names: + - 
gcr.io/run-ai-lab/mig-parted@sha256:6b53498374496f3e6ee04bc3dcb402bd8d751645c8285acb6265c7dd0dd361af + - gcr.io/run-ai-lab/mig-parted:galb-debug4 + sizeBytes: 1140576119 + - names: + - gcr.io/run-ai-lab/agent@sha256:9b67cd4e2a720f5a9674046c4b7d2c1c5b5a0df0e5e4dcb956046d76080cec04 + - gcr.io/run-ai-lab/agent:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 1087796268 + - names: + - gcr.io/run-ai-test-1/agent@sha256:0dd22f998c8edd4412018bc77fc57cd6e066af5dd0efcbf64381203cb93bf26a + - gcr.io/run-ai-test-1/agent:0.0.0-1003867.master-655b6a + sizeBytes: 1087759929 + - names: + - gcr.io/run-ai-test-1/agent@sha256:55ad532e88542cf30803c4e28f17509e944fa912830cc368d096483e173ee6dd + - gcr.io/run-ai-test-1/agent:master-354d48 + sizeBytes: 1087526865 + - names: + - gcr.io/run-ai-prod/agent@sha256:0441dc0f08b60c3dd42676be1cd1056bc7ab8fb5c1a910e6d4fae2ded6afe449 + - gcr.io/run-ai-prod/agent:2.8.11-rc.0 + sizeBytes: 1079256002 + - names: + - gcr.io/run-ai-prod/agent@sha256:36c305603419ffa30755eb8d250493b2d388bd2d12e3cbd969763e1e80ac6d18 + - gcr.io/run-ai-prod/agent:2.8.9-rc.1 + sizeBytes: 1079177946 + - names: + - gcr.io/run-ai-prod/agent@sha256:d21a6b1438597e82f85f392b37b5fe1ad63a1a46d0044462fb2d50ef7aec9323 + - gcr.io/run-ai-prod/agent:2.8.8 + sizeBytes: 1079071238 + - names: + - gcr.io/run-ai-prod/agent@sha256:94a7563a637825df8de459390f8afce407e7072d5985d4297427472bd23c8a34 + - gcr.io/run-ai-prod/agent:2.7.14-rc.2 + sizeBytes: 1078449294 + - names: + - gcr.io/run-ai-prod/agent@sha256:35b8aff4f1228635dcc071f77ffad465e5f9c5658424a142a0ee20a886adc5dc + - gcr.io/run-ai-prod/agent:2.7.13 + sizeBytes: 1078418535 + - names: + - gcr.io/run-ai-prod/agent@sha256:2476a44f91ff04cfb85eab22e68072be96fbe82f3dff40da51e086caa1cc81ed + - gcr.io/run-ai-prod/agent:2.7.15-rc.0 + sizeBytes: 1078388027 + - names: + - gcr.io/run-ai-prod/agent@sha256:9236f3ab5f37d1cc9365b9a5b3bc0e59602cb29bc366bd8e5143bf4c26e18974 + - gcr.io/run-ai-prod/agent:2.7.0-rc.13 + sizeBytes: 1078145358 + - names: + 
- gcr.io/run-ai-staging/pod-grouper@sha256:3aa137e1ccbf61b73b7b0e538b8db7806af19a6631532b1655477c0c101b7b45 + - gcr.io/run-ai-staging/pod-grouper:2.9.0-rc.1 + sizeBytes: 1041885213 + - names: + - gcr.io/run-ai-lab/pod-grouper@sha256:25870b435c498aeae9d254eadcf94fa1197ebcd557b59a751547404e9c5990a5 + - gcr.io/run-ai-lab/pod-grouper:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 1041712203 + - names: + - gcr.io/run-ai-test-1/pod-grouper@sha256:896bec2645fd93f62c54bffd563cff57f90a7c46b0951a7644b93b769c45ddb5 + - gcr.io/run-ai-test-1/pod-grouper:0.0.0-1003867.master-655b6a + sizeBytes: 1041712203 + - names: + - gcr.io/run-ai-test-1/pod-grouper@sha256:00cf9d524ce5b2a0d27f6957288e3f37c90809b56a0bb28ddc9f29121aaa2ce1 + - gcr.io/run-ai-test-1/pod-grouper:master-354d48 + sizeBytes: 1041631818 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:ba4bdd418faf1e74c293627886f819e065a8ab7a42df2255a42b289518c7c9f6 + sizeBytes: 1027507788 + - names: + - gcr.io/run-ai-lab/mig-provisioner@sha256:871a685f05a7a7bfc0edbc00bb8fb31a76bb7695783c05b0581a0d93debc251f + sizeBytes: 1027398188 + - names: + - nvcr.io/nvidia/driver@sha256:c24371b1793eab4f2f035bc0584015de3a7be3031587d6cd948069c9127542f6 + - nvcr.io/nvidia/driver:515.65.01-ubuntu20.04 + sizeBytes: 1026833848 + - names: + - gcr.io/run-ai-lab/nodepool-controller@sha256:78020b9c166f6fc4bf23a9787f7d9b06637bf4d2002991e1e697d364260ae4fb + - gcr.io/run-ai-lab/nodepool-controller:0.0.0-1004002.galbrun-6511-add-gpu-inst-e01f8b + sizeBytes: 990809924 + - names: + - gcr.io/run-ai-test-1/nodepool-controller@sha256:38248d3444eed59c66cd87a409ed0b366581324ae0649d4d7cf5cbe35315afc3 + - gcr.io/run-ai-test-1/nodepool-controller:0.0.0-1003867.master-655b6a + sizeBytes: 990808388 + - names: + - gcr.io/run-ai-test-1/nodepool-controller@sha256:3a70b8b1c56dcc2502ffa9ff2149be2d0f767069750234cbfc368357b93ca7ce + - gcr.io/run-ai-test-1/nodepool-controller:master-354d48 + sizeBytes: 990793619 + - names: + - 
nvcr.io/nvidia/driver@sha256:d944da4ec30065b98c170f924c75fe1222e06998f5dff726a1867fa1c1f9b801 + - nvcr.io/nvidia/driver:510.47.03-ubuntu20.04 + sizeBytes: 932659896 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:a076495d71c76dfc5718908fd39e29575dea5435e3776dbeceb350f5897799e2 + - gcr.io/run-ai-lab/mig-parted:785d641d6-devel + - gcr.io/run-ai-lab/mig-parted:dc76960b1-devel + sizeBytes: 894154074 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:933e78dde8ebea4679159e8d48f793ba4c1725a28dfacfa5f9c21cb4a02d2deb + sizeBytes: 775214730 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:7507d27b8d4736531504abf4f011c0b9586fbb7dd9436e26c3b5cd1e262369db + - nvcr.io/nvidia/k8s/dcgm-exporter:2.3.4-2.6.4-ubuntu20.04 + sizeBytes: 572817930 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:6240c5912aabed789c672f3179b4a65e45511d10fa8c41a5de0d91644a792b14 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.5.1 + sizeBytes: 562256065 + - names: + - nvcr.io/nvidia/k8s/dcgm-exporter@sha256:3096538afb0203f8568639ed12fd8ad3fb8c16bbd9c4130e58791e7512f65799 + - nvcr.io/nvidia/k8s/dcgm-exporter:2.3.1-2.6.1-ubuntu20.04 + sizeBytes: 542091179 + - names: + - bitnami/fluentd@sha256:11bb83687b44a9fb7a4f773e2ecf120e2b4523613a5544999723f0b9cd8fe2ed + - bitnami/fluentd:1.12.0-debian-10-r0 + sizeBytes: 512854333 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:18c9ea88ae06d479e6657b8a4126a8ee3f4300a40c16ddc29fb7ab3763d46005 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1 + sizeBytes: 478600391 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:e3f16c26b9340ed46aed248cc4d18353ba3a65886bf7a2f0cea25ff41b2553da + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.4.2 + sizeBytes: 459618644 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:9c17d3a907eb77eb8f7b4f3faf52d8352e4252af92003f828083f80d629bd2c3 + - nvcr.io/nvidia/k8s-device-plugin:v0.13.0-ubi8 + sizeBytes: 444556116 + - names: + - 
gcr.io/run-ai-staging/researcher-service@sha256:c202b4487bcedb04de5ffef609f2f32712df90bae0a4d38fdbdafc347d5c9349 + - gcr.io/run-ai-staging/researcher-service:2.9.0-rc.1 + sizeBytes: 437834397 + - names: + - calico/node@sha256:349c10be37e64a310d25869128d482b17bfeb4166bc80bd9a2ed095203a77ddb + - calico/node:v3.15.5 + sizeBytes: 437164545 + - names: + - nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:6fe4200960b2b49d6dac1c91e596f61dacb6b3dcff878c84eb74c5136fedd5b6 + - nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.0 + sizeBytes: 432799839 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:bec9f026d9b3d9404c78d6091817a359015c6a7aa411735b34138c1518853b5d + - nvcr.io/nvidia/gpu-feature-discovery:v0.7.0-ubi8 + sizeBytes: 415829527 + - names: + - nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:5b16056257acc51b517d9cdb1da3218693cefc214af93789e6e214fd2b4cacf1 + - nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.3.0 + sizeBytes: 413807613 + - names: + - nvcr.io/nvidia/k8s-device-plugin@sha256:d1c61116647bb9388eb3c4e31848dd6038458b7ba33c0eb3b659d96739eceb73 + - nvcr.io/nvidia/k8s-device-plugin:v0.12.3-ubi8 + sizeBytes: 408778270 + - names: + - nvcr.io/nvidia/gpu-feature-discovery@sha256:37821ea7829def707f819ac21122cea62efdbbd640679b6004e552fb9a1e17a3 + - nvcr.io/nvidia/gpu-feature-discovery:v0.6.2-ubi8 + sizeBytes: 380437918 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:24e6507b389d7b062889e31018880efb8b3a622365e0f059d7e168062e4b840a + sizeBytes: 373870396 + - names: + - gcr.io/run-ai-lab/mig-parted@sha256:3cfd802e9519a260d3f2ca2faa72607c0e10b67d611f7823301f37de02512136 + sizeBytes: 373870388 + nodeInfo: + architecture: amd64 + bootID: 21138297-a932-4324-989f-b0cd8598f612 + containerRuntimeVersion: docker://20.10.13 + kernelVersion: 5.15.0-1021-gcp + kubeProxyVersion: v1.23.4 + kubeletVersion: v1.23.4 + machineID: f551d6896536dedbb04180ef1a399ef4 + operatingSystem: linux + osImage: Ubuntu 20.04.2 LTS + systemUUID: 
097f5539-c22c-72cd-6ef0-3b58e43fc6c2 diff --git a/design/samples/runai/>=2.9/mig/pod/7g-5gb.yaml b/design/samples/runai/>=2.9/mig/pod/7g-5gb.yaml new file mode 100644 index 0000000..30aee97 --- /dev/null +++ b/design/samples/runai/>=2.9/mig/pod/7g-5gb.yaml @@ -0,0 +1,169 @@ +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. +# +apiVersion: v1 +kind: Pod +metadata: + annotations: + nvidia.com/mig-7g.40gb: "1" + pod-group-name: pg-job-969475a25a51-cfb7521b-0251-4c5c-88ed-5a84678c1461 + received-resource-type: MigInstance + runai-allocated-gpu-memory: "0" + runai-allocated-gpus: "0" + runai-allocated-mig-gpus: "1" + runai-calculated-status: Running + runai-cli-command: runai submit --interactive -i gcr.io/run-ai-demo/mat-mul --mig-profile + 7g.40gb -p p1 + runai-job-id: cfb7521b-0251-4c5c-88ed-5a84678c1461 + runai-mig-device: '{"name":"nvidia.com/mig-7g.40gb","position":0,"gpuamount":7,"deviceUUID":"MIG-6d9be9db-d376-5991-8c4c-18ab448933ff","gpuinstanceid":0,"gpuindex":"0","gputype":"A100-40GB"}' + user: galben-yair + creationTimestamp: "2023-01-19T11:03:42Z" + generateName: job-969475a25a51- + labels: + app: runaijob + controller-uid: cfb7521b-0251-4c5c-88ed-5a84678c1461 + createdBy: RunaiJob + project: p1 + release: job-969475a25a51 + runai/pod-index: 0-0 + name: job-969475a25a51-0-0 + namespace: runai-p1 + ownerReferences: + - apiVersion: run.ai/v1 + blockOwnerDeletion: true + controller: true + kind: RunaiJob + name: job-969475a25a51 + uid: cfb7521b-0251-4c5c-88ed-5a84678c1461 + resourceVersion: "3332596" + uid: 83a0c3bc-63ed-4b94-85bc-e5bd124c72f2 +spec: + containers: + - env: + - name: POD_INDEX + value: "0" + - name: NVIDIA_VISIBLE_DEVICES + valueFrom: + configMapKeyRef: + key: RUNAI-VISIBLE-DEVICES + name: job-969475a25a51-jzjkqqm-runai-sh-gpu + - name: RUNAI_NUM_OF_GPUS + valueFrom: + configMapKeyRef: + 
key: RUNAI_NUM_OF_GPUS + name: job-969475a25a51-jzjkqqm-runai-sh-gpu + - name: jobUUID + value: cfb7521b-0251-4c5c-88ed-5a84678c1461 + - name: JOB_UUID + value: cfb7521b-0251-4c5c-88ed-5a84678c1461 + - name: jobName + value: job-969475a25a51 + - name: JOB_NAME + value: job-969475a25a51 + - name: reporterGatewayURL + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 + - name: REPORTER_GATEWAY_URL + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 + - name: podUUID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + - name: POD_UUID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + - name: NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + image: gcr.io/run-ai-demo/mat-mul + imagePullPolicy: Always + name: job-969475a25a51 + resources: {} + securityContext: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-n4xs2 + readOnly: true + dnsPolicy: ClusterFirst + enableServiceLinks: true + nodeName: qa-mig-worker-gpu-a100 + preemptionPolicy: PreemptLowerPriority + priority: 0 + restartPolicy: Never + schedulerName: runai-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: kube-api-access-n4xs2 + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace +status: + conditions: + - lastProbeTime: null + 
lastTransitionTime: "2023-01-19T11:04:17Z" + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: "2023-01-19T11:04:19Z" + status: "True" + type: Ready + - lastProbeTime: null + lastTransitionTime: "2023-01-19T11:04:19Z" + status: "True" + type: ContainersReady + - lastProbeTime: null + lastTransitionTime: "2023-01-19T11:04:17Z" + status: "True" + type: PodScheduled + containerStatuses: + - containerID: docker://e71fcd88233bbbac6d719ebceb272cd1d1580f3f527e05abf4f766d214ebe101 + image: gcr.io/run-ai-demo/mat-mul:latest + imageID: docker-pullable://gcr.io/run-ai-demo/mat-mul@sha256:4d6fdf600f2be44b2bf62375633ab63bde8d9597007a7997d91e368fb1678856 + lastState: {} + name: job-969475a25a51 + ready: true + restartCount: 0 + started: true + state: + running: + startedAt: "2023-01-19T11:04:18Z" + hostIP: 10.51.0.6 + phase: Running + podIP: 10.244.2.58 + podIPs: + - ip: 10.244.2.58 + qosClass: BestEffort + startTime: "2023-01-19T11:04:17Z"