Skip to content

Commit

Permalink
Samples
Browse files Browse the repository at this point in the history
  • Loading branch information
gshaibi committed Nov 26, 2024
1 parent 83b94a7 commit 1c8d570
Show file tree
Hide file tree
Showing 17 changed files with 2,896 additions and 0 deletions.
83 changes: 83 additions & 0 deletions design/samples/gpu-operator/metrics/active-frac-gpu-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
################ Active Fractional GPU Pod ################

###### From DCGM Exporter Directly ######

{
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 300
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5000
# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C).
# TYPE DCGM_FI_DEV_GPU_TEMP gauge
DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 44
# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W).
# TYPE DCGM_FI_DEV_POWER_USAGE gauge
DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 28.822000
# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ).
# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 484260738
# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries.
# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter
DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %).
# TYPE DCGM_FI_DEV_GPU_UTIL gauge
DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %).
# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge
DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %).
# TYPE DCGM_FI_DEV_ENC_UTIL gauge
DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %).
# TYPE DCGM_FI_DEV_DEC_UTIL gauge
DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered.
# TYPE DCGM_FI_DEV_XID_ERRORS gauge
DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",err_code="0",err_msg="No Error",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB).
# TYPE DCGM_FI_DEV_FB_FREE gauge
DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 14914
# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB).
# TYPE DCGM_FI_DEV_FB_USED gauge
DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 2
# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes.
# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status
# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge
DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0
# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active.
# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge
DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.999983
# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active.
# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.000000
# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data.
# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge
DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.466046
# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge
DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5529374
# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge
DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 21260577
}

###### From Prometheus ######

{
DCGM_FI_DEV_FB_FREE{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"}
8422
DCGM_FI_DEV_FB_USED{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"}
6494
DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"}
100
}

###### Note ######
ip-172-20-10-26 is the hostname of the node where the GPU is installed.
Loading

0 comments on commit 1c8d570

Please sign in to comment.