-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
2,896 additions
and
0 deletions.
There are no files selected for viewing
83 changes: 83 additions & 0 deletions
83
design/samples/gpu-operator/metrics/active-frac-gpu-pod.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
################ Active Whole GPU Pod ################ | ||
|
||
###### From DCGM Exporter Directly ###### | ||
|
||
{ | ||
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). | ||
# TYPE DCGM_FI_DEV_SM_CLOCK gauge | ||
DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 300 | ||
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). | ||
# TYPE DCGM_FI_DEV_MEM_CLOCK gauge | ||
DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5000 | ||
# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). | ||
# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge | ||
DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). | ||
# TYPE DCGM_FI_DEV_GPU_TEMP gauge | ||
DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 44 | ||
# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). | ||
# TYPE DCGM_FI_DEV_POWER_USAGE gauge | ||
DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 28.822000 | ||
# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). | ||
# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter | ||
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 484260738 | ||
# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. | ||
# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter | ||
DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %). | ||
# TYPE DCGM_FI_DEV_GPU_UTIL gauge | ||
DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %). | ||
# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge | ||
DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %). | ||
# TYPE DCGM_FI_DEV_ENC_UTIL gauge | ||
DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %). | ||
# TYPE DCGM_FI_DEV_DEC_UTIL gauge | ||
DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. | ||
# TYPE DCGM_FI_DEV_XID_ERRORS gauge | ||
DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",err_code="0",err_msg="No Error",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). | ||
# TYPE DCGM_FI_DEV_FB_FREE gauge | ||
DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 14914 | ||
# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). | ||
# TYPE DCGM_FI_DEV_FB_USED gauge | ||
DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 2 | ||
# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. | ||
# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter | ||
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status | ||
# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge | ||
DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0 | ||
# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active. | ||
# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge | ||
DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.999983 | ||
# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active. | ||
# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge | ||
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.000000 | ||
# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data. | ||
# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge | ||
DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 0.466046 | ||
# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. | ||
# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge | ||
DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 5529374 | ||
# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. | ||
# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge | ||
DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-26",DCGM_FI_DRIVER_VERSION="550.54.14",container="runai-reservation",namespace="runai-reservation",pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw"} 21260577 | ||
} | ||
|
||
###### From Prometheus ###### | ||
|
||
{ | ||
DCGM_FI_DEV_FB_FREE{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} | ||
8422 | ||
DCGM_FI_DEV_FB_USED{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} | ||
6494 | ||
DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-26", UUID="GPU-b397ddd6-b9f5-1b5a-a710-a105c3b542d1", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", exported_container="runai-reservation", exported_namespace="runai-reservation", exported_pod="runai-reservation-gpu-ip-172-20-10-26-6xhvw", gpu="0", instance="10.244.1.11:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-mrgds", service="nvidia-dcgm-exporter"} | ||
100 | ||
} | ||
|
||
###### Note ###### | ||
ip-172-20-10-26 is the hostname of the node where the GPU is installed. |
Oops, something went wrong.