Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a custom metrics CSV for the dcgm snap #325

Merged
merged 19 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def exporters(self) -> List[BaseExporter]:
exporters.append(SmartCtlExporter(self.charm_dir, self.model.config))

if stored_tools & DCGMExporter.hw_tools():
exporters.append(DCGMExporter(self.model.config))
exporters.append(DCGMExporter(self.charm_dir, self.model.config))

return exporters

Expand Down Expand Up @@ -291,4 +291,4 @@ def cos_agent_related(self) -> bool:


if __name__ == "__main__": # pragma: nocover
ops.main(HardwareObserverCharm) # type: ignore
ops.main(HardwareObserverCharm)
161 changes: 161 additions & 0 deletions src/gpu_metrics/dcgm_metrics.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
###############################################################################
# [ WARNING ]
# Configuration file maintained by Juju. Local changes may be overwritten.
###############################################################################

# Selected metrics for dcgm-exporter
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved
# Default metric list https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/default-counters.csv

# Format
# If line starts with a '#' it is considered a comment
# Boolean values decode to - 1 = enabled 0 = disabled
# DCGM FIELD, Prometheus metric type, help message




# DEFAULT METRICS
# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).

# Power
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).

# PCIE
DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.

# Utilization (the sample period varies depending on the product)
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %).

# Errors and violations
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.

# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).

# NVLink
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes

# VGPU License status
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status

# Remapped rows
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed

# Static configuration information and features
DCGM_FI_DRIVER_VERSION, label, Driver Version




# CUSTOM METRICS
# Clocks
DCGM_FI_DEV_VIDEO_CLOCK, gauge, Video encoder/decoder clock (in MHz).

# Temperature
DCGM_FI_DEV_FAN_SPEED, gauge, Fan speed (in 0-100%)

# Power
DCGM_FI_DEV_POWER_USAGE_INSTANT, gauge, Current instantaneous power usage (in W).

# Errors and violations
DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, counter, Throttling reasons bitmask
DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).

# Memory usage
DCGM_FI_DEV_FB_RESERVED, gauge, Frame buffer memory reserved (in MB).
DCGM_FI_DEV_FB_USED_PERCENT, gauge, Frame buffer percentage used (in 0-100%) - Used/(Total - Reserved)

# ECC
DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.

# Retired pages
DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.

# NVLink
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.

# VGPU
DCGM_FI_DEV_VGPU_INSTANCE_IDS, counter, Count of vGPU Instances
DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization

# Bar
DCGM_FI_DEV_BAR1_USED, gauge, Used BAR1 (in MB)
DCGM_FI_DEV_BAR1_FREE, gauge, Free BAR1 (in MB)

# DCP metrics
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active.
DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned.
DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM.
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active.
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data.
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active.
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active.
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.

# Static configuration information and features
DCGM_FI_NVML_VERSION, label, NVML Version
DCGM_FI_DEV_BRAND, label, Device Brand
DCGM_FI_DEV_SERIAL, label, Device Serial Number
DCGM_FI_DEV_NAME, label, Device Name
DCGM_FI_DEV_MINOR_NUMBER, label, Device node minor (/dev/nvidia#)
DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY, label, Cuda compute capability for the device (The major version is the upper 32 bits and the minor version is the lower 32 bits)
DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device

DCGM_FI_DEV_COMPUTE_MODE, label, Compute mode
DCGM_FI_DEV_PERSISTENCE_MODE, label, Persistance mode (1 or 0)
DCGM_FI_DEV_CC_MODE, label, ConfidentialCompute/AmpereProtectedMemory status (1 or 0)
DCGM_FI_DEV_ECC_CURRENT, label, Current ECC mode
DCGM_FI_DEV_VIRTUAL_MODE, label, Virtualization mode
DCGM_FI_DEV_AUTOBOOST, label, Auto-boost enabled

DCGM_FI_DEV_BAR1_TOTAL, label, Total BAR1 (in MB)

DCGM_FI_DEV_MAX_SM_CLOCK, label, Maximum supported SM clock
DCGM_FI_DEV_MAX_MEM_CLOCK, label, Maximum supported Memory clock

DCGM_FI_DEV_GPU_MAX_OP_TEMP, label, Maximum operating temperature
DCGM_FI_DEV_SLOWDOWN_TEMP, label, Slowdown temperature
DCGM_FI_DEV_SHUTDOWN_TEMP, label, Shutdown temperature

DCGM_FI_DEV_POWER_MGMT_LIMIT, label, Current Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN, label, Minimum Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, label, Maximum Power limit
DCGM_FI_DEV_ENFORCED_POWER_LIMIT, label, Effective Power limit that the driver enforces after taking into account all limiters

DCGM_FI_DEV_FB_TOTAL, label, Total Frame buffer (in MB)

DCGM_FI_DEV_COUNT, label, Number of devices on the node
24 changes: 23 additions & 1 deletion src/service.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Exporter service helper."""

import os
import shutil
from abc import ABC, abstractmethod
from logging import getLogger
from pathlib import Path
Expand Down Expand Up @@ -449,12 +450,33 @@ class DCGMExporter(SnapExporter):

exporter_name: str = "dcgm"
port: int = 9400
snap_common: Path = Path("/var/snap/dcgm/common/")
metric_config: str = "dcgm-exporter-metrics-file"

def __init__(self, config: ConfigData):
def __init__(self, charm_dir: Path, config: ConfigData):
"""Init."""
self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"]))
self.charm_dir = charm_dir
self.metrics_file = self.charm_dir / "src/gpu_metrics/dcgm_metrics.csv"
self.metric_config_value = self.metrics_file.name
super().__init__(config)

def install(self) -> bool:
"""Install the DCGM exporter and configure custom metrics."""
if not super().install():
return False

try:
logger.info("Creating a custom metrics file and configuring the DCGM snap to use it")
shutil.copy(self.metrics_file, self.snap_common)
self.snap_client.set({self.metric_config: self.metric_config_value})
self.snap_client.restart(reload=True)
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
except Exception as err: # pylint: disable=broad-except
logger.error("Failed to configure custom DCGM metrics: %s", err)
return False

return True

@staticmethod
def hw_tools() -> Set[HWTool]:
"""Return hardware tools to watch."""
Expand Down
71 changes: 61 additions & 10 deletions tests/unit/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,67 @@ def test_resource_remove(self):
self.exporter.strategy.remove.accept_called()


class TestDCGMSnapExporter(unittest.TestCase):
"""Test DCGM Snap exporter's methods."""

def setUp(self) -> None:
"""Set up harness for each test case."""
snap_lib_patcher = mock.patch.object(service, "snap")
shutil_lib_patcher = mock.patch.object(service, "shutil")
self.mock_snap = snap_lib_patcher.start()
self.mock_shutil = shutil_lib_patcher.start()
self.addCleanup(snap_lib_patcher.stop)
self.addCleanup(shutil_lib_patcher.stop)

search_path = pathlib.Path(f"{__file__}/../../..").resolve()
self.mock_config = {
"dcgm-snap-channel": "latest/stable",
}

self.exporter = service.DCGMExporter(search_path, self.mock_config)
self.exporter.strategy = mock.MagicMock()

def test_exporter_name(self):
self.assertEqual(self.exporter.exporter_name, "dcgm")
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved

def test_hw_tools(self):
self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM})

def test_install_failed(self):
self.exporter.snap_client.present = False

exporter_install_ok = self.exporter.install()

self.exporter.strategy.install.accept_called()
self.mock_shutil.copy.accept_not_called()
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
self.assertFalse(exporter_install_ok)

def test_install_success(self):
self.exporter.snap_client.present = True

exporter_install_ok = self.exporter.install()

self.exporter.strategy.install.accept_called()
self.mock_shutil.copy.accept_called_with(
self.exporter.metrics_file, self.exporter.snap_common
)
self.exporter.snap_client.set.accept_called_with(
{self.exporter.metric_config: self.exporter.metric_config_value}
)
self.exporter.snap_client.restart.accept_called_with(reload=True)
self.assertTrue(exporter_install_ok)

def test_install_metrics_copy_fail(self):
self.exporter.snap_client.present = True
self.mock_shutil.copy.side_effect = FileNotFoundError

exporter_install_ok = self.exporter.install()

self.exporter.strategy.install.accept_called()
self.exporter.snap_client.restart.accept_not_called()
self.assertFalse(exporter_install_ok)


class TestWriteToFile(unittest.TestCase):
def setUp(self):
self.temp_file = tempfile.NamedTemporaryFile(delete=False)
Expand Down Expand Up @@ -971,15 +1032,5 @@ def test_snap_exporter_configure(mock_install, snap_exporter, install_result, ex
mock_install.assert_called_once()


def test_dcgm_exporter():
mock_config = {
"dcgm-snap-channel": "latest/stable",
}

exporter = service.DCGMExporter(mock_config)
assert exporter.exporter_name == "dcgm"
assert exporter.hw_tools() == {HWTool.DCGM}


if __name__ == "__main__":
unittest.main()
Loading