Skip to content

Commit

Permalink
Merge branch 'main' into chore-reenable-type-hints-on-ops-main
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelallan72 authored Oct 7, 2024
2 parents a84eced + c63a4d8 commit b7bbfeb
Show file tree
Hide file tree
Showing 13 changed files with 1,924 additions and 410 deletions.
6 changes: 6 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ options:
description: |
Channel to install the DCGM snap if the hardware has NVIDIA GPU. By default, it will install
from latest/stable
smartctl-exporter-snap-channel:
type: string
default: "latest/stable"
description: |
Channel to install the Smartctl exporter snap if the hardware has smart disk. By default, it will install
from latest/stable.
exporter-log-level:
type: string
default: "INFO"
Expand Down
10 changes: 7 additions & 3 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from ops.framework import EventBase, StoredState
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus

from hw_tools import HWTool, HWToolHelper, detect_available_tools
from hw_tools import HWTool, HWToolHelper, detect_available_tools, remove_legacy_smartctl_exporter
from service import BaseExporter, DCGMExporter, ExporterError, HardwareExporter, SmartCtlExporter

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -81,10 +81,10 @@ def exporters(self) -> List[BaseExporter]:
)

if stored_tools & SmartCtlExporter.hw_tools():
exporters.append(SmartCtlExporter(self.charm_dir, self.model.config))
exporters.append(SmartCtlExporter(self.model.config))

if stored_tools & DCGMExporter.hw_tools():
exporters.append(DCGMExporter(self.model.config))
exporters.append(DCGMExporter(self.charm_dir, self.model.config))

return exporters

Expand All @@ -97,6 +97,8 @@ def get_stored_tools(self) -> Set[HWTool]:
if not self._stored.stored_tools: # type: ignore[truthy-function]
available_tools = detect_available_tools() # type: ignore[unreachable]
self._stored.stored_tools = {tool.value for tool in available_tools}
if "smartctl" in self._stored.stored_tools: # type: ignore[operator]
self._stored.stored_tools.remove("smartctl") # type: ignore[attr-defined]
return {HWTool(value) for value in self._stored.stored_tools} # type: ignore[attr-defined]

def _on_redetect_hardware(self, event: ops.ActionEvent) -> None:
Expand Down Expand Up @@ -130,6 +132,8 @@ def _on_install_or_upgrade(self, event: EventBase) -> None:
"""Install or upgrade charm."""
self.model.unit.status = MaintenanceStatus("Installing resources...")

remove_legacy_smartctl_exporter()

stored_tools = self.get_stored_tools()

msg: str
Expand Down
17 changes: 1 addition & 16 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,6 @@ class HardwareExporterSettings(ExporterSettings): # pylint: disable = too-few-p
HARDWARE_EXPORTER_SETTINGS = HardwareExporterSettings()


class SmartCtlExporterSettings(ExporterSettings): # pylint: disable = too-few-public-methods
"""Constant settings for SmartCtl Exporter."""

name: str = "smartctl-exporter"
config_path: Path = Path(f"/etc/{name}-config.yaml")
service_path: Path = Path(f"/etc/systemd/system/{name}.service")
config_template: str = f"{name}-config.yaml.j2"
service_template: str = f"{name}.service.j2"
crash_msg: str = "SmartCtl exporter crashed unexpectedly, please refer to systemd logs..."


SMARTCTL_EXPORTER_SETTINGS = SmartCtlExporterSettings()


class SystemVendor(str, Enum):
"""Different hardware system vendor."""

Expand Down Expand Up @@ -77,8 +63,7 @@ class HWTool(str, Enum):
IPMI_SEL = "ipmi_sel"
IPMI_SENSOR = "ipmi_sensor"
REDFISH = "redfish"
SMARTCTL = "smartctl"
SMARTCTL_EXPORTER = "smartctl_exporter"
SMARTCTL_EXPORTER = "smartctl-exporter"
DCGM = "dcgm"


Expand Down
160 changes: 160 additions & 0 deletions src/gpu_metrics/dcgm_metrics.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
###############################################################################
# [ WARNING ]
# Configuration file maintained by Juju. Local changes may be overwritten.
###############################################################################

# Selected metrics for dcgm-exporter
# Default metric list https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/default-counters.csv

# Format
# If line starts with a '#' it is considered a comment
# Boolean values decode to - 1 = enabled 0 = disabled
# DCGM FIELD, Prometheus metric type, help message




# DEFAULT METRICS
# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).

# Power
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).

# PCIE
DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.

# Utilization (the sample period varies depending on the product)
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %).

# Errors and violations
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.

# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).

# NVLink
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes

# VGPU License status
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status

# Remapped rows
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed

# Static configuration information and features
DCGM_FI_DRIVER_VERSION, label, Driver Version




# CUSTOM METRICS
# Clocks
DCGM_FI_DEV_VIDEO_CLOCK, gauge, Video encoder/decoder clock (in MHz).

# Temperature
DCGM_FI_DEV_FAN_SPEED, gauge, Fan speed (in 0-100%)

# Power
DCGM_FI_DEV_POWER_USAGE_INSTANT, gauge, Current instantaneous power usage (in W).

# Errors and violations
DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, counter, Throttling reasons bitmask
DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).

# Memory usage
DCGM_FI_DEV_FB_RESERVED, gauge, Frame buffer memory reserved (in MB).
DCGM_FI_DEV_FB_USED_PERCENT, gauge, Frame buffer percentage used (in 0-100%) - Used/(Total - Reserved)

# ECC
DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.

# Retired pages
DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.

# NVLink
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.

# VGPU
DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization

# Bar
DCGM_FI_DEV_BAR1_USED, gauge, Used BAR1 (in MB)
DCGM_FI_DEV_BAR1_FREE, gauge, Free BAR1 (in MB)

# DCP metrics
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active.
DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned.
DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM.
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active.
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data.
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active.
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active.
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.

# Static configuration information and features
DCGM_FI_NVML_VERSION, label, NVML Version
DCGM_FI_DEV_BRAND, label, Device Brand
DCGM_FI_DEV_SERIAL, label, Device Serial Number
DCGM_FI_DEV_NAME, label, Device Name
DCGM_FI_DEV_MINOR_NUMBER, label, Device node minor (/dev/nvidia#)
DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY, label, Cuda compute capability for the device (The major version is the upper 32 bits and the minor version is the lower 32 bits)
DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device

DCGM_FI_DEV_COMPUTE_MODE, label, Compute mode
DCGM_FI_DEV_PERSISTENCE_MODE, label, Persistance mode (1 or 0)
DCGM_FI_DEV_CC_MODE, label, ConfidentialCompute/AmpereProtectedMemory status (1 or 0)
DCGM_FI_DEV_ECC_CURRENT, label, Current ECC mode
DCGM_FI_DEV_VIRTUAL_MODE, label, Virtualization mode
DCGM_FI_DEV_AUTOBOOST, label, Auto-boost enabled

DCGM_FI_DEV_BAR1_TOTAL, label, Total BAR1 (in MB)

DCGM_FI_DEV_MAX_SM_CLOCK, label, Maximum supported SM clock
DCGM_FI_DEV_MAX_MEM_CLOCK, label, Maximum supported Memory clock

DCGM_FI_DEV_GPU_MAX_OP_TEMP, label, Maximum operating temperature
DCGM_FI_DEV_SLOWDOWN_TEMP, label, Slowdown temperature
DCGM_FI_DEV_SHUTDOWN_TEMP, label, Shutdown temperature

DCGM_FI_DEV_POWER_MGMT_LIMIT, label, Current Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN, label, Minimum Power limit
DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, label, Maximum Power limit
DCGM_FI_DEV_ENFORCED_POWER_LIMIT, label, Effective Power limit that the driver enforces after taking into account all limiters

DCGM_FI_DEV_FB_TOTAL, label, Total Frame buffer (in MB)

DCGM_FI_DEV_COUNT, label, Number of devices on the node
Loading

0 comments on commit b7bbfeb

Please sign in to comment.