From 586cc8f1d745cde9a22e437ad635dec88f71c6c6 Mon Sep 17 00:00:00 2001
From: james_lin
Date: Fri, 24 Nov 2023 18:26:13 +0800
Subject: [PATCH] feat: Add metrics ipmi_dcmi_power_consumption_rate (#45)

* feat: Add metrics ipmi_dcmi_power_consumption_rate

Add new ipmi-dcmi metrics
Rate = power consumption / maximum power consumption

* fix: Replace lshw with dmidecode

* feat: Add cache for dmidecode get_power_capacities

Change wording and add cache for dmidecode get_power_capacities

* test: Add unit tests

* test: Make test cases more comprehensive

* docs: Update doc-string

* docs: Update the command for max power capacity formula
---
 prometheus_hardware_exporter/collector.py     | 51 ++++++++++++++-
 .../collectors/dmidecode.py                   | 41 ++++++++++++
 .../collectors/ipmi_dcmi.py                   | 30 ++++++++-
 tests/unit/test_collector.py                  | 61 ++++++++++++++++++-
 tests/unit/test_dmidecode.py                  | 44 +++++++++++++
 tests/unit/test_ipmi_dcmi.py                  | 31 +++++++++-
 .../dmidecode/dmidecode_type_39_output.txt    | 35 +++++++++++
 .../ipmi/ipmitool_sdr_ps_sample_output.txt    |  3 +
 8 files changed, 289 insertions(+), 7 deletions(-)
 create mode 100644 prometheus_hardware_exporter/collectors/dmidecode.py
 create mode 100644 tests/unit/test_dmidecode.py
 create mode 100644 tests/unit/test_resources/dmidecode/dmidecode_type_39_output.txt
 create mode 100644 tests/unit/test_resources/ipmi/ipmitool_sdr_ps_sample_output.txt

diff --git a/prometheus_hardware_exporter/collector.py b/prometheus_hardware_exporter/collector.py
index 2d66999..aa669ff 100644
--- a/prometheus_hardware_exporter/collector.py
+++ b/prometheus_hardware_exporter/collector.py
@@ -10,7 +10,8 @@
     SessionCreationError,
 )
 
-from .collectors.ipmi_dcmi import IpmiDcmi
+from .collectors.dmidecode import Dmidecode
+from .collectors.ipmi_dcmi import IpmiDcmi, IpmiTool
 from .collectors.ipmi_sel import IpmiSel
 from .collectors.ipmimonitoring import IpmiMonitoring
 from .collectors.perccli import PercCLI
@@ -324,13 +325,15 @@ class IpmiDcmiCollector(BlockingCollector):
     """Collector for ipmi dcmi metrics."""
 
     ipmi_dcmi = IpmiDcmi()
+    ipmi_tool = IpmiTool()
+    dmidecode = Dmidecode()
 
     @property
     def specifications(self) -> List[Specification]:
         """Define dcmi metric specs."""
         return [
             Specification(
-                name="ipmi_dcmi_power_cosumption_watts",
+                name="ipmi_dcmi_power_consumption_watts",
                 documentation="Current power consumption in watts",
                 metric_class=GaugeMetricFamily,
             ),
@@ -339,6 +342,14 @@ def specifications(self) -> List[Specification]:
                 documentation="Indicates if the ipmi dcmi command is successful or not",
                 metric_class=GaugeMetricFamily,
             ),
+            Specification(
+                name="ipmi_dcmi_power_consumption_percentage",
+                documentation=(
+                    "Current power capacity usage as a percentage of the overall PSU budget"
+                ),
+                labels=["ps_redundancy", "get_ps_redundancy_ok", "maximum_power_capacity"],
+                metric_class=GaugeMetricFamily,
+            ),
         ]
 
     def fetch(self) -> List[Payload]:
@@ -349,11 +360,45 @@ def fetch(self) -> List[Payload]:
             logger.error("Failed to fetch current power from ipmi dcmi")
             return [Payload(name="ipmi_dcmi_command_success", value=0.0)]
 
+        get_ps_redundancy_ok, ps_redundancy = self.ipmi_tool.get_ps_redundancy()
+        # If the redundancy state cannot be read from the server, assume that
+        # redundancy is enabled: this gives the smaller denominator, so the
+        # reported usage is higher and alerts fire earlier rather than later.
+        if not get_ps_redundancy_ok:
+            ps_redundancy = True
+
+        power_capacities = self.dmidecode.get_power_capacities()
+        # With redundancy enabled only one PSU carries the load while the others
+        # are kept as backup, so the usable budget is the capacity of a single
+        # PSU; the average capacity is used for that. Without redundancy every
+        # PSU contributes and the budget is the sum of all capacities.
+        # Note: PSUs with different capacities on one server are not handled.
+        if ps_redundancy and power_capacities:
+            maximum_power_capacity = sum(power_capacities) / len(power_capacities)
+        else:
+            maximum_power_capacity = sum(power_capacities)
+
+        # Without PSU information (or with a zero rating) the ratio is undefined;
+        # report 0 instead of dividing by zero.
+        power_capacity_percentage = (
+            current_power_payload["current_power"] / maximum_power_capacity
+            if maximum_power_capacity
+            else 0
+        )
+
+        ps_redundancy_str = "1" if ps_redundancy else "0"
+        get_ps_redundancy_ok_str = "1" if get_ps_redundancy_ok else "0"
+
         payloads = [
             Payload(
-                name="ipmi_dcmi_power_cosumption_watts",
+                name="ipmi_dcmi_power_consumption_watts",
                 value=current_power_payload["current_power"],
             ),
+            Payload(
+                name="ipmi_dcmi_power_consumption_percentage",
+                value=power_capacity_percentage,
+                labels=[ps_redundancy_str, get_ps_redundancy_ok_str, str(maximum_power_capacity)],
+            ),
             Payload(name="ipmi_dcmi_command_success", value=1.0),
         ]
         return payloads
diff --git a/prometheus_hardware_exporter/collectors/dmidecode.py b/prometheus_hardware_exporter/collectors/dmidecode.py
new file mode 100644
index 0000000..c01e33a
--- /dev/null
+++ b/prometheus_hardware_exporter/collectors/dmidecode.py
@@ -0,0 +1,41 @@
+"""Dmidecode metrics collector."""
+import re
+from logging import getLogger
+from typing import List, Optional
+
+from ..utils import Command
+
+logger = getLogger(__name__)
+
+
+MAX_POWER_CAPACITY_REGEX = r"(Max Power Capacity: )(\d+)( W)"
+
+
+class Dmidecode(Command):
+    """Command line tool for dmidecode."""
+
+    prefix = ""
+    command = "dmidecode"
+
+    # PSU ratings won't change over the lifetime of a server, so a successful
+    # reading is kept and reused. Failed readings are deliberately not cached.
+    _power_capacities: Optional[List[int]] = None
+
+    def get_power_capacities(self) -> List[int]:
+        """Get the maximum power capacity, in watts, of every power supply.
+
+        Returns an empty list when the dmidecode command fails; a failure is
+        not cached, so the command is run again on the next call.
+        """
+        if self._power_capacities is not None:
+            return self._power_capacities
+
+        result = self("-t 39")
+        if result.error:
+            logger.error(result.error)
+            return []
+
+        matches = re.findall(MAX_POWER_CAPACITY_REGEX, result.data)
+        # Each match is a tuple of the regex groups; group 2 holds the number.
+        self._power_capacities = [int(match[1]) for match in matches]
+        return self._power_capacities
diff --git a/prometheus_hardware_exporter/collectors/ipmi_dcmi.py b/prometheus_hardware_exporter/collectors/ipmi_dcmi.py
index af66f7d..62f60eb 100644
--- a/prometheus_hardware_exporter/collectors/ipmi_dcmi.py
+++ b/prometheus_hardware_exporter/collectors/ipmi_dcmi.py
@@ -2,7 +2,7 @@
 
 import re
 from logging import getLogger
-from typing import Dict
+from typing import Dict, Tuple
 
 from ..utils import Command
 
@@ -11,6 +11,34 @@
 CURRENT_POWER_REGEX = re.compile(r"^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*")
 
 
+class IpmiTool(Command):
+    """Command line tool for ipmitool."""
+
+    prefix = ""
+    command = "ipmitool"
+
+    def get_ps_redundancy(self) -> Tuple[bool, bool]:
+        """Get power supply redundancy.
+
+        returns:
+        - ok - True if fetching redundancy info is successful
+        - redundancy - True if every redundancy sensor reports "Fully Redundant"
+          (also True when no redundancy sensor is reported at all)
+        """
+        result = self("""sdr type "Power Supply" -c""")
+        if result.error:
+            logger.error(result.error)
+            return False, False
+        statuses = []
+        for line in result.data.splitlines():
+            columns = line.split(",")
+            # Column 0 is the sensor name and column 4 is the redundancy status;
+            # skip malformed rows instead of raising IndexError.
+            if "Redundancy" in columns[0] and len(columns) > 4:
+                statuses.append(columns[4])
+        return True, all(status == "Fully Redundant" for status in statuses)
+
+
 class IpmiDcmi(Command):
     """Command line tool for ipmi dcmi."""
 
diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
index 2422411..20d33a9 100644
--- a/tests/unit/test_collector.py
+++ b/tests/unit/test_collector.py
@@ -295,6 +295,9 @@ def test_30_ipmi_dcmi_collector_not_installed(self):
         ipmi_dcmi_collector.ipmi_dcmi = Mock()
         ipmi_dcmi_collector.ipmi_dcmi.installed = False
         ipmi_dcmi_collector.ipmi_dcmi.get_current_power.return_value = {}
+
+        ipmi_dcmi_collector.ipmi_tool = Mock()
+        ipmi_dcmi_collector.dmidecode = Mock()
         payloads = ipmi_dcmi_collector.collect()
 
         self.assertEqual(len(list(payloads)), 1)
@@ -303,18 +306,72 @@ def test_31_ipmi_dcmi_collector_installed_and_okay(self):
         """Test ipmi dcmi collector can fetch correct number of metrics."""
         ipmi_dcmi_collector = IpmiDcmiCollector(Mock())
         ipmi_dcmi_collector.ipmi_dcmi = Mock()
+        ipmi_dcmi_collector.ipmi_tool = Mock()
+        ipmi_dcmi_collector.dmidecode = Mock()
 
         mock_dcmi_payload = {"current_power": 105}
 
         ipmi_dcmi_collector.ipmi_dcmi.get_current_power.return_value = mock_dcmi_payload
+        ipmi_dcmi_collector.ipmi_tool.get_ps_redundancy.return_value = (True, True)
+        ipmi_dcmi_collector.dmidecode.get_power_capacities.return_value = [1000, 1000]
 
-        payloads = ipmi_dcmi_collector.collect()
+        payloads = list(ipmi_dcmi_collector.collect())
 
         available_metrics = [spec.name for spec in ipmi_dcmi_collector.specifications]
-        self.assertEqual(len(list(payloads)), len(available_metrics))
+        self.assertEqual(len(payloads), len(available_metrics))
         for payload in payloads:
             self.assertIn(payload.name, available_metrics)
 
+    def test_32_ipmi_dcmi_collector_get_ps_redundancy_not_ok(self):
+        """Test ipmi dcmi collector with ps_redundancy return is not ok."""
+        ipmi_dcmi_collector = IpmiDcmiCollector(Mock())
+        ipmi_dcmi_collector.ipmi_dcmi = Mock()
+        ipmi_dcmi_collector.ipmi_tool = Mock()
+        ipmi_dcmi_collector.dmidecode = Mock()
+
+        mock_dcmi_payload = {"current_power": 105}
+
+        ipmi_dcmi_collector.ipmi_dcmi.get_current_power.return_value = mock_dcmi_payload
+        ipmi_dcmi_collector.ipmi_tool.get_ps_redundancy.return_value = (False, False)
+        ipmi_dcmi_collector.dmidecode.get_power_capacities.return_value = [1000, 1000]
+
+        payloads = list(ipmi_dcmi_collector.collect())
+
+        available_metrics = [spec.name for spec in ipmi_dcmi_collector.specifications]
+        self.assertEqual(len(payloads), len(available_metrics))
+        for payload in payloads:
+            self.assertIn(payload.name, available_metrics)
+
+    def test_33_ipmi_dcmi_collector_power_consumption_percentage_valid(self):
+        """Test ipmi dcmi collector reports the expected power consumption ratio."""
+        ipmi_dcmi_collector = IpmiDcmiCollector(Mock())
+        ipmi_dcmi_collector.ipmi_dcmi = Mock()
+        ipmi_dcmi_collector.ipmi_tool = Mock()
+        ipmi_dcmi_collector.dmidecode = Mock()
+
+        mock_dcmi_payload = {"current_power": 105}
+
+        for ps_redundancy, power_capacities, expect in [
+            # Expect value should be current_power / average power_capacities
+            ((True, True), [1000, 1000], 105 / 1000),
+            ((False, True), [1000, 1000], 105 / 1000),
+            ((False, False), [1000, 2000], 105 / ((1000 + 2000) / 2)),
+            # Expect value should be current_power / sum power_capacities
+            ((True, False), [1000, 1000], 105 / 2000),
+        ]:
+            ipmi_dcmi_collector.ipmi_dcmi.get_current_power.return_value = mock_dcmi_payload
+            ipmi_dcmi_collector.ipmi_tool.get_ps_redundancy.return_value = ps_redundancy
+            ipmi_dcmi_collector.dmidecode.get_power_capacities.return_value = power_capacities
+
+            payloads = ipmi_dcmi_collector.collect()
+
+            payload_exists = False
+            for payload in payloads:
+                if payload.name == "ipmi_dcmi_power_consumption_percentage":
+                    payload_exists = True
+                    self.assertEqual(payload.samples[0].value, expect)
+            self.assertTrue(payload_exists)
+
     def test_40_ipmi_sel_not_installed(self):
         """Test ipmi sel collector when ipmi sel is not installed."""
         ipmi_sel_collector = IpmiSelCollector(Mock())
diff --git a/tests/unit/test_dmidecode.py b/tests/unit/test_dmidecode.py
new file mode 100644
index 0000000..6128959
--- /dev/null
+++ b/tests/unit/test_dmidecode.py
@@ -0,0 +1,44 @@
+import unittest
+from unittest.mock import patch
+
+from prometheus_hardware_exporter.collectors.dmidecode import Dmidecode
+from prometheus_hardware_exporter.utils import Command, Result
+
+TYPE_39_OUTPUT = "tests/unit/test_resources/dmidecode/dmidecode_type_39_output.txt"
+
+
+class TestDmidecode(unittest.TestCase):
+    """Test the Dmidecode class."""
+
+    @patch.object(Command, "__call__")
+    def test_00_get_power_capacities_success(self, mock_call):
+        with open(TYPE_39_OUTPUT, "r") as content:
+            mock_call.return_value = Result(content.read(), None)
+            dmidecode = Dmidecode()
+            power_capacities = dmidecode.get_power_capacities()
+            self.assertEqual(power_capacities, [1400, 1400])
+
+    @patch.object(Command, "__call__")
+    def test_01_get_power_capacities_error(self, mock_call):
+        mock_call.return_value = Result("", True)
+
+        dmidecode = Dmidecode()
+        power_capacities = dmidecode.get_power_capacities()
+        self.assertEqual(power_capacities, [])
+
+    @patch.object(Command, "__call__")
+    def test_02_get_power_capacities_cache(self, mock_call):
+        dmidecode = Dmidecode()
+
+        # A failed reading must not be cached.
+        mock_call.return_value = Result("", True)
+        self.assertEqual(dmidecode.get_power_capacities(), [])
+
+        with open(TYPE_39_OUTPUT, "r") as content:
+            mock_call.return_value = Result(content.read(), None)
+        self.assertEqual(dmidecode.get_power_capacities(), [1400, 1400])
+
+        # A successful reading is cached, so the command is not run again.
+        mock_call.reset_mock()
+        self.assertEqual(dmidecode.get_power_capacities(), [1400, 1400])
+        mock_call.assert_not_called()
diff --git a/tests/unit/test_ipmi_dcmi.py b/tests/unit/test_ipmi_dcmi.py
index bca89d9..b577d24 100644
--- a/tests/unit/test_ipmi_dcmi.py
+++ b/tests/unit/test_ipmi_dcmi.py
@@ -1,10 +1,11 @@
 import unittest
 from unittest.mock import patch
 
-from prometheus_hardware_exporter.collectors.ipmi_dcmi import IpmiDcmi
+from prometheus_hardware_exporter.collectors.ipmi_dcmi import IpmiDcmi, IpmiTool
 from prometheus_hardware_exporter.utils import Command, Result
 
 DCMI_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmi_dcmi_sample_output.txt"
+IPMITOOL_SDR_PS_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmitool_sdr_ps_sample_output.txt"
 
 
 class TestIpmiDcmi(unittest.TestCase):
@@ -31,3 +32,31 @@ def test_01_get_current_power_parse_failure(self, mock_call):
         ipmi_dcmi = IpmiDcmi()
         payload = ipmi_dcmi.get_current_power()
         self.assertEqual(payload, {})
+
+
+class TestIpmiTool(unittest.TestCase):
+    """Test the IpmiTool class."""
+
+    @patch.object(Command, "__call__")
+    def test_00_get_ps_redundancy_success(self, mock_call):
+        with open(IPMITOOL_SDR_PS_SAMPLE_OUTPUT, "r") as content:
+            mock_call.return_value = Result(content.read(), None)
+            ipmitool = IpmiTool()
+            ps_redundancy = ipmitool.get_ps_redundancy()
+            self.assertEqual(ps_redundancy, (True, True))
+
+    @patch.object(Command, "__call__")
+    def test_01_get_ps_redundancy_error(self, mock_call):
+        mock_call.return_value = Result("", True)
+        ipmitool = IpmiTool()
+        ps_redundancy = ipmitool.get_ps_redundancy()
+        self.assertEqual(ps_redundancy, (False, False))
+
+    @patch.object(Command, "__call__")
+    def test_02_get_ps_redundancy_success_redundancy_disable(self, mock_call):
+        with open(IPMITOOL_SDR_PS_SAMPLE_OUTPUT, "r") as content:
+            data = content.read().replace("Fully Redundant", "Not Fully Redundant")
+            mock_call.return_value = Result(data, None)
+            ipmitool = IpmiTool()
+            ps_redundancy = ipmitool.get_ps_redundancy()
+            self.assertEqual(ps_redundancy, (True, False))
diff --git a/tests/unit/test_resources/dmidecode/dmidecode_type_39_output.txt b/tests/unit/test_resources/dmidecode/dmidecode_type_39_output.txt
new file mode 100644
index 0000000..c065022
--- /dev/null
+++ b/tests/unit/test_resources/dmidecode/dmidecode_type_39_output.txt
@@ -0,0 +1,35 @@
+# dmidecode 3.3
+Getting SMBIOS data from sysfs.
+SMBIOS 3.3.0 present.
+
+Handle 0x2700, DMI type 39, 22 bytes
+System Power Supply
+	Location: Not Specified
+	Name: PWR SPLY,1400W,RDNT,LTON
+	Manufacturer: DELL
+	Serial Number: CNLOD0019M3A6D
+	Asset Tag: Not Specified
+	Model Part Number: 01CW9GA04
+	Revision: Not Specified
+	Max Power Capacity: 1400 W
+	Status: Present, Unknown
+	Type: Unknown
+	Input Voltage Range Switching: Unknown
+	Plugged: Yes
+	Hot Replaceable: Yes
+
+Handle 0x2701, DMI type 39, 22 bytes
+System Power Supply
+	Location: Not Specified
+	Name: PWR SPLY,1400W,RDNT,LTON
+	Manufacturer: DELL
+	Serial Number: CNLOD0019M369D
+	Asset Tag: Not Specified
+	Model Part Number: 01CW9GA04
+	Revision: Not Specified
+	Max Power Capacity: 1400 W
+	Status: Present, Unknown
+	Type: Unknown
+	Input Voltage Range Switching: Unknown
+	Plugged: Yes
+	Hot Replaceable: Yes
diff --git a/tests/unit/test_resources/ipmi/ipmitool_sdr_ps_sample_output.txt b/tests/unit/test_resources/ipmi/ipmitool_sdr_ps_sample_output.txt
new file mode 100644
index 0000000..6d889df
--- /dev/null
+++ b/tests/unit/test_resources/ipmi/ipmitool_sdr_ps_sample_output.txt
@@ -0,0 +1,3 @@
+PS Redundancy,77h,ok,7.1,Fully Redundant
+Status,85h,ok,10.1,Presence detected
+Status,86h,ok,10.2,Presence detected