Skip to content

Commit

Permalink
feat: Add metrics ipmi_dcmi_power_consumption_rate (#45)
Browse files Browse the repository at this point in the history
* feat: Add metrics ipmi_dcmi_power_consumption_rate

Add new ipmi-dcmi metrics
Rate = power consumption / maximum power consumption

* fix: Replace lshw with dmidecode

* feat: Add cache for dmidecode get_power_capacities

Change wording and add cache for dmidecode get_power_capacities

* test: Add unit tests

* test: Make test cases more comprehensive

* docs: Update doc-string

* docs: Update the command for max power capacity formula
  • Loading branch information
jneo8 authored Nov 24, 2023
1 parent 03a17a7 commit 586cc8f
Show file tree
Hide file tree
Showing 8 changed files with 261 additions and 7 deletions.
50 changes: 47 additions & 3 deletions prometheus_hardware_exporter/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
SessionCreationError,
)

from .collectors.ipmi_dcmi import IpmiDcmi
from .collectors.dmidecode import Dmidecode
from .collectors.ipmi_dcmi import IpmiDcmi, IpmiTool
from .collectors.ipmi_sel import IpmiSel
from .collectors.ipmimonitoring import IpmiMonitoring
from .collectors.perccli import PercCLI
Expand Down Expand Up @@ -324,13 +325,15 @@ class IpmiDcmiCollector(BlockingCollector):
"""Collector for ipmi dcmi metrics."""

ipmi_dcmi = IpmiDcmi()
ipmi_tool = IpmiTool()
dmidecode = Dmidecode()

@property
def specifications(self) -> List[Specification]:
"""Define dcmi metric specs."""
return [
Specification(
name="ipmi_dcmi_power_cosumption_watts",
name="ipmi_dcmi_power_consumption_watts",
documentation="Current power consumption in watts",
metric_class=GaugeMetricFamily,
),
Expand All @@ -339,6 +342,14 @@ def specifications(self) -> List[Specification]:
documentation="Indicates if the ipmi dcmi command is successful or not",
metric_class=GaugeMetricFamily,
),
Specification(
name="ipmi_dcmi_power_consumption_percentage",
documentation=(
"Current power capacity usage as a percentage of the overall PSU budget"
),
labels=["ps_redundancy", "get_ps_redundancy_ok", "maximum_power_capacity"],
metric_class=GaugeMetricFamily,
),
]

def fetch(self) -> List[Payload]:
Expand All @@ -349,11 +360,44 @@ def fetch(self) -> List[Payload]:
logger.error("Failed to fetch current power from ipmi dcmi")
return [Payload(name="ipmi_dcmi_command_success", value=0.0)]

get_ps_redundancy_ok, ps_redundancy = self.ipmi_tool.get_ps_redundancy()
# If we fail to get the redundancy config from the server,
# assume redundancy is enabled: this makes the denominator smaller,
# so the alert fires more easily.
if not get_ps_redundancy_ok:
ps_redundancy = True

power_capacities = self.dmidecode.get_power_capacities()
# If power supply redundancy is enabled, the server only uses one
# power supply at a time and the others are kept as backup, so we use
# the average capacity as the value of maximum_power_capacity.
# Note: We don't consider the situation where the power supplies'
# capacities differ on a single server.
maximum_power_capacity = (
(ps_redundancy and len(power_capacities) > 0)
and sum(power_capacities) / len(power_capacities)
or sum(power_capacities)
)

power_capacity_percentage = (
maximum_power_capacity
and current_power_payload["current_power"] / maximum_power_capacity
or 0
)

ps_redundancy_str = "1" if ps_redundancy else "0"
get_ps_redundancy_ok_str = "1" if get_ps_redundancy_ok else "0"

payloads = [
Payload(
name="ipmi_dcmi_power_cosumption_watts",
name="ipmi_dcmi_power_consumption_watts",
value=current_power_payload["current_power"],
),
Payload(
name="ipmi_dcmi_power_consumption_percentage",
value=power_capacity_percentage,
labels=[ps_redundancy_str, get_ps_redundancy_ok_str, str(maximum_power_capacity)],
),
Payload(name="ipmi_dcmi_command_success", value=1.0),
]
return payloads
Expand Down
33 changes: 33 additions & 0 deletions prometheus_hardware_exporter/collectors/dmidecode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Dmidecode metrics collector."""
import re
from functools import lru_cache
from logging import getLogger
from typing import List

from ..utils import Command

logger = getLogger(__name__)


MAX_POWER_CAPACITY_REGEX = r"(Max Power Capacity: )(\d+)( W)"


class Dmidecode(Command):
    """Command line tool for dmidecode."""

    prefix = ""
    command = "dmidecode"

    # Declared for type checkers only: the attribute is created on the instance
    # by the first successful get_power_capacities() call.
    _power_capacities: List[int]

    def get_power_capacities(self) -> List[int]:
        """Get list of power capacities.

        Calls the command with ``-t 39`` and collects every value matched by
        ``MAX_POWER_CAPACITY_REGEX`` ("Max Power Capacity: <n> W").

        PSU ratings won't change over the lifetime of a server, so the first
        successful result is remembered on the instance and later calls do not
        run the command again. A failed run is deliberately not remembered, so
        the next call retries instead of reporting an empty list forever.

        Returns:
            One integer per matched "Max Power Capacity" entry, in the order
            the entries appear in the output; an empty list if the command
            reported an error.
        """
        try:
            # Hand out a copy so callers cannot alter the remembered values.
            return list(self._power_capacities)
        except AttributeError:
            pass  # Nothing remembered yet: fall through and run the command.

        result = self("-t 39")
        if result.error:
            logger.error(result.error)
            return []

        # Each match is a tuple of the three regex groups; index 1 is the number.
        powers = [int(match[1]) for match in re.findall(MAX_POWER_CAPACITY_REGEX, result.data)]
        self._power_capacities = powers
        return list(powers)
28 changes: 27 additions & 1 deletion prometheus_hardware_exporter/collectors/ipmi_dcmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import re
from logging import getLogger
from typing import Dict
from typing import Dict, Tuple

from ..utils import Command

Expand All @@ -11,6 +11,32 @@
CURRENT_POWER_REGEX = re.compile(r"^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*")


class IpmiTool(Command):
    """Command line tool for ipmitool."""

    prefix = ""
    command = "ipmitool"

    def get_ps_redundancy(self) -> Tuple[bool, bool]:
        """Get power supply redundancy.

        Calls the command with ``sdr type "Power Supply" -c`` and inspects every
        comma-separated row whose first column contains "Redundancy".

        Returns:
            A tuple ``(ok, redundancy)``:

            - ok - True if fetching redundancy info is successful
            - redundancy - True if redundancy is enabled, i.e. every redundancy
              row reports "Fully Redundant". When the output contains no
              redundancy row at all this is also True.

            ``(False, False)`` is returned when the command reports an error or
            when a redundancy row has no status column.
        """
        result = self("""sdr type "Power Supply" -c""")
        if result.error:
            logger.error(result.error)
            return False, False

        statuses = []
        for line in result.data.splitlines():
            columns = line.split(",")
            if "Redundancy" not in columns[0]:
                continue
            if len(columns) < 5:
                # A truncated row cannot be interpreted; report the fetch as
                # failed rather than crashing on the missing column.
                logger.error("Unexpected ipmitool sdr output: %s", line)
                return False, False
            # column 4 is redundancy status
            statuses.append(columns[4].strip())
        return True, all(status == "Fully Redundant" for status in statuses)


class IpmiDcmi(Command):
"""Command line tool for ipmi dcmi."""

Expand Down
61 changes: 59 additions & 2 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ def test_30_ipmi_dcmi_collector_not_installed(self):
ipmi_dcmi_collector.ipmi_dcmi = Mock()
ipmi_dcmi_collector.ipmi_dcmi.installed = False
ipmi_dcmi_collector.ipmi_dcmi.get_current_power.return_value = {}

ipmi_dcmi_collector.ipmi_tool = Mock()
ipmi_dcmi_collector.dmidecode = Mock()
payloads = ipmi_dcmi_collector.collect()

self.assertEqual(len(list(payloads)), 1)
Expand All @@ -303,18 +306,72 @@ def test_31_ipmi_dcmi_collector_installed_and_okay(self):
"""Test ipmi dcmi collector can fetch correct number of metrics."""
ipmi_dcmi_collector = IpmiDcmiCollector(Mock())
ipmi_dcmi_collector.ipmi_dcmi = Mock()
ipmi_dcmi_collector.ipmi_tool = Mock()
ipmi_dcmi_collector.dmidecode = Mock()

mock_dcmi_payload = {"current_power": 105}

ipmi_dcmi_collector.ipmi_dcmi.get_current_power.return_value = mock_dcmi_payload
ipmi_dcmi_collector.ipmi_tool.get_ps_redundancy.return_value = (True, True)
ipmi_dcmi_collector.dmidecode.get_power_capacities.return_value = [1000, 1000]

payloads = ipmi_dcmi_collector.collect()
payloads = list(ipmi_dcmi_collector.collect())

available_metrics = [spec.name for spec in ipmi_dcmi_collector.specifications]
self.assertEqual(len(list(payloads)), len(available_metrics))
self.assertEqual(len(payloads), len(available_metrics))
for payload in payloads:
self.assertIn(payload.name, available_metrics)

def test_32_ipmi_dcmi_collector_get_ps_redundancy_not_ok(self):
    """Test ipmi dcmi collector when fetching ps_redundancy is not ok."""
    collector = IpmiDcmiCollector(Mock())

    # Replace every command line tool the collector talks to with a mock.
    collector.ipmi_dcmi = Mock()
    collector.ipmi_tool = Mock()
    collector.dmidecode = Mock()
    collector.ipmi_dcmi.get_current_power.return_value = {"current_power": 105}
    collector.ipmi_tool.get_ps_redundancy.return_value = (False, False)
    collector.dmidecode.get_power_capacities.return_value = [1000, 1000]

    collected = list(collector.collect())

    # Every specified metric must be reported, and nothing else.
    expected_names = [spec.name for spec in collector.specifications]
    self.assertEqual(len(collected), len(expected_names))
    for metric in collected:
        self.assertIn(metric.name, expected_names)

def test_33_ipmi_dcmi_collector_power_consumption_percentage_valid(self):
    """Test the power consumption percentage for each redundancy outcome."""
    ipmi_dcmi_collector = IpmiDcmiCollector(Mock())
    ipmi_dcmi_collector.ipmi_dcmi = Mock()
    ipmi_dcmi_collector.ipmi_tool = Mock()
    ipmi_dcmi_collector.dmidecode = Mock()

    mock_dcmi_payload = {"current_power": 105}
    ipmi_dcmi_collector.ipmi_dcmi.get_current_power.return_value = mock_dcmi_payload

    # Each case is ((fetch ok, redundancy enabled), power capacities, expected value).
    cases = [
        # Expect value should be current_power / average power_capacities
        ((True, True), [1000, 1000], 105 / 1000),
        ((False, True), [1000, 1000], 105 / 1000),
        ((False, False), [1000, 2000], 105 / ((1000 + 2000) / 2)),
        # Expect value should be current_power / sum power_capacities
        ((True, False), [1000, 1000], 105 / 2000),
    ]
    for redundancy_result, power_capacities, expect in cases:
        # subTest reports every failing case instead of stopping at the first.
        with self.subTest(
            redundancy_result=redundancy_result,
            power_capacities=power_capacities,
        ):
            ipmi_dcmi_collector.ipmi_tool.get_ps_redundancy.return_value = redundancy_result
            ipmi_dcmi_collector.dmidecode.get_power_capacities.return_value = power_capacities

            values = [
                payload.samples[0].value
                for payload in ipmi_dcmi_collector.collect()
                if payload.name == "ipmi_dcmi_power_consumption_percentage"
            ]

            # The metric must be present, and every reported value must match.
            self.assertTrue(values)
            for value in values:
                self.assertEqual(value, expect)

def test_40_ipmi_sel_not_installed(self):
"""Test ipmi sel collector when ipmi sel is not installed."""
ipmi_sel_collector = IpmiSelCollector(Mock())
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/test_dmidecode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import unittest
from unittest.mock import patch

from prometheus_hardware_exporter.collectors.dmidecode import Dmidecode
from prometheus_hardware_exporter.utils import Command, Result

TYPE_39_OUTPUT = "tests/unit/test_resources/dmidecode/dmidecode_type_39_output.txt"


class TestDmidecode(unittest.TestCase):
    """Test the Dmidecode class."""

    @patch.object(Command, "__call__")
    def test_00_get_power_capacities_success(self, mock_call):
        """Capacities are parsed from the sample type 39 output."""
        with open(TYPE_39_OUTPUT, "r") as sample:
            sample_output = sample.read()
        mock_call.return_value = Result(sample_output, None)

        self.assertEqual(Dmidecode().get_power_capacities(), [1400, 1400])

    @patch.object(Command, "__call__")
    def test_01_get_power_capacities_error(self, mock_call):
        """An empty list is returned when the command reports an error."""
        mock_call.return_value = Result("", True)

        self.assertEqual(Dmidecode().get_power_capacities(), [])
31 changes: 30 additions & 1 deletion tests/unit/test_ipmi_dcmi.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import unittest
from unittest.mock import patch

from prometheus_hardware_exporter.collectors.ipmi_dcmi import IpmiDcmi
from prometheus_hardware_exporter.collectors.ipmi_dcmi import IpmiDcmi, IpmiTool
from prometheus_hardware_exporter.utils import Command, Result

DCMI_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmi_dcmi_sample_output.txt"
IPMITOOL_SDR_PS_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmitool_sdr_ps_sample_output.txt"


class TestIpmiDcmi(unittest.TestCase):
Expand All @@ -31,3 +32,31 @@ def test_01_get_current_power_parse_failure(self, mock_call):
ipmi_dcmi = IpmiDcmi()
payload = ipmi_dcmi.get_current_power()
self.assertEqual(payload, {})


class TestIpmiTool(unittest.TestCase):
    """Test the IpmiTool class."""

    @patch.object(Command, "__call__")
    def test_00_get_ps_redundancy_success(self, mock_call):
        """The sample output is reported as fetched and fully redundant."""
        with open(IPMITOOL_SDR_PS_SAMPLE_OUTPUT, "r") as sample:
            sample_output = sample.read()
        mock_call.return_value = Result(sample_output, None)

        self.assertEqual(IpmiTool().get_ps_redundancy(), (True, True))

    @patch.object(Command, "__call__")
    def test_01_get_ps_redundancy_error(self, mock_call):
        """A command error is reported as (False, False)."""
        mock_call.return_value = Result("", True)

        self.assertEqual(IpmiTool().get_ps_redundancy(), (False, False))

    @patch.object(Command, "__call__")
    def test_02_get_ps_redundancy_success_redundancy_disable(self, mock_call):
        """A status other than "Fully Redundant" is reported as not redundant."""
        with open(IPMITOOL_SDR_PS_SAMPLE_OUTPUT, "r") as sample:
            sample_output = sample.read()
        not_redundant_output = sample_output.replace("Fully Redundant", "Not Fully Redundant")
        mock_call.return_value = Result(not_redundant_output, None)

        self.assertEqual(IpmiTool().get_ps_redundancy(), (True, False))
35 changes: 35 additions & 0 deletions tests/unit/test_resources/dmidecode/dmidecode_type_39_output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# dmidecode 3.3
Getting SMBIOS data from sysfs.
SMBIOS 3.3.0 present.

Handle 0x2700, DMI type 39, 22 bytes
System Power Supply
Location: Not Specified
Name: PWR SPLY,1400W,RDNT,LTON
Manufacturer: DELL
Serial Number: CNLOD0019M3A6D
Asset Tag: Not Specified
Model Part Number: 01CW9GA04
Revision: Not Specified
Max Power Capacity: 1400 W
Status: Present, Unknown
Type: Unknown
Input Voltage Range Switching: Unknown
Plugged: Yes
Hot Replaceable: Yes

Handle 0x2701, DMI type 39, 22 bytes
System Power Supply
Location: Not Specified
Name: PWR SPLY,1400W,RDNT,LTON
Manufacturer: DELL
Serial Number: CNLOD0019M369D
Asset Tag: Not Specified
Model Part Number: 01CW9GA04
Revision: Not Specified
Max Power Capacity: 1400 W
Status: Present, Unknown
Type: Unknown
Input Voltage Range Switching: Unknown
Plugged: Yes
Hot Replaceable: Yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
PS Redundancy,77h,ok,7.1,Fully Redundant
Status,85h,ok,10.1,Presence detected
Status,86h,ok,10.2,Presence detected

0 comments on commit 586cc8f

Please sign in to comment.