Skip to content

Commit

Permalink
fix(ipmi-sel): ipmi-sel error handling (#29)
Browse files Browse the repository at this point in the history
* fix(ipmi-sel): Handle postinit record

The PostInit record which doesn't have expected datatime format output

* fix(ipmi_sel): ipmi_sel_command_success statement

Return None if command fail instead of empty list

* style: Change log message when ipmi_sel command failing
  • Loading branch information
jneo8 authored Jul 14, 2023
1 parent fa83b9c commit 743402c
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 13 deletions.
4 changes: 2 additions & 2 deletions prometheus_hardware_exporter/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,8 +531,8 @@ def fetch(self) -> List[Payload]:
"""Load ipmi sel entries."""
sel_entries = self.ipmi_sel.get_sel_entries(self.config.ipmi_sel_interval)

if not sel_entries:
logger.warning("No recent ipmi sel entries to collect.")
if sel_entries is None:
logger.warning("Fail to get ipmi sel entries.")
return [Payload(name="ipmi_sel_command_success", value=0.0)]

sel_states_dict = {"NOMINAL": 0, "WARNING": 1, "CRITICAL": 2}
Expand Down
19 changes: 11 additions & 8 deletions prometheus_hardware_exporter/collectors/ipmi_sel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import datetime
from logging import getLogger
from typing import Dict, List
from typing import Dict, List, Optional

from ..utils import Command

Expand All @@ -15,7 +15,7 @@ class IpmiSel(Command):
prefix = ""
command = "ipmi-sel"

def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]:
def get_sel_entries(self, time_range: int) -> Optional[List[Dict[str, str]]]:
"""Get SEL entries along with state.
:param time_range int: Time in seconds, to determine from how far back the SEL
Expand All @@ -26,7 +26,7 @@ def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]:
result = self("--output-event-state --interpret-oem-data --entity-sensor-names")
if result.error:
logger.error(result.error)
return []
return None

oldest_log_time = datetime.datetime.now() - datetime.timedelta(seconds=time_range)

Expand All @@ -37,10 +37,13 @@ def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]:
sel_item_values = sel_item.split("|")
sel_item_values = [entry.strip() for entry in sel_item_values]
sel_item_dict = dict(zip(sel_data_fields, sel_item_values))
sel_item_datetime_str = sel_item_dict["Date"] + sel_item_dict["Time"]
sel_item_datetime = datetime.datetime.strptime(
sel_item_datetime_str, "%b-%d-%Y%H:%M:%S"
)
if sel_item_datetime > oldest_log_time:
if sel_item_dict["Date"] == "PostInit":
sel_entries.append(sel_item_dict)
else:
sel_item_datetime_str = sel_item_dict["Date"] + sel_item_dict["Time"]
sel_item_datetime = datetime.datetime.strptime(
sel_item_datetime_str, "%b-%d-%Y%H:%M:%S"
)
if sel_item_datetime > oldest_log_time:
sel_entries.append(sel_item_dict)
return sel_entries
11 changes: 11 additions & 0 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,17 @@ def test_41_ipmi_sel_installed_and_okay(self):

self.assertDictEqual(payloads_labels_value_map, expected_payloads_label_value_map)

def test_42_ipmi_sel_cmd_fail(self):
"""Test ipmi sel collector when ipmi sel is not installed."""
ipmi_sel_collector = IpmiSelCollector(Mock())
ipmi_sel_collector.ipmi_sel = Mock()

ipmi_sel_collector.ipmi_sel.get_sel_entries.return_value = None

payloads = list(ipmi_sel_collector.collect())
assert payloads[0].name == "ipmi_sel_command_success"
assert payloads[0].samples[0].value == 0.0

def test_50_ipmimonitoring_not_installed(self):
"""Test ipmi sensor collector when ipmimonitoring is not installed."""
ipmi_sensors_collector = IpmiSensorsCollector(Mock())
Expand Down
21 changes: 19 additions & 2 deletions tests/unit/test_ipmi_sel.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@

SEL_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt"
SAMPLE_SEL_ENTRIES = [
{
"ID": "14",
"Date": "PostInit",
"Time": "PostInit",
"Name": "Disk Drive Bay 1 Cable SAS A",
"Type": "Cable/Interconnect",
"State": "Critical",
"Event": "Configuration Error - Incorrect cable connected",
},
{
"ID": "494",
"Date": "Jul-09-2023",
Expand Down Expand Up @@ -40,11 +49,19 @@ def test_00_get_sel_entries_success(self, mock_call):
ipmi_sel = IpmiSel()
payloads = ipmi_sel.get_sel_entries(24 * 60 * 60)
expected_sel_entries = SAMPLE_SEL_ENTRIES
print(payloads)
self.assertEqual(payloads, expected_sel_entries)

@patch.object(Command, "__call__")
def test_01_get_sel_entries_error(self, mock_call):
mock_call.return_value = Result("", True)
def test_01_get_sel_entries_zero_records(self, mock_call):
mock_call.return_value = Result("", None)
ipmi_sel = IpmiSel()
payloads = ipmi_sel.get_sel_entries(300)
self.assertEqual(payloads, [])

@patch.object(Command, "__call__")
def test_02_get_sel_entries_error(self, mock_call):
mock_call.return_value = Result("", Exception())
ipmi_sel = IpmiSel()
payloads = ipmi_sel.get_sel_entries(300)
self.assertEqual(payloads, None)
3 changes: 2 additions & 1 deletion tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
ID | Date | Time | Name | Type | State | Event
14 | PostInit | PostInit | Disk Drive Bay 1 Cable SAS A | Cable/Interconnect | Critical | Configuration Error - Incorrect cable connected
493 | Oct-06-2022 | 19:47:13 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0
494 | Jul-09-2023 | 13:56:23 | System Chassis SysHealth_Stat | Chassis | Critical | transition to Non-recoverable from less severe
495 | Oct-06-2022 | 19:57:38 | System Board ACPI_Stat | System ACPI Power State | Nominal | S4/S5 soft-off
496 | Jul-09-2023 | 13:57:50 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0
496 | Jul-09-2023 | 13:57:50 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0

0 comments on commit 743402c

Please sign in to comment.