From 743402c6b3c0fd26cfc5d1064d97f9675f92b0ad Mon Sep 17 00:00:00 2001 From: james_lin Date: Fri, 14 Jul 2023 17:57:56 +0800 Subject: [PATCH] fix(ipmi-sel): ipmi-sel error handling (#29) * fix(ipmi-sel): Handle postinit record The PostInit record which doesn't have expected datatime format output * fix(ipmi_sel): ipmi_sel_command_success statement Return None if command fail instead of empty list * style: Change log message when ipmi_sel command failing --- prometheus_hardware_exporter/collector.py | 4 ++-- .../collectors/ipmi_sel.py | 19 ++++++++++------- tests/unit/test_collector.py | 11 ++++++++++ tests/unit/test_ipmi_sel.py | 21 +++++++++++++++++-- .../ipmi/ipmi_sel_sample_output.txt | 3 ++- 5 files changed, 45 insertions(+), 13 deletions(-) diff --git a/prometheus_hardware_exporter/collector.py b/prometheus_hardware_exporter/collector.py index f9692a1..d0435f2 100644 --- a/prometheus_hardware_exporter/collector.py +++ b/prometheus_hardware_exporter/collector.py @@ -531,8 +531,8 @@ def fetch(self) -> List[Payload]: """Load ipmi sel entries.""" sel_entries = self.ipmi_sel.get_sel_entries(self.config.ipmi_sel_interval) - if not sel_entries: - logger.warning("No recent ipmi sel entries to collect.") + if sel_entries is None: + logger.warning("Fail to get ipmi sel entries.") return [Payload(name="ipmi_sel_command_success", value=0.0)] sel_states_dict = {"NOMINAL": 0, "WARNING": 1, "CRITICAL": 2} diff --git a/prometheus_hardware_exporter/collectors/ipmi_sel.py b/prometheus_hardware_exporter/collectors/ipmi_sel.py index dd4bc14..b492985 100644 --- a/prometheus_hardware_exporter/collectors/ipmi_sel.py +++ b/prometheus_hardware_exporter/collectors/ipmi_sel.py @@ -2,7 +2,7 @@ import datetime from logging import getLogger -from typing import Dict, List +from typing import Dict, List, Optional from ..utils import Command @@ -15,7 +15,7 @@ class IpmiSel(Command): prefix = "" command = "ipmi-sel" - def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]: + def get_sel_entries(self, time_range: int) -> Optional[List[Dict[str, str]]]: """Get SEL entries along with state. :param time_range int: Time in seconds, to determine from how far back the SEL @@ -26,7 +26,7 @@ def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]: result = self("--output-event-state --interpret-oem-data --entity-sensor-names") if result.error: logger.error(result.error) - return [] + return None oldest_log_time = datetime.datetime.now() - datetime.timedelta(seconds=time_range) @@ -37,10 +37,13 @@ def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]: sel_item_values = sel_item.split("|") sel_item_values = [entry.strip() for entry in sel_item_values] sel_item_dict = dict(zip(sel_data_fields, sel_item_values)) - sel_item_datetime_str = sel_item_dict["Date"] + sel_item_dict["Time"] - sel_item_datetime = datetime.datetime.strptime( - sel_item_datetime_str, "%b-%d-%Y%H:%M:%S" - ) - if sel_item_datetime > oldest_log_time: + if sel_item_dict["Date"] == "PostInit": sel_entries.append(sel_item_dict) + else: + sel_item_datetime_str = sel_item_dict["Date"] + sel_item_dict["Time"] + sel_item_datetime = datetime.datetime.strptime( + sel_item_datetime_str, "%b-%d-%Y%H:%M:%S" + ) + if sel_item_datetime > oldest_log_time: + sel_entries.append(sel_item_dict) return sel_entries diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 010aa06..47d9e22 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -344,6 +344,17 @@ def test_41_ipmi_sel_installed_and_okay(self): self.assertDictEqual(payloads_labels_value_map, expected_payloads_label_value_map) + def test_42_ipmi_sel_cmd_fail(self): + """Test ipmi sel collector when ipmi sel is not installed.""" + ipmi_sel_collector = IpmiSelCollector(Mock()) + ipmi_sel_collector.ipmi_sel = Mock() + + ipmi_sel_collector.ipmi_sel.get_sel_entries.return_value = None + + payloads = list(ipmi_sel_collector.collect()) + assert payloads[0].name == "ipmi_sel_command_success" + assert payloads[0].samples[0].value == 0.0 + def test_50_ipmimonitoring_not_installed(self): """Test ipmi sensor collector when ipmimonitoring is not installed.""" ipmi_sensors_collector = IpmiSensorsCollector(Mock()) diff --git a/tests/unit/test_ipmi_sel.py b/tests/unit/test_ipmi_sel.py index 9878794..c25e35d 100644 --- a/tests/unit/test_ipmi_sel.py +++ b/tests/unit/test_ipmi_sel.py @@ -8,6 +8,15 @@ SEL_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt" SAMPLE_SEL_ENTRIES = [ + { + "ID": "14", + "Date": "PostInit", + "Time": "PostInit", + "Name": "Disk Drive Bay 1 Cable SAS A", + "Type": "Cable/Interconnect", + "State": "Critical", + "Event": "Configuration Error - Incorrect cable connected", + }, { "ID": "494", "Date": "Jul-09-2023", @@ -40,11 +49,19 @@ def test_00_get_sel_entries_success(self, mock_call): ipmi_sel = IpmiSel() payloads = ipmi_sel.get_sel_entries(24 * 60 * 60) expected_sel_entries = SAMPLE_SEL_ENTRIES + print(payloads) self.assertEqual(payloads, expected_sel_entries) @patch.object(Command, "__call__") - def test_01_get_sel_entries_error(self, mock_call): - mock_call.return_value = Result("", True) + def test_01_get_sel_entries_zero_records(self, mock_call): + mock_call.return_value = Result("", None) ipmi_sel = IpmiSel() payloads = ipmi_sel.get_sel_entries(300) self.assertEqual(payloads, []) + + @patch.object(Command, "__call__") + def test_02_get_sel_entries_error(self, mock_call): + mock_call.return_value = Result("", Exception()) + ipmi_sel = IpmiSel() + payloads = ipmi_sel.get_sel_entries(300) + self.assertEqual(payloads, None) diff --git a/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt b/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt index f923ed5..1b3ac1c 100644 --- a/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt +++ b/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt @@ -1,5 +1,6 @@ ID | Date | Time | Name | Type | State | Event +14 | PostInit | PostInit | Disk Drive Bay 1 Cable SAS A | Cable/Interconnect | Critical | Configuration Error - Incorrect cable connected 493 | Oct-06-2022 | 19:47:13 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0 494 | Jul-09-2023 | 13:56:23 | System Chassis SysHealth_Stat | Chassis | Critical | transition to Non-recoverable from less severe 495 | Oct-06-2022 | 19:57:38 | System Board ACPI_Stat | System ACPI Power State | Nominal | S4/S5 soft-off -496 | Jul-09-2023 | 13:57:50 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0 \ No newline at end of file +496 | Jul-09-2023 | 13:57:50 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0