From fa83b9c117707fd2c04a13d6149a6322cccc3c82 Mon Sep 17 00:00:00 2001 From: Sudeep Bhandari Date: Fri, 14 Jul 2023 07:58:48 +0545 Subject: [PATCH] Remove duplicates and older logs from ipmi sel (#28) * Remove duplicates and older logs from ipmi sel * Small refactoring in ipmi sel collector * Handle ipmi sel interval via config * Remove duplicates when the same sel item has different states * Add unit tests for checking labels in ipmi sel collector --- prometheus_hardware_exporter/__main__.py | 7 ++++ prometheus_hardware_exporter/collector.py | 16 +++++---- .../collectors/ipmi_sel.py | 15 +++++++-- prometheus_hardware_exporter/config.py | 2 ++ tests/unit/requirements.txt | 1 + tests/unit/test_collector.py | 14 ++++++-- tests/unit/test_ipmi_sel.py | 33 +++++-------------- .../ipmi/ipmi_sel_sample_output.txt | 4 +-- 8 files changed, 55 insertions(+), 37 deletions(-) diff --git a/prometheus_hardware_exporter/__main__.py b/prometheus_hardware_exporter/__main__.py index f8586d8..f57624a 100644 --- a/prometheus_hardware_exporter/__main__.py +++ b/prometheus_hardware_exporter/__main__.py @@ -61,6 +61,12 @@ def parse_command_line() -> argparse.Namespace: default="", type=str, ) + parser.add_argument( + "--ipmi-sel-interval", + help="The duration for how many seconds to collect SEL records", + default=300, + type=int, + ) parser.add_argument( "--collector.hpe_ssa", help="Enable HPE Smart Array Controller collector (default: disabled)", @@ -159,6 +165,7 @@ def main() -> None: redfish_host=namespace.redfish_host, redfish_username=namespace.redfish_username, redfish_password=namespace.redfish_password, + ipmi_sel_interval=namespace.ipmi_sel_interval, ) # Start the exporter diff --git a/prometheus_hardware_exporter/collector.py b/prometheus_hardware_exporter/collector.py index c33e78b..f9692a1 100644 --- a/prometheus_hardware_exporter/collector.py +++ b/prometheus_hardware_exporter/collector.py @@ -529,16 +529,17 @@ def specifications(self) -> List[Specification]: def fetch(self) -> List[Payload]: """Load ipmi sel entries.""" - sel_entries = self.ipmi_sel.get_sel_entries() + sel_entries = self.ipmi_sel.get_sel_entries(self.config.ipmi_sel_interval) if not sel_entries: - logger.error("Failed to get ipmi sel entries.") + logger.warning("No recent ipmi sel entries to collect.") return [Payload(name="ipmi_sel_command_success", value=0.0)] sel_states_dict = {"NOMINAL": 0, "WARNING": 1, "CRITICAL": 2} payloads = [Payload(name="ipmi_sel_command_success", value=1.0)] + sel_entries_dict: Dict[tuple, int] = {} for sel_entry in sel_entries: if sel_entry["State"].upper() in sel_states_dict: sel_state_value = sel_states_dict[sel_entry["State"].upper()] @@ -547,13 +548,16 @@ def fetch(self) -> List[Payload]: "Unknown ipmi SEL state: %s. Treating it as Nominal.", sel_entry["State"] ) sel_state_value = sel_states_dict["NOMINAL"] + + key = (sel_entry["Name"], sel_entry["Type"]) + if key not in sel_entries_dict or sel_entries_dict[key] < sel_state_value: + sel_entries_dict[key] = sel_state_value + + for sel_labels, sel_state_value in sel_entries_dict.items(): payloads.append( Payload( name="ipmi_sel_state", - labels=[ - sel_entry["Name"], - sel_entry["Type"], - ], + labels=list(sel_labels), value=sel_state_value, ) ) diff --git a/prometheus_hardware_exporter/collectors/ipmi_sel.py b/prometheus_hardware_exporter/collectors/ipmi_sel.py index 4778b53..dd4bc14 100644 --- a/prometheus_hardware_exporter/collectors/ipmi_sel.py +++ b/prometheus_hardware_exporter/collectors/ipmi_sel.py @@ -1,5 +1,6 @@ """IPMI SEL metrics collector.""" +import datetime from logging import getLogger from typing import Dict, List @@ -14,9 +15,11 @@ class IpmiSel(Command): prefix = "" command = "ipmi-sel" - def get_sel_entries(self) -> List[Dict[str, str]]: + def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]: """Get SEL entries along with state. + :param time_range int: Time in seconds, to determine from how far back the SEL + entries should be read. Returns: sel_entries: a list of dictionaries containing sel_sentries, or [] """ @@ -25,11 +28,19 @@ def get_sel_entries(self) -> List[Dict[str, str]]: logger.error(result.error) return [] + oldest_log_time = datetime.datetime.now() - datetime.timedelta(seconds=time_range) + raw_sel_data = result.data.strip().split("\n") sel_entries = [] sel_data_fields = ["ID", "Date", "Time", "Name", "Type", "State", "Event"] for sel_item in raw_sel_data[1:]: sel_item_values = sel_item.split("|") sel_item_values = [entry.strip() for entry in sel_item_values] - sel_entries.append(dict(zip(sel_data_fields, sel_item_values))) + sel_item_dict = dict(zip(sel_data_fields, sel_item_values)) + sel_item_datetime_str = sel_item_dict["Date"] + sel_item_dict["Time"] + sel_item_datetime = datetime.datetime.strptime( + sel_item_datetime_str, "%b-%d-%Y%H:%M:%S" + ) + if sel_item_datetime > oldest_log_time: + sel_entries.append(sel_item_dict) return sel_entries diff --git a/prometheus_hardware_exporter/config.py b/prometheus_hardware_exporter/config.py index 8eff87d..0449612 100644 --- a/prometheus_hardware_exporter/config.py +++ b/prometheus_hardware_exporter/config.py @@ -22,6 +22,8 @@ class Config(BaseModel): level: str = "DEBUG" enable_collectors: List[str] = [] + ipmi_sel_interval: int = 300 + redfish_host: str = "127.0.0.1" redfish_username: str = "" redfish_password: str = "" diff --git a/tests/unit/requirements.txt b/tests/unit/requirements.txt index e079f8a..df2a236 100644 --- a/tests/unit/requirements.txt +++ b/tests/unit/requirements.txt @@ -1 +1,2 @@ pytest +freezegun \ No newline at end of file diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 0087acb..010aa06 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -331,10 +331,18 @@ def test_41_ipmi_sel_installed_and_okay(self): payloads = ipmi_sel_collector.collect() - available_metrics = [spec.name for spec in ipmi_sel_collector.specifications] - self.assertEqual(len(list(payloads)), len(mock_sel_entries) + 1) + payloads_labels_value_map = {} for payload in payloads: - self.assertIn(payload.name, available_metrics) + if payload.name == "ipmi_sel_state": + payloads_labels_value_map[ + tuple(payload.samples[0].labels.values()) + ] = payload.samples[0].value + expected_payloads_label_value_map = { + ("System Board ACPI_Stat", "System ACPI Power State"): 1, + ("System Chassis SysHealth_Stat", "Chassis"): 2, + } + + self.assertDictEqual(payloads_labels_value_map, expected_payloads_label_value_map) def test_50_ipmimonitoring_not_installed(self): """Test ipmi sensor collector when ipmimonitoring is not installed.""" diff --git a/tests/unit/test_ipmi_sel.py b/tests/unit/test_ipmi_sel.py index cd1b868..9878794 100644 --- a/tests/unit/test_ipmi_sel.py +++ b/tests/unit/test_ipmi_sel.py @@ -1,42 +1,26 @@ import unittest from unittest.mock import patch +from freezegun import freeze_time + from prometheus_hardware_exporter.collectors.ipmi_sel import IpmiSel from prometheus_hardware_exporter.utils import Command, Result SEL_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt" SAMPLE_SEL_ENTRIES = [ - { - "ID": "493", - "Date": "Oct-06-2022", - "Time": "19:47:13", - "Name": "System Board ACPI_Stat", - "Type": "System ACPI Power State", - "State": "Nominal", - "Event": "S0/G0", - }, { "ID": "494", - "Date": "Oct-06-2022", - "Time": "19:57:23", + "Date": "Jul-09-2023", + "Time": "13:56:23", "Name": "System Chassis SysHealth_Stat", "Type": "Chassis", "State": "Critical", "Event": "transition to Non-recoverable from less severe", }, - { - "ID": "495", - "Date": "Oct-06-2022", - "Time": "19:57:38", - "Name": "System Board ACPI_Stat", - "Type": "System ACPI Power State", - "State": "Nominal", - "Event": "S4/S5 soft-off", - }, { "ID": "496", - "Date": "Oct-06-2022", - "Time": "19:57:51", + "Date": "Jul-09-2023", + "Time": "13:57:50", "Name": "System Board ACPI_Stat", "Type": "System ACPI Power State", "State": "Nominal", @@ -49,11 +33,12 @@ class TestIpmiSel(unittest.TestCase): """Test the IpmiSel class.""" @patch.object(Command, "__call__") + @freeze_time("2023-07-09 23:59:59") def test_00_get_sel_entries_success(self, mock_call): with open(SEL_SAMPLE_OUTPUT, "r") as content: mock_call.return_value = Result(content.read(), None) ipmi_sel = IpmiSel() - payloads = ipmi_sel.get_sel_entries() + payloads = ipmi_sel.get_sel_entries(24 * 60 * 60) expected_sel_entries = SAMPLE_SEL_ENTRIES self.assertEqual(payloads, expected_sel_entries) @@ -61,5 +46,5 @@ def test_00_get_sel_entries_success(self, mock_call): def test_01_get_sel_entries_error(self, mock_call): mock_call.return_value = Result("", True) ipmi_sel = IpmiSel() - payloads = ipmi_sel.get_sel_entries() + payloads = ipmi_sel.get_sel_entries(300) self.assertEqual(payloads, []) diff --git a/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt b/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt index 1267550..f923ed5 100644 --- a/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt +++ b/tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt @@ -1,5 +1,5 @@ ID | Date | Time | Name | Type | State | Event 493 | Oct-06-2022 | 19:47:13 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0 -494 | Oct-06-2022 | 19:57:23 | System Chassis SysHealth_Stat | Chassis | Critical | transition to Non-recoverable from less severe +494 | Jul-09-2023 | 13:56:23 | System Chassis SysHealth_Stat | Chassis | Critical | transition to Non-recoverable from less severe 495 | Oct-06-2022 | 19:57:38 | System Board ACPI_Stat | System ACPI Power State | Nominal | S4/S5 soft-off -496 | Oct-06-2022 | 19:57:51 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0 \ No newline at end of file +496 | Jul-09-2023 | 13:57:50 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0 \ No newline at end of file