Skip to content

Commit

Permalink
Remove duplicates and older logs from ipmi sel (#28)
Browse files Browse the repository at this point in the history
* Remove duplicates and older logs from ipmi sel

* Small refactoring in ipmi sel collector

* Handle ipmi sel interval via config

* Remove duplicates when the same sel item has different states

* Add unit tests for checking labels in ipmi sel collector
  • Loading branch information
sudeephb authored Jul 14, 2023
1 parent ee468b4 commit fa83b9c
Show file tree
Hide file tree
Showing 8 changed files with 55 additions and 37 deletions.
7 changes: 7 additions & 0 deletions prometheus_hardware_exporter/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ def parse_command_line() -> argparse.Namespace:
default="",
type=str,
)
parser.add_argument(
"--ipmi-sel-interval",
help="The duration for how many seconds to collect SEL records",
default=300,
type=int,
)
parser.add_argument(
"--collector.hpe_ssa",
help="Enable HPE Smart Array Controller collector (default: disabled)",
Expand Down Expand Up @@ -159,6 +165,7 @@ def main() -> None:
redfish_host=namespace.redfish_host,
redfish_username=namespace.redfish_username,
redfish_password=namespace.redfish_password,
ipmi_sel_interval=namespace.ipmi_sel_interval,
)

# Start the exporter
Expand Down
16 changes: 10 additions & 6 deletions prometheus_hardware_exporter/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,16 +529,17 @@ def specifications(self) -> List[Specification]:

def fetch(self) -> List[Payload]:
"""Load ipmi sel entries."""
sel_entries = self.ipmi_sel.get_sel_entries()
sel_entries = self.ipmi_sel.get_sel_entries(self.config.ipmi_sel_interval)

if not sel_entries:
logger.error("Failed to get ipmi sel entries.")
logger.warning("No recent ipmi sel entries to collect.")
return [Payload(name="ipmi_sel_command_success", value=0.0)]

sel_states_dict = {"NOMINAL": 0, "WARNING": 1, "CRITICAL": 2}

payloads = [Payload(name="ipmi_sel_command_success", value=1.0)]

sel_entries_dict: Dict[tuple, int] = {}
for sel_entry in sel_entries:
if sel_entry["State"].upper() in sel_states_dict:
sel_state_value = sel_states_dict[sel_entry["State"].upper()]
Expand All @@ -547,13 +548,16 @@ def fetch(self) -> List[Payload]:
"Unknown ipmi SEL state: %s. Treating it as Nominal.", sel_entry["State"]
)
sel_state_value = sel_states_dict["NOMINAL"]

key = (sel_entry["Name"], sel_entry["Type"])
if key not in sel_entries_dict or sel_entries_dict[key] < sel_state_value:
sel_entries_dict[key] = sel_state_value

for sel_labels, sel_state_value in sel_entries_dict.items():
payloads.append(
Payload(
name="ipmi_sel_state",
labels=[
sel_entry["Name"],
sel_entry["Type"],
],
labels=list(sel_labels),
value=sel_state_value,
)
)
Expand Down
15 changes: 13 additions & 2 deletions prometheus_hardware_exporter/collectors/ipmi_sel.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""IPMI SEL metrics collector."""

import datetime
from logging import getLogger
from typing import Dict, List

Expand All @@ -14,9 +15,11 @@ class IpmiSel(Command):
prefix = ""
command = "ipmi-sel"

def get_sel_entries(self) -> List[Dict[str, str]]:
def get_sel_entries(self, time_range: int) -> List[Dict[str, str]]:
"""Get SEL entries along with state.
:param time_range int: Time in seconds, to determine from how far back the SEL
entries should be read.
Returns:
sel_entries: a list of dictionaries containing sel entries, or []
"""
Expand All @@ -25,11 +28,19 @@ def get_sel_entries(self) -> List[Dict[str, str]]:
logger.error(result.error)
return []

oldest_log_time = datetime.datetime.now() - datetime.timedelta(seconds=time_range)

raw_sel_data = result.data.strip().split("\n")
sel_entries = []
sel_data_fields = ["ID", "Date", "Time", "Name", "Type", "State", "Event"]
for sel_item in raw_sel_data[1:]:
sel_item_values = sel_item.split("|")
sel_item_values = [entry.strip() for entry in sel_item_values]
sel_entries.append(dict(zip(sel_data_fields, sel_item_values)))
sel_item_dict = dict(zip(sel_data_fields, sel_item_values))
sel_item_datetime_str = sel_item_dict["Date"] + sel_item_dict["Time"]
sel_item_datetime = datetime.datetime.strptime(
sel_item_datetime_str, "%b-%d-%Y%H:%M:%S"
)
if sel_item_datetime > oldest_log_time:
sel_entries.append(sel_item_dict)
return sel_entries
2 changes: 2 additions & 0 deletions prometheus_hardware_exporter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class Config(BaseModel):
level: str = "DEBUG"
enable_collectors: List[str] = []

ipmi_sel_interval: int = 300

redfish_host: str = "127.0.0.1"
redfish_username: str = ""
redfish_password: str = ""
Expand Down
1 change: 1 addition & 0 deletions tests/unit/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pytest
freezegun
14 changes: 11 additions & 3 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,10 +331,18 @@ def test_41_ipmi_sel_installed_and_okay(self):

payloads = ipmi_sel_collector.collect()

available_metrics = [spec.name for spec in ipmi_sel_collector.specifications]
self.assertEqual(len(list(payloads)), len(mock_sel_entries) + 1)
payloads_labels_value_map = {}
for payload in payloads:
self.assertIn(payload.name, available_metrics)
if payload.name == "ipmi_sel_state":
payloads_labels_value_map[
tuple(payload.samples[0].labels.values())
] = payload.samples[0].value
expected_payloads_label_value_map = {
("System Board ACPI_Stat", "System ACPI Power State"): 1,
("System Chassis SysHealth_Stat", "Chassis"): 2,
}

self.assertDictEqual(payloads_labels_value_map, expected_payloads_label_value_map)

def test_50_ipmimonitoring_not_installed(self):
"""Test ipmi sensor collector when ipmimonitoring is not installed."""
Expand Down
33 changes: 9 additions & 24 deletions tests/unit/test_ipmi_sel.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,26 @@
import unittest
from unittest.mock import patch

from freezegun import freeze_time

from prometheus_hardware_exporter.collectors.ipmi_sel import IpmiSel
from prometheus_hardware_exporter.utils import Command, Result

SEL_SAMPLE_OUTPUT = "tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt"
SAMPLE_SEL_ENTRIES = [
{
"ID": "493",
"Date": "Oct-06-2022",
"Time": "19:47:13",
"Name": "System Board ACPI_Stat",
"Type": "System ACPI Power State",
"State": "Nominal",
"Event": "S0/G0",
},
{
"ID": "494",
"Date": "Oct-06-2022",
"Time": "19:57:23",
"Date": "Jul-09-2023",
"Time": "13:56:23",
"Name": "System Chassis SysHealth_Stat",
"Type": "Chassis",
"State": "Critical",
"Event": "transition to Non-recoverable from less severe",
},
{
"ID": "495",
"Date": "Oct-06-2022",
"Time": "19:57:38",
"Name": "System Board ACPI_Stat",
"Type": "System ACPI Power State",
"State": "Nominal",
"Event": "S4/S5 soft-off",
},
{
"ID": "496",
"Date": "Oct-06-2022",
"Time": "19:57:51",
"Date": "Jul-09-2023",
"Time": "13:57:50",
"Name": "System Board ACPI_Stat",
"Type": "System ACPI Power State",
"State": "Nominal",
Expand All @@ -49,17 +33,18 @@ class TestIpmiSel(unittest.TestCase):
"""Test the IpmiSel class."""

@patch.object(Command, "__call__")
@freeze_time("2023-07-09 23:59:59")
def test_00_get_sel_entries_success(self, mock_call):
with open(SEL_SAMPLE_OUTPUT, "r") as content:
mock_call.return_value = Result(content.read(), None)
ipmi_sel = IpmiSel()
payloads = ipmi_sel.get_sel_entries()
payloads = ipmi_sel.get_sel_entries(24 * 60 * 60)
expected_sel_entries = SAMPLE_SEL_ENTRIES
self.assertEqual(payloads, expected_sel_entries)

@patch.object(Command, "__call__")
def test_01_get_sel_entries_error(self, mock_call):
mock_call.return_value = Result("", True)
ipmi_sel = IpmiSel()
payloads = ipmi_sel.get_sel_entries()
payloads = ipmi_sel.get_sel_entries(300)
self.assertEqual(payloads, [])
4 changes: 2 additions & 2 deletions tests/unit/test_resources/ipmi/ipmi_sel_sample_output.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ID | Date | Time | Name | Type | State | Event
493 | Oct-06-2022 | 19:47:13 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0
494 | Oct-06-2022 | 19:57:23 | System Chassis SysHealth_Stat | Chassis | Critical | transition to Non-recoverable from less severe
494 | Jul-09-2023 | 13:56:23 | System Chassis SysHealth_Stat | Chassis | Critical | transition to Non-recoverable from less severe
495 | Oct-06-2022 | 19:57:38 | System Board ACPI_Stat | System ACPI Power State | Nominal | S4/S5 soft-off
496 | Oct-06-2022 | 19:57:51 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0
496 | Jul-09-2023 | 13:57:50 | System Board ACPI_Stat | System ACPI Power State | Nominal | S0/G0

0 comments on commit fa83b9c

Please sign in to comment.