From 92f6264e1b1bdb69f8dd8cbe26fd7f6944c8af48 Mon Sep 17 00:00:00 2001 From: Ashley James Date: Fri, 11 Aug 2023 18:41:50 +0530 Subject: [PATCH] Add TTL cache to redfish discover method (#34) The `redfish.discover_ssdp()` method takes a very long time to query the redfish services. This causes the scrape timeout to be exceeded for Prometheus. In order to solve this, a TTL cache has been added. A logging message has also been added for debugging. Additionally, the `redfish.redfish_client` call to create a redfish object has been provided with the `timeout` and `max_retry` arguments in order to not have the query take too long in case of incorrect redfish credentials. redfish_client config and discover cache parameters can be modified through the prometheus-hardware-exporter CLI args. The default values for these configs are defined in `config.py`. Some other helpful improvements: - `RedfishHelper` class now directly takes an instance of `Config` instead of sending the parameters individually. - Use context manager for redish login object to automatically logout at the end of the context. - Split sensor data methods to be more readable. - Add unit tests --- prometheus_hardware_exporter/__main__.py | 32 ++- prometheus_hardware_exporter/collector.py | 14 +- .../collectors/redfish.py | 113 +++++---- prometheus_hardware_exporter/config.py | 6 + requirements.txt | 1 + tests/unit/requirements.txt | 1 + tests/unit/test_collector.py | 5 +- tests/unit/test_redfish.py | 222 +++++++++++++++--- 8 files changed, 301 insertions(+), 93 deletions(-) diff --git a/prometheus_hardware_exporter/__main__.py b/prometheus_hardware_exporter/__main__.py index f57624a..5056e96 100644 --- a/prometheus_hardware_exporter/__main__.py +++ b/prometheus_hardware_exporter/__main__.py @@ -16,7 +16,13 @@ RedfishCollector, SsaCLICollector, ) -from .config import DEFAULT_CONFIG, Config +from .config import ( + DEFAULT_CONFIG, + DEFAULT_REDFISH_CLIENT_MAX_RETRY, + DEFAULT_REDFISH_CLIENT_TIMEOUT, + DEFAULT_REDFISH_DISCOVER_CACHE_TTL, + Config, +) from .exporter import Exporter logger = logging.getLogger(__name__) @@ -112,6 +118,27 @@ def parse_command_line() -> argparse.Namespace: help="Enable redfish collector (default: disabled)", action="store_true", ) + parser.add_argument( + "--redfish-client-timeout", + help="Redfish client timeout in seconds for initial connection (default: 3) ", + default=DEFAULT_REDFISH_CLIENT_TIMEOUT, + type=int, + ) + parser.add_argument( + "--redfish-client-max-retry", + help="Number of times the redfish client will retry after timeout (default: 1) ", + default=DEFAULT_REDFISH_CLIENT_MAX_RETRY, + type=int, + ) + parser.add_argument( + "--redfish-discover-cache-ttl", + help=( + "How long should the cached redfish discovery services " + "be stored in seconds (default: 86400) " + ), + default=DEFAULT_REDFISH_DISCOVER_CACHE_TTL, + type=int, + ) args = parser.parse_args() return args @@ -166,6 +193,9 @@ def main() -> None: redfish_username=namespace.redfish_username, redfish_password=namespace.redfish_password, ipmi_sel_interval=namespace.ipmi_sel_interval, + redfish_client_timeout=namespace.redfish_client_timeout, + redfish_client_max_retry=namespace.redfish_client_max_retry, + redfish_discover_cache_ttl=namespace.redfish_discover_cache_ttl, ) # Start the exporter diff --git a/prometheus_hardware_exporter/collector.py b/prometheus_hardware_exporter/collector.py index d0435f2..5a07726 100644 --- a/prometheus_hardware_exporter/collector.py +++ b/prometheus_hardware_exporter/collector.py @@ -850,7 +850,10 @@ def process(self, payloads: List[Payload], datastore: Dict[str, Payload]) -> Lis class RedfishCollector(BlockingCollector): """Collector for redfish status and data.""" - redfish_helper = RedfishHelper() + def __init__(self, config: Config) -> None: + """Initialize RedfishHelper instance.""" + super().__init__(config) + self.redfish_helper = RedfishHelper(config) @property def specifications(self) -> List[Specification]: @@ -875,18 +878,11 @@ def specifications(self) -> List[Specification]: def fetch(self) -> List[Payload]: """Load redfish data.""" - redfish_host = self.config.redfish_host - redfish_username = self.config.redfish_username - redfish_password = self.config.redfish_password payloads = [] service_status = self.redfish_helper.discover() payloads.append(Payload(name="redfish_service_available", value=float(service_status))) - sensor_data = self.redfish_helper.get_sensor_data( - host=redfish_host, - username=redfish_username, - password=redfish_password, - ) + sensor_data = self.redfish_helper.get_sensor_data() if not sensor_data: logger.error("Failed to get sensor data via redfish.") payloads.append(Payload(name="redfish_call_success", value=0.0)) diff --git a/prometheus_hardware_exporter/collectors/redfish.py b/prometheus_hardware_exporter/collectors/redfish.py index 196640f..439b984 100644 --- a/prometheus_hardware_exporter/collectors/redfish.py +++ b/prometheus_hardware_exporter/collectors/redfish.py @@ -1,10 +1,17 @@ """Redfish collector.""" from logging import getLogger -from typing import Any, Dict, List, Optional +from typing import Any, Callable, Dict, List import redfish import redfish_utilities -from redfish.rest.v1 import HttpClient, InvalidCredentialsError, SessionCreationError +from cachetools.func import ttl_cache +from redfish.rest.v1 import ( + InvalidCredentialsError, + RetriesExhaustedError, + SessionCreationError, +) + +from prometheus_hardware_exporter.config import Config logger = getLogger(__name__) @@ -12,54 +19,76 @@ class RedfishHelper: """Helper function for redfish.""" - def get_sensor_data(self, host: str, username: str, password: str) -> Dict[str, List]: + def __init__(self, config: Config) -> None: + """Initialize redfish login args and cache TTL value for discover method.""" + self.host = config.redfish_host + self.username = config.redfish_username + self.password = config.redfish_password + self.timeout = config.redfish_client_timeout + self.max_retry = config.redfish_client_max_retry + self.discover = self.get_discover(config.redfish_discover_cache_ttl) + + def get_sensor_data(self) -> Dict[str, List]: """Get sensor data. Returns: sensor_data: a dictionary where key, value maps to chassis name, sensor data. """ - data = self._get_sensor_data(host=host, username=username, password=password) - if not data: - return {} - output = {} - for chassis in data: - name = str(chassis["ChassisName"]) - sensors = [] - for sensor in chassis["Readings"]: - reading: str = str(sensor["Reading"]) + (sensor["Units"] or "") + data = self._retrieve_redfish_sensor_data() + return self._map_sensor_data_to_chassis(data) - sensors.append( - { - "Sensor": sensor["Name"], - "Reading": reading, - "Health": sensor["Health"] or "N/A", - } - ) - output[name] = sensors + def _map_sensor_data_to_chassis(self, sensor_data: List[Any]) -> Dict[str, List]: + output = {} + for chassis in sensor_data: + output[str(chassis["ChassisName"])] = [ + { + "Sensor": sensor["Name"], + "Reading": str(sensor["Reading"]) + (sensor["Units"] or ""), + "Health": sensor["Health"] or "N/A", + } + for sensor in chassis["Readings"] + ] return output - def _get_sensor_data(self, host: str, username: str, password: str) -> Optional[List[Any]]: - """Return sensor if sensor exists else None.""" - sensors: Optional[List[Any]] = None - redfish_obj: Optional[HttpClient] = None + def _retrieve_redfish_sensor_data(self) -> List[Any]: + """Return sensor if sensor exists else None. + + Returns: + sensors: List of dicts with details for each sensor + """ try: - redfish_obj = redfish.redfish_client( - base_url=host, username=username, password=password - ) - redfish_obj.login(auth="session") - sensors = redfish_utilities.get_sensors(redfish_obj) - return sensors - except (InvalidCredentialsError, SessionCreationError, ConnectionError) as err: + with redfish.redfish_client( + base_url=self.host, + username=self.username, + password=self.password, + timeout=self.timeout, + max_retry=self.max_retry, + ) as redfish_obj: + return redfish_utilities.get_sensors(redfish_obj) + except ( + InvalidCredentialsError, + SessionCreationError, + ConnectionError, + RetriesExhaustedError, + ) as err: logger.exception(err) - finally: - # Log out - if redfish_obj: - redfish_obj.logout() - return sensors + return [] + + def get_discover(self, ttl: int) -> Callable: + """Return the cached discover function. + + Passes the ttl value to the cache decorator at runtime. + """ + + @ttl_cache(ttl=ttl) + def _discover() -> bool: + """Return true if redfish services have been discovered.""" + logger.info("Discovering redfish services...") + services = redfish.discover_ssdp() + if len(services) == 0: + logger.info("No redfish services discovered") + return False + logger.info("Discovered redfish services: %s", services) + return True - def discover(self) -> bool: - """Return true if redfish is been discovered.""" - services = redfish.discover_ssdp() - if len(services) == 0: - return False - return True + return _discover diff --git a/prometheus_hardware_exporter/config.py b/prometheus_hardware_exporter/config.py index 0449612..50b72f0 100644 --- a/prometheus_hardware_exporter/config.py +++ b/prometheus_hardware_exporter/config.py @@ -11,6 +11,9 @@ DEFAULT_CONFIG = os.path.join(os.environ.get("SNAP_DATA", "./"), "config.yaml") +DEFAULT_REDFISH_CLIENT_TIMEOUT = 3 +DEFAULT_REDFISH_CLIENT_MAX_RETRY = 1 +DEFAULT_REDFISH_DISCOVER_CACHE_TTL = 86400 # pylint: disable=E0213 @@ -27,6 +30,9 @@ class Config(BaseModel): redfish_host: str = "127.0.0.1" redfish_username: str = "" redfish_password: str = "" + redfish_client_timeout: int = DEFAULT_REDFISH_CLIENT_TIMEOUT + redfish_client_max_retry: int = DEFAULT_REDFISH_CLIENT_MAX_RETRY + redfish_discover_cache_ttl: int = DEFAULT_REDFISH_DISCOVER_CACHE_TTL @validator("port") def validate_port_range(cls, port: int) -> int: diff --git a/requirements.txt b/requirements.txt index fbf5a6f..2203da2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +cachetools prometheus-client pydantic < 2.0 pyyaml diff --git a/tests/unit/requirements.txt b/tests/unit/requirements.txt index df2a236..a86cdba 100644 --- a/tests/unit/requirements.txt +++ b/tests/unit/requirements.txt @@ -1,2 +1,3 @@ +cachetools pytest freezegun \ No newline at end of file diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 47d9e22..86a87d5 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -16,7 +16,6 @@ RedfishCollector, SsaCLICollector, ) -from prometheus_hardware_exporter.collectors.redfish import RedfishHelper class TestCustomCollector(unittest.TestCase): @@ -550,7 +549,7 @@ def test_105_perccli_physical_device_command_success(self): def test_200_redfish_not_installed(self): """Test redfish collector when redfish-utilitites is not installed.""" redfish_collector = RedfishCollector(Mock()) - redfish_collector.redfish_helper = Mock(spec=RedfishHelper) + redfish_collector.redfish_helper = Mock() redfish_collector.redfish_helper.discover.return_value = False redfish_collector.redfish_helper.get_sensor_data.return_value = [] payloads = redfish_collector.collect() @@ -560,7 +559,7 @@ def test_200_redfish_not_installed(self): def test_201_redfish_installed_and_okay(self): """Test redfish collector when redfish-utilitites is installed.""" redfish_collector = RedfishCollector(Mock()) - redfish_collector.redfish_helper = Mock(spec=RedfishHelper) + redfish_collector.redfish_helper = Mock() redfish_collector.redfish_helper.discover.return_value = False redfish_collector.redfish_helper.get_sensor_data.return_value = { "1": [ diff --git a/tests/unit/test_redfish.py b/tests/unit/test_redfish.py index 39e155a..d10d63f 100644 --- a/tests/unit/test_redfish.py +++ b/tests/unit/test_redfish.py @@ -1,17 +1,23 @@ import unittest +from time import sleep from unittest.mock import Mock, patch from redfish.rest.v1 import InvalidCredentialsError, SessionCreationError from prometheus_hardware_exporter.collectors.redfish import RedfishHelper +from prometheus_hardware_exporter.config import Config class TestRedfishSensors(unittest.TestCase): """Test the RedfishSensors class.""" + def mock_helper(self): + mock_config = Mock() + return RedfishHelper(mock_config) + @patch.object( RedfishHelper, - "_get_sensor_data", + "_retrieve_redfish_sensor_data", return_value=[ { "ChassisName": 1, @@ -72,9 +78,9 @@ class TestRedfishSensors(unittest.TestCase): } ], ) - def test_00_get_sensor_data_success(self, mock_get_sensor_data): - helper = RedfishHelper() - data = helper.get_sensor_data("", "", "") + def test_00_get_sensor_data_success(self, mock_sensor_data): + helper = self.mock_helper() + data = helper.get_sensor_data() self.assertEqual( data, { @@ -97,7 +103,7 @@ def test_00_get_sensor_data_success(self, mock_get_sensor_data): @patch.object( RedfishHelper, - "_get_sensor_data", + "_retrieve_redfish_sensor_data", return_value=[ { "ChassisName": 1, @@ -163,9 +169,9 @@ def test_00_get_sensor_data_success(self, mock_get_sensor_data): }, ], ) - def test_01_get_multiple_chassis_sensor_data_success(self, mock_get_sensor_data): - helper = RedfishHelper() - data = helper.get_sensor_data("", "", "") + def test_01_get_multiple_chassis_sensor_data_success(self, mock_sensor_data): + helper = self.mock_helper() + data = helper.get_sensor_data() self.assertEqual( data, { @@ -188,55 +194,161 @@ def test_01_get_multiple_chassis_sensor_data_success(self, mock_get_sensor_data) }, ) - @patch.object(RedfishHelper, "_get_sensor_data", return_value=[]) - def test_02_get_sensor_data_fail(self, mock_get_sensor_data): - helper = RedfishHelper() - data = helper.get_sensor_data("", "", "") + @patch.object(RedfishHelper, "_retrieve_redfish_sensor_data", return_value=[]) + def test_02_get_sensor_data_fail(self, mock_sensor_data): + helper = self.mock_helper() + data = helper.get_sensor_data() self.assertEqual(data, {}) - @patch("prometheus_hardware_exporter.collectors.redfish.redfish_utilities") - @patch("prometheus_hardware_exporter.collectors.redfish.redfish") - def test_03__get_sensor_data_success(self, mock_redfish, mock_redfish_utilities): - mock_redfish_utilities.get_sensors.return_value = ["return_data"] - - mock_redfish_client = Mock() - mock_redfish.redfish_client.return_value = mock_redfish_client - helper = RedfishHelper() - data = helper._get_sensor_data("", "", "") + def test_03_map_sensor_data_to_chassis(self): + mock_data = [ + { + "ChassisName": 1, + "Readings": [ + { + "Name": "State", + "Reading": "Enabled", + "Units": None, + "State": "Enabled", + "Health": "OK", + "LowerFatal": None, + "LowerCritical": None, + "LowerCaution": None, + "UpperCaution": None, + "UpperCritical": None, + "UpperFatal": None, + }, + { + "Name": "HpeServerPowerSupply State", + "Reading": "Enabled", + "Units": None, + "State": "Enabled", + "Health": "OK", + "LowerFatal": None, + "LowerCritical": None, + "LowerCaution": None, + "UpperCaution": None, + "UpperCritical": None, + "UpperFatal": None, + }, + ], + }, + { + "ChassisName": 2, + "Readings": [ + { + "Name": "HpeServerPowerSupply LineInputVoltage", + "Reading": 208, + "Units": "V", + "State": None, + "Health": None, + "LowerFatal": None, + "LowerCritical": None, + "LowerCaution": None, + "UpperCaution": None, + "UpperCritical": None, + "UpperFatal": None, + }, + { + "Name": "HpeServerPowerSupply PowerCapacityWatts", + "Reading": 800, + "Units": "W", + "State": None, + "Health": None, + "LowerFatal": None, + "LowerCritical": None, + "LowerCaution": None, + "UpperCaution": None, + "UpperCritical": None, + "UpperFatal": None, + }, + ], + }, + ] + + helper = self.mock_helper() + output = helper._map_sensor_data_to_chassis(mock_data) + self.assertEqual( + output, + { + "1": [ + {"Sensor": "State", "Reading": "Enabled", "Health": "OK"}, + {"Sensor": "HpeServerPowerSupply State", "Reading": "Enabled", "Health": "OK"}, + ], + "2": [ + { + "Sensor": "HpeServerPowerSupply LineInputVoltage", + "Reading": "208V", + "Health": "N/A", + }, + { + "Sensor": "HpeServerPowerSupply PowerCapacityWatts", + "Reading": "800W", + "Health": "N/A", + }, + ], + }, + ) + + @patch("prometheus_hardware_exporter.collectors.redfish.redfish_utilities.get_sensors") + @patch("prometheus_hardware_exporter.collectors.redfish.redfish.redfish_client") + def test_04_retrieve_redfish_sensor_data_success(self, mock_redfish_client, mock_get_sensors): + mock_get_sensors.return_value = ["return_data"] + + mock_redfish_obj = Mock() + mock_redfish_client.return_value.__enter__.return_value = mock_redfish_obj + helper = self.mock_helper() + data = helper._retrieve_redfish_sensor_data() self.assertEqual(data, ["return_data"]) - mock_redfish_client.logout.assert_called() @patch("prometheus_hardware_exporter.collectors.redfish.logger") @patch("prometheus_hardware_exporter.collectors.redfish.redfish_utilities") - @patch("prometheus_hardware_exporter.collectors.redfish.redfish") - def test_04__get_sensor_data_fail(self, mock_redfish, mock_redfish_utilities, mock_logger): + @patch("prometheus_hardware_exporter.collectors.redfish.redfish.redfish_client") + def test_05_retrieve_redfish_sensor_data_fail( + self, mock_redfish_client, mock_redfish_utilities, mock_logger + ): for err in [InvalidCredentialsError(), SessionCreationError()]: - mock_redfish_client = Mock() - mock_redfish_client.login.side_effect = err - mock_redfish.redfish_client.return_value = mock_redfish_client + mock_redfish_client.return_value.__enter__.side_effect = err - helper = RedfishHelper() - helper._get_sensor_data("", "", "") - mock_redfish_client.logout.assert_called() - mock_logger.exception.assert_called_with(mock_redfish_client.login.side_effect) + helper = self.mock_helper() + data = helper._retrieve_redfish_sensor_data() + mock_logger.exception.assert_called_with( + mock_redfish_client.return_value.__enter__.side_effect + ) + self.assertEqual(data, []) - @patch("prometheus_hardware_exporter.collectors.redfish.redfish") - def test_05__get_sensor_data_connection_error(self, mock_redfish): + @patch("prometheus_hardware_exporter.collectors.redfish.redfish.redfish_client") + def test_06_retrieve_redfish_sensor_data_connection_error(self, mock_redfish_client): """Shouldn't raise error if connection fail.""" - mock_redfish.redfish_client.side_effect = ConnectionError() - helper = RedfishHelper() - helper._get_sensor_data("", "", "") + mock_redfish_client.return_value.__enter__.side_effect = ConnectionError() + helper = self.mock_helper() + data = helper._retrieve_redfish_sensor_data() + self.assertEqual(data, []) class TestRedfishServiceStatus(unittest.TestCase): """Test the RedfishServiceStatus class.""" + def mock_helper(self): + mock_ttl = 10 + mock_timeout = 3 + mock_max_retry = 1 + mock_config = Config( + redfish_host="", + redfish_username="", + redfish_password="", + redfish_client_timeout=mock_timeout, + redfish_client_max_retry=mock_max_retry, + redfish_discover_cache_ttl=mock_ttl, + ) + return RedfishHelper(mock_config) + @patch( "prometheus_hardware_exporter.collectors.redfish.redfish.discover_ssdp", return_value=[1, 2, 3], ) def test_00_get_service_status_good(self, mock_discover_ssdp): - helper = RedfishHelper() + helper = self.mock_helper() ok = helper.discover() self.assertEqual(ok, True) @@ -244,6 +356,40 @@ def test_00_get_service_status_good(self, mock_discover_ssdp): "prometheus_hardware_exporter.collectors.redfish.redfish.discover_ssdp", return_value=[] ) def test_01_get_service_status_bad(self, mock_call): - helper = RedfishHelper() + helper = self.mock_helper() ok = helper.discover() self.assertEqual(ok, False) + + @patch( + "prometheus_hardware_exporter.collectors.redfish.redfish.discover_ssdp", + return_value=[1, 2, 3], + ) + def test_02_discover_cache(self, mock_discover): + mock_timeout = 3 + mock_max_retry = 1 + ttl = 1 + mock_config = Config( + redfish_host="", + redfish_username="", + redfish_password="", + redfish_client_timeout=mock_timeout, + redfish_client_max_retry=mock_max_retry, + redfish_discover_cache_ttl=ttl, + ) + helper = RedfishHelper(mock_config) + + output = helper.discover() + self.assertEqual(output, True) + mock_discover.assert_called() + mock_discover.reset_mock() + + # output from cache + output = helper.discover() + self.assertEqual(output, True) + mock_discover.assert_not_called() + + # wait till cache expires + sleep(ttl + 1) + output = helper.discover() + self.assertEqual(output, True) + mock_discover.assert_called()