Skip to content

Commit

Permalink
Add TTL cache to redfish discover method (#34)
Browse files Browse the repository at this point in the history
The `redfish.discover_ssdp()` method takes a very long time to query the
redfish services. This causes the scrape timeout to be exceeded for
Prometheus. In order to solve this, a TTL cache has been added. A
logging message has also been added for debugging.

Additionally, the `redfish.redfish_client` call to create a redfish
object has been provided with the `timeout` and `max_retry` arguments in
order to not have the query take too long in case of incorrect redfish
credentials.

redfish_client config and discover cache parameters can be
modified through the prometheus-hardware-exporter CLI args.
The default values for these configs are defined in `config.py`.

Some other helpful improvements:
- `RedfishHelper` class now directly takes an instance of `Config` instead of 
sending the parameters individually.
- Use context manager for redish login object to automatically logout at the end of the context.
- Split sensor data methods to be more readable.
- Add unit tests
  • Loading branch information
dashmage authored Aug 11, 2023
1 parent 743402c commit 92f6264
Show file tree
Hide file tree
Showing 8 changed files with 301 additions and 93 deletions.
32 changes: 31 additions & 1 deletion prometheus_hardware_exporter/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@
RedfishCollector,
SsaCLICollector,
)
from .config import DEFAULT_CONFIG, Config
from .config import (
DEFAULT_CONFIG,
DEFAULT_REDFISH_CLIENT_MAX_RETRY,
DEFAULT_REDFISH_CLIENT_TIMEOUT,
DEFAULT_REDFISH_DISCOVER_CACHE_TTL,
Config,
)
from .exporter import Exporter

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -112,6 +118,27 @@ def parse_command_line() -> argparse.Namespace:
help="Enable redfish collector (default: disabled)",
action="store_true",
)
parser.add_argument(
"--redfish-client-timeout",
help="Redfish client timeout in seconds for initial connection (default: 3) ",
default=DEFAULT_REDFISH_CLIENT_TIMEOUT,
type=int,
)
parser.add_argument(
"--redfish-client-max-retry",
help="Number of times the redfish client will retry after timeout (default: 1) ",
default=DEFAULT_REDFISH_CLIENT_MAX_RETRY,
type=int,
)
parser.add_argument(
"--redfish-discover-cache-ttl",
help=(
"How long should the cached redfish discovery services "
"be stored in seconds (default: 86400) "
),
default=DEFAULT_REDFISH_DISCOVER_CACHE_TTL,
type=int,
)
args = parser.parse_args()

return args
Expand Down Expand Up @@ -166,6 +193,9 @@ def main() -> None:
redfish_username=namespace.redfish_username,
redfish_password=namespace.redfish_password,
ipmi_sel_interval=namespace.ipmi_sel_interval,
redfish_client_timeout=namespace.redfish_client_timeout,
redfish_client_max_retry=namespace.redfish_client_max_retry,
redfish_discover_cache_ttl=namespace.redfish_discover_cache_ttl,
)

# Start the exporter
Expand Down
14 changes: 5 additions & 9 deletions prometheus_hardware_exporter/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,10 @@ def process(self, payloads: List[Payload], datastore: Dict[str, Payload]) -> Lis
class RedfishCollector(BlockingCollector):
"""Collector for redfish status and data."""

redfish_helper = RedfishHelper()
def __init__(self, config: Config) -> None:
"""Initialize RedfishHelper instance."""
super().__init__(config)
self.redfish_helper = RedfishHelper(config)

@property
def specifications(self) -> List[Specification]:
Expand All @@ -875,18 +878,11 @@ def specifications(self) -> List[Specification]:

def fetch(self) -> List[Payload]:
"""Load redfish data."""
redfish_host = self.config.redfish_host
redfish_username = self.config.redfish_username
redfish_password = self.config.redfish_password
payloads = []
service_status = self.redfish_helper.discover()
payloads.append(Payload(name="redfish_service_available", value=float(service_status)))

sensor_data = self.redfish_helper.get_sensor_data(
host=redfish_host,
username=redfish_username,
password=redfish_password,
)
sensor_data = self.redfish_helper.get_sensor_data()
if not sensor_data:
logger.error("Failed to get sensor data via redfish.")
payloads.append(Payload(name="redfish_call_success", value=0.0))
Expand Down
113 changes: 71 additions & 42 deletions prometheus_hardware_exporter/collectors/redfish.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,94 @@
"""Redfish collector."""
from logging import getLogger
from typing import Any, Dict, List, Optional
from typing import Any, Callable, Dict, List

import redfish
import redfish_utilities
from redfish.rest.v1 import HttpClient, InvalidCredentialsError, SessionCreationError
from cachetools.func import ttl_cache
from redfish.rest.v1 import (
InvalidCredentialsError,
RetriesExhaustedError,
SessionCreationError,
)

from prometheus_hardware_exporter.config import Config

logger = getLogger(__name__)


class RedfishHelper:
"""Helper function for redfish."""

def get_sensor_data(self, host: str, username: str, password: str) -> Dict[str, List]:
def __init__(self, config: Config) -> None:
"""Initialize redfish login args and cache TTL value for discover method."""
self.host = config.redfish_host
self.username = config.redfish_username
self.password = config.redfish_password
self.timeout = config.redfish_client_timeout
self.max_retry = config.redfish_client_max_retry
self.discover = self.get_discover(config.redfish_discover_cache_ttl)

def get_sensor_data(self) -> Dict[str, List]:
"""Get sensor data.
Returns:
sensor_data: a dictionary where key, value maps to chassis name, sensor data.
"""
data = self._get_sensor_data(host=host, username=username, password=password)
if not data:
return {}
output = {}
for chassis in data:
name = str(chassis["ChassisName"])
sensors = []
for sensor in chassis["Readings"]:
reading: str = str(sensor["Reading"]) + (sensor["Units"] or "")
data = self._retrieve_redfish_sensor_data()
return self._map_sensor_data_to_chassis(data)

sensors.append(
{
"Sensor": sensor["Name"],
"Reading": reading,
"Health": sensor["Health"] or "N/A",
}
)
output[name] = sensors
def _map_sensor_data_to_chassis(self, sensor_data: List[Any]) -> Dict[str, List]:
output = {}
for chassis in sensor_data:
output[str(chassis["ChassisName"])] = [
{
"Sensor": sensor["Name"],
"Reading": str(sensor["Reading"]) + (sensor["Units"] or ""),
"Health": sensor["Health"] or "N/A",
}
for sensor in chassis["Readings"]
]
return output

def _get_sensor_data(self, host: str, username: str, password: str) -> Optional[List[Any]]:
"""Return sensor if sensor exists else None."""
sensors: Optional[List[Any]] = None
redfish_obj: Optional[HttpClient] = None
def _retrieve_redfish_sensor_data(self) -> List[Any]:
"""Return sensor if sensor exists else None.
Returns:
sensors: List of dicts with details for each sensor
"""
try:
redfish_obj = redfish.redfish_client(
base_url=host, username=username, password=password
)
redfish_obj.login(auth="session")
sensors = redfish_utilities.get_sensors(redfish_obj)
return sensors
except (InvalidCredentialsError, SessionCreationError, ConnectionError) as err:
with redfish.redfish_client(
base_url=self.host,
username=self.username,
password=self.password,
timeout=self.timeout,
max_retry=self.max_retry,
) as redfish_obj:
return redfish_utilities.get_sensors(redfish_obj)
except (
InvalidCredentialsError,
SessionCreationError,
ConnectionError,
RetriesExhaustedError,
) as err:
logger.exception(err)
finally:
# Log out
if redfish_obj:
redfish_obj.logout()
return sensors
return []

def get_discover(self, ttl: int) -> Callable:
"""Return the cached discover function.
Passes the ttl value to the cache decorator at runtime.
"""

@ttl_cache(ttl=ttl)
def _discover() -> bool:
"""Return true if redfish services have been discovered."""
logger.info("Discovering redfish services...")
services = redfish.discover_ssdp()
if len(services) == 0:
logger.info("No redfish services discovered")
return False
logger.info("Discovered redfish services: %s", services)
return True

def discover(self) -> bool:
"""Return true if redfish is been discovered."""
services = redfish.discover_ssdp()
if len(services) == 0:
return False
return True
return _discover
6 changes: 6 additions & 0 deletions prometheus_hardware_exporter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@

DEFAULT_CONFIG = os.path.join(os.environ.get("SNAP_DATA", "./"), "config.yaml")

DEFAULT_REDFISH_CLIENT_TIMEOUT = 3
DEFAULT_REDFISH_CLIENT_MAX_RETRY = 1
DEFAULT_REDFISH_DISCOVER_CACHE_TTL = 86400

# pylint: disable=E0213

Expand All @@ -27,6 +30,9 @@ class Config(BaseModel):
redfish_host: str = "127.0.0.1"
redfish_username: str = ""
redfish_password: str = ""
redfish_client_timeout: int = DEFAULT_REDFISH_CLIENT_TIMEOUT
redfish_client_max_retry: int = DEFAULT_REDFISH_CLIENT_MAX_RETRY
redfish_discover_cache_ttl: int = DEFAULT_REDFISH_DISCOVER_CACHE_TTL

@validator("port")
def validate_port_range(cls, port: int) -> int:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cachetools
prometheus-client
pydantic < 2.0
pyyaml
Expand Down
1 change: 1 addition & 0 deletions tests/unit/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
cachetools
pytest
freezegun
5 changes: 2 additions & 3 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
RedfishCollector,
SsaCLICollector,
)
from prometheus_hardware_exporter.collectors.redfish import RedfishHelper


class TestCustomCollector(unittest.TestCase):
Expand Down Expand Up @@ -550,7 +549,7 @@ def test_105_perccli_physical_device_command_success(self):
def test_200_redfish_not_installed(self):
"""Test redfish collector when redfish-utilitites is not installed."""
redfish_collector = RedfishCollector(Mock())
redfish_collector.redfish_helper = Mock(spec=RedfishHelper)
redfish_collector.redfish_helper = Mock()
redfish_collector.redfish_helper.discover.return_value = False
redfish_collector.redfish_helper.get_sensor_data.return_value = []
payloads = redfish_collector.collect()
Expand All @@ -560,7 +559,7 @@ def test_200_redfish_not_installed(self):
def test_201_redfish_installed_and_okay(self):
"""Test redfish collector when redfish-utilitites is installed."""
redfish_collector = RedfishCollector(Mock())
redfish_collector.redfish_helper = Mock(spec=RedfishHelper)
redfish_collector.redfish_helper = Mock()
redfish_collector.redfish_helper.discover.return_value = False
redfish_collector.redfish_helper.get_sensor_data.return_value = {
"1": [
Expand Down
Loading

0 comments on commit 92f6264

Please sign in to comment.