diff --git a/prometheus_hardware_exporter/core.py b/prometheus_hardware_exporter/core.py index fcf094f..da0be89 100644 --- a/prometheus_hardware_exporter/core.py +++ b/prometheus_hardware_exporter/core.py @@ -5,7 +5,7 @@ from logging import getLogger from typing import Any, Dict, Iterable, List, Type -from prometheus_client.metrics_core import Metric +from prometheus_client.metrics_core import GaugeMetricFamily, Metric from prometheus_client.registry import Collector from .config import Config @@ -89,6 +89,22 @@ def specifications(self) -> List[Specification]: A list of specification. """ + @property + def failed_metrics(self) -> Iterable[Metric]: + """Define the metrics returned when the collect function fails. + + Yields: + metrics: the internal metrics + """ + name = self.__class__.__name__ + metric = GaugeMetricFamily( + name=f"{name.lower()}_collector_failed", + labels=[], + documentation=f"{name} Collector fail to fetch metrics", + ) + metric.add_metric(labels=[], value=1) + yield metric + def init_default_datastore(self, payloads: List[Payload]) -> None: """Initialize or fill data the store with default values. @@ -101,7 +117,7 @@ def init_default_datastore(self, payloads: List[Payload]) -> None: name=payload.name, labels=payload.labels, value=0.0 ) - def collect(self) -> Iterable[Metric]: + def collect(self) -> Iterable[Metric]: # pylint: disable=R1710 """Fetch data and update the internal metrics. This is a callback method that is used internally within @@ -111,21 +127,28 @@ def collect(self) -> Iterable[Metric]: Yields: metrics: the internal metrics """ - payloads = self.fetch() - self.init_default_datastore(payloads) - processed_payloads = self.process(payloads, self._datastore) - - # unpacked and create metrics - for payload in processed_payloads: - spec = self._specs[payload.name] - # We have to ignore the type checking here, since the subclass of - # any metric family from prometheus client adds new attributes and - # methods. 
- metric = spec.metric_class( # type: ignore[call-arg] - name=spec.name, labels=spec.labels, documentation=spec.documentation - ) - metric.add_metric( # type: ignore[attr-defined] - labels=payload.labels, value=payload.value - ) - yield metric - self._datastore[payload.uuid] = payload + # The general exception handler will try to make sure the single + # collector's bug will only change the metrics output to failed_metrics + # and also make sure other collectors are still working. + try: + payloads = self.fetch() + self.init_default_datastore(payloads) + processed_payloads = self.process(payloads, self._datastore) + + # unpacked and create metrics + for payload in processed_payloads: + spec = self._specs[payload.name] + # We have to ignore the type checking here, since the subclass of + # any metric family from prometheus client adds new attributes and + # methods. + metric = spec.metric_class( # type: ignore[call-arg] + name=spec.name, labels=spec.labels, documentation=spec.documentation + ) + metric.add_metric( # type: ignore[attr-defined] + labels=payload.labels, value=payload.value + ) + yield metric + self._datastore[payload.uuid] = payload + except Exception as e: # pylint: disable=W0718 + logger.error(e) + yield from self.failed_metrics diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 2422411..78ddd0d 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -36,6 +36,7 @@ def test_00_mega_raid_collector_not_installed(self): self.assertEqual(len(list(payloads)), 1) + def test_01_mega_raid_collector_installed_and_okay(self): """Test mega raid collector can fetch correct number of metrics.""" mega_raid_collector = MegaRAIDCollector(Mock()) @@ -1235,3 +1236,18 @@ def test_210_redfish_create_smart_storage_health_metric_payload(self): ) ], ) + + def test_1000_collector_fetch_failed(self): + for collector_cls, expected_name in [ + (MegaRAIDCollector, "megaraidcollector_collector_failed"), + (RedfishCollector, 
"redfishcollector_collector_failed"), + (IpmiSensorsCollector, "ipmisensorscollector_collector_failed"), + ]: + collector = collector_cls(Mock()) + collector.fetch = Mock() + collector.fetch.side_effect = Exception("Unknow error") + payloads = collector.collect() + payloads = list(payloads) + self.assertEqual(len(payloads), 1) + self.assertEqual(payloads[0].name, expected_name) + assert payloads[0].samples[0].value == 1.0