From fd75a3455e8f79476e077878937c799f188ba657 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Tue, 21 Nov 2023 16:27:40 +0800 Subject: [PATCH 1/4] fix: General err handling for collector The error handler will try to output the failed metrics and make sure the single collector's bug won't affect other collectors. --- prometheus_hardware_exporter/core.py | 63 +++++++++++++++++++--------- tests/unit/test_collector.py | 15 +++++++ 2 files changed, 58 insertions(+), 20 deletions(-) diff --git a/prometheus_hardware_exporter/core.py b/prometheus_hardware_exporter/core.py index fcf094f..da0be89 100644 --- a/prometheus_hardware_exporter/core.py +++ b/prometheus_hardware_exporter/core.py @@ -5,7 +5,7 @@ from logging import getLogger from typing import Any, Dict, Iterable, List, Type -from prometheus_client.metrics_core import Metric +from prometheus_client.metrics_core import GaugeMetricFamily, Metric from prometheus_client.registry import Collector from .config import Config @@ -89,6 +89,22 @@ def specifications(self) -> List[Specification]: A list of specification. """ + @property + def failed_metrics(self) -> Iterable[Metric]: + """Defines the metrics return when collect func fail. + + Yields: + metrics: the internal metrics + """ + name = self.__class__.__name__ + metric = GaugeMetricFamily( + name=f"{name.lower()}_collector_failed", + labels=[], + documentation=f"{name} Collector fail to fetch metrics", + ) + metric.add_metric(labels=[], value=1) + yield metric + def init_default_datastore(self, payloads: List[Payload]) -> None: """Initialize or fill data the store with default values. @@ -101,7 +117,7 @@ def init_default_datastore(self, payloads: List[Payload]) -> None: name=payload.name, labels=payload.labels, value=0.0 ) - def collect(self) -> Iterable[Metric]: + def collect(self) -> Iterable[Metric]: # pylint: disable=R1710 """Fetch data and update the internal metrics. This is a callback method that is used internally within @@ -111,21 +127,28 @@ def collect(self) -> Iterable[Metric]: Yields: metrics: the internal metrics """ - payloads = self.fetch() - self.init_default_datastore(payloads) - processed_payloads = self.process(payloads, self._datastore) - - # unpacked and create metrics - for payload in processed_payloads: - spec = self._specs[payload.name] - # We have to ignore the type checking here, since the subclass of - # any metric family from prometheus client adds new attributes and - # methods. - metric = spec.metric_class( # type: ignore[call-arg] - name=spec.name, labels=spec.labels, documentation=spec.documentation - ) - metric.add_metric( # type: ignore[attr-defined] - labels=payload.labels, value=payload.value - ) - yield metric - self._datastore[payload.uuid] = payload + # The general exception hanlder will try to make sure the single + # collector's bug will only change the metrics output to failed_metrics + # and also make sure other collectors are still working. + try: + payloads = self.fetch() + self.init_default_datastore(payloads) + processed_payloads = self.process(payloads, self._datastore) + + # unpacked and create metrics + for payload in processed_payloads: + spec = self._specs[payload.name] + # We have to ignore the type checking here, since the subclass of + # any metric family from prometheus client adds new attributes and + # methods. + metric = spec.metric_class( # type: ignore[call-arg] + name=spec.name, labels=spec.labels, documentation=spec.documentation + ) + metric.add_metric( # type: ignore[attr-defined] + labels=payload.labels, value=payload.value + ) + yield metric + self._datastore[payload.uuid] = payload + except Exception as e: # pylint: disable=W0718 + logger.error(e) + yield from self.failed_metrics diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 2422411..ce7ca12 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -1235,3 +1235,18 @@ def test_210_redfish_create_smart_storage_health_metric_payload(self): ) ], ) + + def test_1000_collector_fetch_failed(self): + for collector_cls, expected_name in [ + (MegaRAIDCollector, "megaraidcollector_collector_failed"), + (RedfishCollector, "redfishcollector_collector_failed"), + (IpmiSensorsCollector, "ipmisensorscollector_collector_failed"), + ]: + collector = collector_cls(Mock()) + collector.fetch = Mock() + collector.fetch.side_effect = Exception("Unknow error") + payloads = collector.collect() + payloads = list(payloads) + self.assertEqual(len(payloads), 1) + self.assertEqual(payloads[0].name, expected_name) + assert payloads[0].samples[0].value == 1.0 From 76d1f0b8f5141b11e129639eb357675ba2776b97 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Thu, 23 Nov 2023 18:09:03 +0800 Subject: [PATCH 2/4] fix: Small refactor to address the review comments --- prometheus_hardware_exporter/core.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/prometheus_hardware_exporter/core.py b/prometheus_hardware_exporter/core.py index da0be89..4030abb 100644 --- a/prometheus_hardware_exporter/core.py +++ b/prometheus_hardware_exporter/core.py @@ -99,10 +99,9 @@ def failed_metrics(self) -> Iterable[Metric]: name = self.__class__.__name__ metric = GaugeMetricFamily( name=f"{name.lower()}_collector_failed", - labels=[], documentation=f"{name} Collector fail to fetch metrics", + value=1, ) - metric.add_metric(labels=[], value=1) yield metric def init_default_datastore(self, payloads: List[Payload]) -> None: @@ -117,7 +116,7 @@ def init_default_datastore(self, payloads: List[Payload]) -> None: name=payload.name, labels=payload.labels, value=0.0 ) - def collect(self) -> Iterable[Metric]: # pylint: disable=R1710 + def collect(self) -> Iterable[Metric]: """Fetch data and update the internal metrics. This is a callback method that is used internally within @@ -149,6 +148,6 @@ def collect(self) -> Iterable[Metric]: # pylint: disable=R1710 ) yield metric self._datastore[payload.uuid] = payload - except Exception as e: # pylint: disable=W0718 - logger.error(e) + except Exception as err: # pylint: disable=W0718 + logger.error(err) yield from self.failed_metrics From db364316c34e776c6476d2b89615568f9d1290a1 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Thu, 23 Nov 2023 19:05:40 +0800 Subject: [PATCH 3/4] feat: Add label to metric --- prometheus_hardware_exporter/core.py | 4 ++++ tests/unit/test_collector.py | 23 ++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/prometheus_hardware_exporter/core.py b/prometheus_hardware_exporter/core.py index 4030abb..965b0a2 100644 --- a/prometheus_hardware_exporter/core.py +++ b/prometheus_hardware_exporter/core.py @@ -100,6 +100,10 @@ def failed_metrics(self) -> Iterable[Metric]: metric = GaugeMetricFamily( name=f"{name.lower()}_collector_failed", documentation=f"{name} Collector fail to fetch metrics", + labels=["collector"], + ) + metric.add_metric( + labels=[self.__class__.__name__], value=1, ) yield metric diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index ce7ca12..6b2ba76 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -1237,10 +1237,22 @@ def test_210_redfish_create_smart_storage_health_metric_payload(self): ) def test_1000_collector_fetch_failed(self): - for collector_cls, expected_name in [ - (MegaRAIDCollector, "megaraidcollector_collector_failed"), - (RedfishCollector, "redfishcollector_collector_failed"), - (IpmiSensorsCollector, "ipmisensorscollector_collector_failed"), + for collector_cls, expected_name, expected_labels in [ + ( + MegaRAIDCollector, + "megaraidcollector_collector_failed", + {"collector": "MegaRAIDCollector"}, + ), + ( + RedfishCollector, + "redfishcollector_collector_failed", + {"collector": "RedfishCollector"}, + ), + ( + IpmiSensorsCollector, + "ipmisensorscollector_collector_failed", + {"collector": "IpmiSensorsCollector"}, + ), ]: collector = collector_cls(Mock()) collector.fetch = Mock() @@ -1249,4 +1261,5 @@ def test_1000_collector_fetch_failed(self): payloads = list(payloads) self.assertEqual(len(payloads), 1) self.assertEqual(payloads[0].name, expected_name) - assert payloads[0].samples[0].value == 1.0 + self.assertEqual(payloads[0].samples[0].value, 1.0) + self.assertEqual(payloads[0].samples[0].labels, expected_labels) From 8468cac8f5b3595545ad9fc75198c53877d1c377 Mon Sep 17 00:00:00 2001 From: jneo8 Date: Fri, 24 Nov 2023 15:54:28 +0800 Subject: [PATCH 4/4] fix: Wording fix --- prometheus_hardware_exporter/core.py | 4 ++-- tests/unit/test_collector.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/prometheus_hardware_exporter/core.py b/prometheus_hardware_exporter/core.py index 965b0a2..ba9434c 100644 --- a/prometheus_hardware_exporter/core.py +++ b/prometheus_hardware_exporter/core.py @@ -91,7 +91,7 @@ def specifications(self) -> List[Specification]: @property def failed_metrics(self) -> Iterable[Metric]: - """Defines the metrics return when collect func fail. + """Defines the metrics to be returned when collector fails. Yields: metrics: the internal metrics @@ -99,7 +99,7 @@ def failed_metrics(self) -> Iterable[Metric]: name = self.__class__.__name__ metric = GaugeMetricFamily( name=f"{name.lower()}_collector_failed", - documentation=f"{name} Collector fail to fetch metrics", + documentation=f"{name} Collector failed to fetch metrics", labels=["collector"], ) metric.add_metric( diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 6b2ba76..e542159 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -1256,7 +1256,7 @@ def test_1000_collector_fetch_failed(self): ]: collector = collector_cls(Mock()) collector.fetch = Mock() - collector.fetch.side_effect = Exception("Unknow error") + collector.fetch.side_effect = Exception("Unknown error") payloads = collector.collect() payloads = list(payloads) self.assertEqual(len(payloads), 1)