Skip to content

Commit

Permalink
fix: General err handling for collector
Browse files Browse the repository at this point in the history
The error handler tries to output the failed metrics and makes sure
that a bug in a single collector won't affect the other collectors.
  • Loading branch information
jneo8 committed Nov 21, 2023
1 parent 1d96201 commit 11b60ec
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 20 deletions.
63 changes: 43 additions & 20 deletions prometheus_hardware_exporter/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from logging import getLogger
from typing import Any, Dict, Iterable, List, Type

from prometheus_client.metrics_core import Metric
from prometheus_client.metrics_core import GaugeMetricFamily, Metric
from prometheus_client.registry import Collector

from .config import Config
Expand Down Expand Up @@ -89,6 +89,22 @@ def specifications(self) -> List[Specification]:
A list of specification.
"""

@property
def failed_metrics(self) -> Iterable[Metric]:
    """Build the fallback metrics reported when the collect function fails.

    The metric is a gauge named ``<lowercased class name>_collector_failed``
    that carries no labels and a single sample with the value 1.

    Yields:
        metrics: the gauge flagging this collector as failed
    """
    collector_name = self.__class__.__name__
    failure_gauge = GaugeMetricFamily(
        name=f"{collector_name.lower()}_collector_failed",
        documentation=f"{collector_name} Collector fail to fetch metrics",
        labels=[],
    )
    # A single unlabeled sample with value 1 marks the collector as failed.
    failure_gauge.add_metric(value=1, labels=[])
    yield failure_gauge

def init_default_datastore(self, payloads: List[Payload]) -> None:
"""Initialize or fill data the store with default values.
Expand All @@ -101,7 +117,7 @@ def init_default_datastore(self, payloads: List[Payload]) -> None:
name=payload.name, labels=payload.labels, value=0.0
)

def collect(self) -> Iterable[Metric]:
def collect(self) -> Iterable[Metric]: # pylint: disable=R1710
"""Fetch data and update the internal metrics.
This is a callback method that is used internally within
Expand All @@ -111,21 +127,28 @@ def collect(self) -> Iterable[Metric]:
Yields:
metrics: the internal metrics
"""
payloads = self.fetch()
self.init_default_datastore(payloads)
processed_payloads = self.process(payloads, self._datastore)

# unpacked and create metrics
for payload in processed_payloads:
spec = self._specs[payload.name]
# We have to ignore the type checking here, since the subclass of
# any metric family from prometheus client adds new attributes and
# methods.
metric = spec.metric_class( # type: ignore[call-arg]
name=spec.name, labels=spec.labels, documentation=spec.documentation
)
metric.add_metric( # type: ignore[attr-defined]
labels=payload.labels, value=payload.value
)
yield metric
self._datastore[payload.uuid] = payload
# The general exception handler makes sure that a bug in a single
# collector only changes that collector's output to failed_metrics,
# and that the other collectors keep working.
try:
payloads = self.fetch()
self.init_default_datastore(payloads)
processed_payloads = self.process(payloads, self._datastore)

# unpacked and create metrics
for payload in processed_payloads:
spec = self._specs[payload.name]
# We have to ignore the type checking here, since the subclass of
# any metric family from prometheus client adds new attributes and
# methods.
metric = spec.metric_class( # type: ignore[call-arg]
name=spec.name, labels=spec.labels, documentation=spec.documentation
)
metric.add_metric( # type: ignore[attr-defined]
labels=payload.labels, value=payload.value
)
yield metric
self._datastore[payload.uuid] = payload
except Exception as e: # pylint: disable=W0718
logger.error(e)
yield from self.failed_metrics
16 changes: 16 additions & 0 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def test_00_mega_raid_collector_not_installed(self):

self.assertEqual(len(list(payloads)), 1)


def test_01_mega_raid_collector_installed_and_okay(self):
"""Test mega raid collector can fetch correct number of metrics."""
mega_raid_collector = MegaRAIDCollector(Mock())
Expand Down Expand Up @@ -1235,3 +1236,18 @@ def test_210_redfish_create_smart_storage_health_metric_payload(self):
)
],
)

def test_1000_collector_fetch_failed(self):
    """Test each collector yields only its failed metric when fetch raises."""
    cases = [
        (MegaRAIDCollector, "megaraidcollector_collector_failed"),
        (RedfishCollector, "redfishcollector_collector_failed"),
        (IpmiSensorsCollector, "ipmisensorscollector_collector_failed"),
    ]
    for collector_cls, expected_name in cases:
        # subTest reports every failing collector by name instead of
        # aborting the whole test at the first failure.
        with self.subTest(collector=collector_cls.__name__):
            collector = collector_cls(Mock())
            # Force the fetch step to blow up so collect() has to fall back.
            collector.fetch = Mock(side_effect=Exception("Unknow error"))

            metrics = list(collector.collect())

            self.assertEqual(len(metrics), 1)
            self.assertEqual(metrics[0].name, expected_name)
            self.assertEqual(metrics[0].samples[0].value, 1.0)

0 comments on commit 11b60ec

Please sign in to comment.