From 22901df6982a0eef32ee3361600e5000646ea90c Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Mon, 13 Nov 2023 15:44:58 +0000 Subject: [PATCH] systemd.go: Added watchdog metrics Signed-off-by: Jonathan Davies --- systemd/systemd.go | 94 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/systemd/systemd.go b/systemd/systemd.go index 85a9df7..89ff080 100644 --- a/systemd/systemd.go +++ b/systemd/systemd.go @@ -17,6 +17,7 @@ import ( "context" "fmt" "math" + "strconv" // Register pprof-over-http handlers _ "net/http/pprof" @@ -42,6 +43,7 @@ var ( systemdUser = kingpin.Flag("systemd.collector.user", "Connect to the user systemd instance.").Bool() enableRestartsMetrics = kingpin.Flag("systemd.collector.enable-restart-count", "Enables service restart count metrics. This feature only works with systemd 235 and above.").Bool() enableIPAccountingMetrics = kingpin.Flag("systemd.collector.enable-ip-accounting", "Enables service ip accounting metrics. 
This feature only works with systemd 235 and above.").Bool() + watchdogTimeRE = regexp.MustCompile(`\d+`) ) var unitStatesName = []string{"active", "activating", "deactivating", "inactive", "failed"} @@ -77,6 +79,9 @@ type Collector struct { ipEgressBytes *prometheus.Desc ipIngressPackets *prometheus.Desc ipEgressPackets *prometheus.Desc + watchdogRuntimeSeconds *prometheus.Desc + watchdogLastPingMonotonic *prometheus.Desc + watchdogLastPingTimestamp *prometheus.Desc unitIncludePattern *regexp.Regexp unitExcludePattern *regexp.Regexp @@ -186,6 +191,21 @@ func NewCollector(logger log.Logger) (*Collector, error) { "Service unit egress IP accounting in packets.", []string{"name"}, nil, ) + watchdogRuntimeSeconds := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "watchdog_runtime_seconds"), + "systemd watchdog runtime seconds", + []string{"device"}, nil, + ) + watchdogLastPingMonotonic := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "watchdog_last_ping_monotonic_seconds"), + "systemd watchdog last ping monotonic seconds", + []string{"device"}, nil, + ) + watchdogLastPingTimestamp := prometheus.NewDesc( + prometheus.BuildFQName(namespace, "", "watchdog_last_ping_time_seconds"), + "systemd watchdog last ping time seconds", + []string{"device"}, nil, + ) unitIncludePattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitInclude)) unitExcludePattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitExclude)) @@ -215,6 +235,9 @@ func NewCollector(logger log.Logger) (*Collector, error) { ipEgressPackets: ipEgressPackets, unitIncludePattern: unitIncludePattern, unitExcludePattern: unitExcludePattern, + watchdogRuntimeSeconds: watchdogRuntimeSeconds, + watchdogLastPingMonotonic: watchdogLastPingMonotonic, + watchdogLastPingTimestamp: watchdogLastPingTimestamp, }, nil } @@ -243,7 +266,9 @@ func (c *Collector) Describe(desc chan<- *prometheus.Desc) { desc <- c.ipEgressBytes desc <- c.ipIngressPackets desc <- c.ipEgressPackets - + desc <- 
c.watchdogRuntimeSeconds
+	desc <- c.watchdogLastPingMonotonic
+	desc <- c.watchdogLastPingTimestamp
 }
 
 func parseUnitType(unit dbus.UnitStatus) string {
@@ -259,6 +284,11 @@ func (c *Collector) collect(ch chan<- prometheus.Metric) error {
 	}
 	defer conn.Close()
 
+	err = c.collectWatchdogMetrics(conn, ch)
+	if err != nil {
+		level.Debug(c.logger).Log("msg", "Failed to collect watchdog timestamps", "err", err)
+	}
+
 	allUnits, err := conn.ListUnitsContext(c.ctx)
 	if err != nil {
 		return errors.Wrap(err, "could not get list of systemd units from dbus")
@@ -620,3 +650,72 @@ func (c *Collector) filterUnits(units []dbus.UnitStatus, includePattern, exclude
 
 	return filtered
 }
+
+// collectWatchdogMetrics emits the watchdog metrics reported by the systemd
+// manager: the configured runtime timeout and the last-ping timestamp in its
+// monotonic and plain forms. Each value is converted from microseconds to
+// seconds (divided by 1e6) and labelled with the watchdog device.
+//
+// When the WatchdogDevice property is empty nothing is emitted and nil is
+// returned. If any property cannot be read or parsed, an error is returned
+// and no watchdog metric is emitted.
+func (c *Collector) collectWatchdogMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric) error {
+	watchdogDevice, err := conn.GetManagerProperty("WatchdogDevice")
+	if err != nil {
+		return errors.Wrap(err, "could not get WatchdogDevice manager property")
+	}
+
+	// The value may arrive wrapped in double quotes; strip them so an unset
+	// device is detected and the label carries the bare device name.
+	device := strings.TrimPrefix(strings.TrimSuffix(watchdogDevice, `"`), `"`)
+	if len(device) == 0 {
+		level.Debug(c.logger).Log("msg", "No watchdog configured, ignoring metrics")
+		return nil
+	}
+
+	gauges := []struct {
+		desc     *prometheus.Desc
+		property string
+	}{
+		{c.watchdogRuntimeSeconds, "RuntimeWatchdogUSec"},
+		{c.watchdogLastPingMonotonic, "WatchdogLastPingTimestampMonotonic"},
+		{c.watchdogLastPingTimestamp, "WatchdogLastPingTimestamp"},
+	}
+
+	// Resolve every value before sending anything so that a failure never
+	// leaves a partial set of watchdog metrics on the channel.
+	seconds := make([]float64, len(gauges))
+	for i, g := range gauges {
+		if seconds[i], err = watchdogPropertySeconds(conn, g.property); err != nil {
+			return err
+		}
+	}
+
+	for i, g := range gauges {
+		ch <- prometheus.MustNewConstMetric(g.desc, prometheus.GaugeValue, seconds[i], device)
+	}
+
+	return nil
+}
+
+// watchdogPropertySeconds reads the named systemd manager property, takes the
+// first run of digits in its string form as a microsecond count and returns
+// that count converted to seconds.
+func watchdogPropertySeconds(conn *dbus.Conn, name string) (float64, error) {
+	raw, err := conn.GetManagerProperty(name)
+	if err != nil {
+		return 0, errors.Wrapf(err, "could not get %s manager property", name)
+	}
+
+	digits := watchdogTimeRE.FindString(raw)
+	if digits == "" {
+		return 0, errors.Errorf("manager property %s has no numeric value: %q", name, raw)
+	}
+
+	usec, err := strconv.ParseFloat(digits, 64)
+	if err != nil {
+		return 0, errors.Wrapf(err, "could not parse %s manager property %q", name, raw)
+	}
+
+	return usec / 1e6, nil
+}