From 3f03b340750ee58aee57b455a101b24bf329864b Mon Sep 17 00:00:00 2001 From: Patrick Schork <354473+pschork@users.noreply.github.com> Date: Tue, 14 May 2024 09:51:27 -0700 Subject: [PATCH] Add metrics for reachability status Disable reachability goroutine if configuration is broken --- node/metrics.go | 10 ++++++++++ node/node.go | 28 ++++++++++++++++------------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/node/metrics.go b/node/metrics.go index 5b7fc53ce7..65bf149fbb 100644 --- a/node/metrics.go +++ b/node/metrics.go @@ -43,6 +43,8 @@ type Metrics struct { AccuSocketUpdates prometheus.Counter // avs node spec eigen_ metrics: https://eigen.nethermind.io/docs/spec/metrics/metrics-prom-spec EigenMetrics eigenmetrics.Metrics + // Reachability gauge for monitoring the reachability of the node's retrieval/dispersal sockets + ReachabilityGauge *prometheus.GaugeVec registry *prometheus.Registry // socketAddr is the address at which the metrics server will be listening. @@ -129,6 +131,14 @@ func NewMetrics(eigenMetrics eigenmetrics.Metrics, reg *prometheus.Registry, log Help: "the total number of node's socket address updates", }, ), + ReachabilityGauge: promauto.With(reg).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: Namespace, + Name: "reachability_status", + Help: "the reachability status of the nodes retrievel/dispersal sockets", + }, + []string{"service"}, + ), EigenMetrics: eigenMetrics, logger: logger.With("component", "NodeMetrics"), registry: reg, diff --git a/node/node.go b/node/node.go index 95e0cb4e22..bde3732466 100644 --- a/node/node.go +++ b/node/node.go @@ -474,6 +474,17 @@ func (n *Node) checkNodeReachability() { return } + if n.Config.DataApiUrl == "" { + n.Logger.Error("Unable to perform reachability check - NODE_DATAAPI_URL is not defined in .env") + return + } + + checkUrl, err := url.Parse(fmt.Sprintf("%s/api/v1/operators-info/port-check?operator_id=%s", n.Config.DataApiUrl, n.Config.ID.Hex())) + if err != nil { + 
n.Logger.Error("Reachability check failed - invalid check url", err, "checkUrl", checkUrl.String()) + return + } + n.Logger.Info("Start nodeReachabilityCheck goroutine in background to check the reachability of the operator node") ticker := time.NewTicker(time.Duration(n.Config.ReachabilityPollIntervalSec) * time.Second) defer ticker.Stop() @@ -481,18 +492,7 @@ func (n *Node) checkNodeReachability() { for { <-ticker.C - if n.Config.DataApiUrl == "" { - n.Logger.Error("Unable to perform reachability check - NODE_DATAAPI_URL is not defined in .env") - continue - } - - checkUrl, err := url.Parse(fmt.Sprintf("%s/api/v1/operators-info/port-check?operator_id=%s", n.Config.DataApiUrl, n.Config.ID.Hex())) - if err != nil { - n.Logger.Error("Reachability check failed - invalid check url", err, "checkUrl", checkUrl.String()) - return - } - - n.Logger.Info("Calling reachability check", "url", checkUrl.String()) + n.Logger.Debug("Calling reachability check", "url", checkUrl.String()) resp, err := http.Get(checkUrl.String()) if err != nil { @@ -521,13 +521,17 @@ func (n *Node) checkNodeReachability() { if responseObject.DispersalOnline { n.Logger.Info("Reachability check - dispersal socket is ONLINE", "socket", responseObject.DispersalSocket) + n.Metrics.ReachabilityGauge.WithLabelValues("dispersal").Set(1.0) } else { n.Logger.Error("Reachability check - dispersal socket is UNREACHABLE", "socket", responseObject.DispersalSocket) + n.Metrics.ReachabilityGauge.WithLabelValues("dispersal").Set(0.0) } if responseObject.RetrievalOnline { n.Logger.Info("Reachability check - retrieval socket is ONLINE", "socket", responseObject.RetrievalSocket) + n.Metrics.ReachabilityGauge.WithLabelValues("retrieval").Set(1.0) } else { n.Logger.Error("Reachability check - retrieval socket is UNREACHABLE", "socket", responseObject.RetrievalSocket) + n.Metrics.ReachabilityGauge.WithLabelValues("retrieval").Set(0.0) } } }