Skip to content

Commit

Permalink
Add metrics for reachability status
Browse files Browse the repository at this point in the history
Disable reachability goroutine if configuration is broken
  • Loading branch information
pschork committed May 14, 2024
1 parent 7e2eec5 commit 6f15896
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 12 deletions.
10 changes: 10 additions & 0 deletions node/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ type Metrics struct {
AccuSocketUpdates prometheus.Counter
// avs node spec eigen_ metrics: https://eigen.nethermind.io/docs/spec/metrics/metrics-prom-spec
EigenMetrics eigenmetrics.Metrics
// ReachabilityGauge reports the reachability of the node's retrieval/dispersal sockets
ReachabilityGauge *prometheus.GaugeVec

registry *prometheus.Registry
// socketAddr is the address at which the metrics server will be listening.
Expand Down Expand Up @@ -129,6 +131,14 @@ func NewMetrics(eigenMetrics eigenmetrics.Metrics, reg *prometheus.Registry, log
Help: "the total number of node's socket address updates",
},
),
ReachabilityGauge: promauto.With(reg).NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Name: "reachability_status",
Help: "the reachability status of the nodes retrievel/dispersal sockets",
},
[]string{"service"},
),
EigenMetrics: eigenMetrics,
logger: logger.With("component", "NodeMetrics"),
registry: reg,
Expand Down
28 changes: 16 additions & 12 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -474,25 +474,25 @@ func (n *Node) checkNodeReachability() {
return
}

if n.Config.DataApiUrl == "" {
n.Logger.Error("Unable to perform reachability check - NODE_DATAAPI_URL is not defined in .env")
return
}

checkUrl, err := url.Parse(fmt.Sprintf("%s/api/v1/operators-info/port-check?operator_id=%s", n.Config.DataApiUrl, n.Config.ID.Hex()))
if err != nil {
n.Logger.Error("Reachability check failed - invalid check url", err, "checkUrl", checkUrl.String())
return
}

n.Logger.Info("Start nodeReachabilityCheck goroutine in background to check the reachability of the operator node")
ticker := time.NewTicker(time.Duration(n.Config.ReachabilityPollIntervalSec) * time.Second)
defer ticker.Stop()

for {
<-ticker.C

if n.Config.DataApiUrl == "" {
n.Logger.Error("Unable to perform reachability check - NODE_DATAAPI_URL is not defined in .env")
continue
}

checkUrl, err := url.Parse(fmt.Sprintf("%s/api/v1/operators-info/port-check?operator_id=%s", n.Config.DataApiUrl, n.Config.ID.Hex()))
if err != nil {
n.Logger.Error("Reachability check failed - invalid check url", err, "checkUrl", checkUrl.String())
return
}

n.Logger.Info("Calling reachability check", "url", checkUrl.String())
n.Logger.Debug("Calling reachability check", "url", checkUrl.String())

resp, err := http.Get(checkUrl.String())
if err != nil {
Expand Down Expand Up @@ -521,13 +521,17 @@ func (n *Node) checkNodeReachability() {

if responseObject.DispersalOnline {
n.Logger.Info("Reachability check - dispersal socket is ONLINE", "socket", responseObject.DispersalSocket)
n.Metrics.ReachabilityGauge.WithLabelValues("dispersal").Set(1.0)
} else {
n.Logger.Error("Reachability check - dispersal socket is UNREACHABLE", "socket", responseObject.DispersalSocket)
n.Metrics.ReachabilityGauge.WithLabelValues("dispersal").Set(0.0)
}
if responseObject.RetrievalOnline {
n.Logger.Info("Reachability check - retrieval socket is ONLINE", "socket", responseObject.RetrievalSocket)
n.Metrics.ReachabilityGauge.WithLabelValues("retrieval").Set(1.0)
} else {
n.Logger.Error("Reachability check - retrieval socket is UNREACHABLE", "socket", responseObject.RetrievalSocket)
n.Metrics.ReachabilityGauge.WithLabelValues("retrieval").Set(0.0)
}
}
}

0 comments on commit 6f15896

Please sign in to comment.