From a53c69f3e07fb7bd5462520aadf5ffb857a84167 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Fri, 22 Nov 2024 10:30:02 -0800 Subject: [PATCH] add Prometheus metrics for block size calculation errors Signed-off-by: Dmitry Shmulevich --- pkg/metrics/metrics.go | 14 ++++++++++++++ pkg/translate/output.go | 9 +++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 138fb77..ecc2ca7 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -51,12 +51,22 @@ var ( }, []string{"provider"}, ) + + blockSizeValidationErrorsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "blocksize_error_total", + Help: "Total number of blocksize validation errors.", + Subsystem: "topograph", + }, + []string{"type"}, + ) ) func init() { prometheus.MustRegister(httpRequestsTotal) prometheus.MustRegister(httpRequestDuration) prometheus.MustRegister(missingTopologyNodes) + prometheus.MustRegister(blockSizeValidationErrorsTotal) } func Add(provider, engine string, code int, duration time.Duration) { @@ -68,3 +78,7 @@ func Add(provider, engine string, code int, duration time.Duration) { func SetMissingTopology(provider string, count int) { missingTopologyNodes.WithLabelValues(provider).Set(float64(count)) } + +func AddBlockSizeValidationError(errorType string) { + blockSizeValidationErrorsTotal.WithLabelValues(errorType).Inc() +} diff --git a/pkg/translate/output.go b/pkg/translate/output.go index 041b449..252be78 100644 --- a/pkg/translate/output.go +++ b/pkg/translate/output.go @@ -24,7 +24,10 @@ import ( "strconv" "strings" + "k8s.io/klog/v2" + "github.com/NVIDIA/topograph/pkg/engines" + "github.com/NVIDIA/topograph/pkg/metrics" "github.com/NVIDIA/topograph/pkg/topology" ) @@ -98,12 +101,14 @@ func getBlockSize(domainVisited map[string]int, adminBlockSize string) string { blockSizes := strings.Split(adminBlockSize, ",") planningBS, err := strconv.Atoi(blockSizes[0]) if err != nil { - fmt.Printf("Alert, strconv Atoi for admin provided blockSize %v failed with err: %v! Ignoring it\n", blockSizes[0], err) + metrics.AddBlockSizeValidationError("parsing error") + klog.Warningf("Failed to parse blockSize %v: %v. Ignoring.", blockSizes[0], err) } else { if planningBS > 0 && planningBS <= minDomainSize { return adminBlockSize } - fmt.Printf("Alert Overriden planning blockSize of %v does not meet criteria, minimum domain size %v! Ignoring it\n", planningBS, minDomainSize) + metrics.AddBlockSizeValidationError("bad domain size") + klog.Warningf("Overriden planning blockSize of %v does not meet criteria, minimum domain size %v. Ignoring.", planningBS, minDomainSize) } } logDsize := math.Log2(float64(minDomainSize))