From 4190d49ba04c649a5f07706b45f3d019752f4056 Mon Sep 17 00:00:00 2001 From: Cody Littley <56973212+cody-littley@users.noreply.github.com> Date: Wed, 11 Dec 2024 08:07:49 -0600 Subject: [PATCH] Remove wrapper from relay (#975) Signed-off-by: Cody Littley --- common/metrics/count_metric.go | 101 ------ common/metrics/gauge_metric.go | 103 ------ common/metrics/label_maker.go | 73 ----- common/metrics/latency_metric.go | 102 ------ common/metrics/metrics.go | 148 --------- common/metrics/metrics_server.go | 434 -------------------------- common/metrics/mock_metrics.go | 159 ---------- common/metrics/test/main.go | 131 -------- common/metrics/test/metrics.md | 71 ----- relay/cache/cache_accessor.go | 12 +- relay/cache/cache_accessor_metrics.go | 147 +++++---- relay/limiter/blob_rate_limiter.go | 6 +- relay/limiter/chunk_rate_limiter.go | 12 +- relay/mdoc/main.go | 24 -- relay/mdoc/relay-metrics.md | 426 ------------------------- relay/metrics/metrics.go | 397 +++++++++++++---------- relay/server.go | 32 +- 17 files changed, 349 insertions(+), 2029 deletions(-) delete mode 100644 common/metrics/count_metric.go delete mode 100644 common/metrics/gauge_metric.go delete mode 100644 common/metrics/label_maker.go delete mode 100644 common/metrics/latency_metric.go delete mode 100644 common/metrics/metrics.go delete mode 100644 common/metrics/metrics_server.go delete mode 100644 common/metrics/mock_metrics.go delete mode 100644 common/metrics/test/main.go delete mode 100644 common/metrics/test/metrics.md delete mode 100644 relay/mdoc/main.go delete mode 100644 relay/mdoc/relay-metrics.md diff --git a/common/metrics/count_metric.go b/common/metrics/count_metric.go deleted file mode 100644 index f52b942cb6..0000000000 --- a/common/metrics/count_metric.go +++ /dev/null @@ -1,101 +0,0 @@ -package metrics - -import ( - "fmt" - "github.com/Layr-Labs/eigensdk-go/logging" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -var _ CountMetric = &countMetric{} - -// countMetric a standard implementation of the CountMetric. -type countMetric struct { - Metric - - // logger is the logger used to log errors. - logger logging.Logger - - // name is the name of the metric. - name string - - // description is the description of the metric. - description string - - // counter is the prometheus counter used to report this metric. - vec *prometheus.CounterVec - - // labeler is the label maker used to create labels for this metric. - labeler *labelMaker -} - -// newCountMetric creates a new CountMetric instance. -func newCountMetric( - logger logging.Logger, - registry *prometheus.Registry, - namespace string, - name string, - description string, - labelTemplate any) (CountMetric, error) { - - labeler, err := newLabelMaker(labelTemplate) - if err != nil { - return nil, err - } - - vec := promauto.With(registry).NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: fmt.Sprintf("%s_count", name), - }, - labeler.getKeys(), - ) - - return &countMetric{ - logger: logger, - name: name, - description: description, - vec: vec, - labeler: labeler, - }, nil -} - -func (m *countMetric) Name() string { - return m.name -} - -func (m *countMetric) Unit() string { - return "count" -} - -func (m *countMetric) Description() string { - return m.description -} - -func (m *countMetric) Type() string { - return "counter" -} - -func (m *countMetric) LabelFields() []string { - return m.labeler.getKeys() -} - -func (m *countMetric) Increment(label ...any) { - m.Add(1, label...) -} - -func (m *countMetric) Add(value float64, label ...any) { - var l any - if len(label) > 0 { - l = label[0] - } - - values, err := m.labeler.extractValues(l) - if err != nil { - m.logger.Errorf("error extracting values from label for metric %s: %v", m.name, err) - return - } - - observer := m.vec.WithLabelValues(values...) - observer.Add(value) -} diff --git a/common/metrics/gauge_metric.go b/common/metrics/gauge_metric.go deleted file mode 100644 index ca3c80ae74..0000000000 --- a/common/metrics/gauge_metric.go +++ /dev/null @@ -1,103 +0,0 @@ -package metrics - -import ( - "fmt" - "github.com/Layr-Labs/eigensdk-go/logging" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -var _ GaugeMetric = &gaugeMetric{} - -// gaugeMetric is a standard implementation of the GaugeMetric interface via prometheus. -type gaugeMetric struct { - Metric - - // logger is the logger used to log errors. - logger logging.Logger - - // name is the name of the metric. - name string - - // unit is the unit of the metric. - unit string - - // description is the description of the metric. - description string - - // gauge is the prometheus gauge used to report this metric. - vec *prometheus.GaugeVec - - // labeler is the label maker used to create labels for this metric. - labeler *labelMaker -} - -// newGaugeMetric creates a new GaugeMetric instance. -func newGaugeMetric( - logger logging.Logger, - registry *prometheus.Registry, - namespace string, - name string, - unit string, - description string, - labelTemplate any) (GaugeMetric, error) { - - labeler, err := newLabelMaker(labelTemplate) - if err != nil { - return nil, err - } - - vec := promauto.With(registry).NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: fmt.Sprintf("%s_%s", name, unit), - }, - labeler.getKeys(), - ) - - return &gaugeMetric{ - logger: logger, - name: name, - unit: unit, - description: description, - vec: vec, - labeler: labeler, - }, nil -} - -func (m *gaugeMetric) Name() string { - return m.name -} - -func (m *gaugeMetric) Unit() string { - return m.unit -} - -func (m *gaugeMetric) Description() string { - return m.description -} - -func (m *gaugeMetric) Type() string { - return "gauge" -} - -func (m *gaugeMetric) LabelFields() []string { - return m.labeler.getKeys() -} - -func (m *gaugeMetric) Set(value float64, label ...any) { - var l any - if len(label) > 0 { - l = label[0] - } - - values, err := m.labeler.extractValues(l) - if err != nil { - m.logger.Errorf("failed to extract values from label: %v", err) - return - } - - observer := m.vec.WithLabelValues(values...) - - observer.Set(value) -} diff --git a/common/metrics/label_maker.go b/common/metrics/label_maker.go deleted file mode 100644 index cb56ecdfc7..0000000000 --- a/common/metrics/label_maker.go +++ /dev/null @@ -1,73 +0,0 @@ -package metrics - -import ( - "fmt" - "reflect" -) - -// labelMaker encapsulates logic for creating labels for metrics. -type labelMaker struct { - keys []string - emptyValues []string - templateType reflect.Type - labelCount int -} - -// newLabelMaker creates a new labelMaker instance given a label template. The label template may be nil. -func newLabelMaker(labelTemplate any) (*labelMaker, error) { - labeler := &labelMaker{ - keys: make([]string, 0), - } - - if labelTemplate == nil { - return labeler, nil - } - - v := reflect.ValueOf(labelTemplate) - if v.Kind() != reflect.Struct { - return nil, fmt.Errorf("label template must be a struct") - } - - t := v.Type() - labeler.templateType = t - for i := 0; i < t.NumField(); i++ { - - fieldType := t.Field(i).Type - if fieldType.Kind() != reflect.String { - return nil, fmt.Errorf( - "field %s has type %v, only string fields are supported", t.Field(i).Name, fieldType) - } - - labeler.keys = append(labeler.keys, t.Field(i).Name) - } - - labeler.emptyValues = make([]string, len(labeler.keys)) - labeler.labelCount = len(labeler.keys) - - return labeler, nil -} - -// getKeys provides the keys for the label struct. -func (l *labelMaker) getKeys() []string { - return l.keys -} - -// extractValues extracts the values from the given label struct. -func (l *labelMaker) extractValues(label any) ([]string, error) { - if l.templateType == nil || label == nil { - return l.emptyValues, nil - } - - if l.templateType != reflect.TypeOf(label) { - return nil, fmt.Errorf( - "label type mismatch, expected %v, got %v", l.templateType, reflect.TypeOf(label)) - } - - values := make([]string, 0, l.labelCount) - for i := 0; i < l.labelCount; i++ { - v := reflect.ValueOf(label) - values = append(values, v.Field(i).String()) - } - - return values, nil -} diff --git a/common/metrics/latency_metric.go b/common/metrics/latency_metric.go deleted file mode 100644 index 1309a9b33d..0000000000 --- a/common/metrics/latency_metric.go +++ /dev/null @@ -1,102 +0,0 @@ -package metrics - -import ( - "fmt" - "github.com/Layr-Labs/eigensdk-go/logging" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "time" -) - -var _ LatencyMetric = &latencyMetric{} - -// latencyMetric is a standard implementation of the LatencyMetric interface via prometheus. -type latencyMetric struct { - Metric - - // logger is the logger used to log errors. - logger logging.Logger - - // name is the name of the metric. - name string - - // description is the description of the metric. - description string - - // vec is the prometheus summary vector used to report this metric. - vec *prometheus.SummaryVec - - // lm is the label maker used to create labels for this metric. - labeler *labelMaker -} - -// newLatencyMetric creates a new LatencyMetric instance. -func newLatencyMetric( - logger logging.Logger, - registry *prometheus.Registry, - namespace string, - name string, - description string, - objectives map[float64]float64, - labelTemplate any) (LatencyMetric, error) { - - labeler, err := newLabelMaker(labelTemplate) - if err != nil { - return nil, err - } - - vec := promauto.With(registry).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: fmt.Sprintf("%s_ms", name), - Objectives: objectives, - }, - labeler.getKeys(), - ) - - return &latencyMetric{ - logger: logger, - name: name, - description: description, - vec: vec, - labeler: labeler, - }, nil -} - -func (m *latencyMetric) Name() string { - return m.name -} - -func (m *latencyMetric) Unit() string { - return "ms" -} - -func (m *latencyMetric) Description() string { - return m.description -} - -func (m *latencyMetric) Type() string { - return "latency" -} - -func (m *latencyMetric) LabelFields() []string { - return m.labeler.getKeys() -} - -func (m *latencyMetric) ReportLatency(latency time.Duration, label ...any) { - var l any - if len(label) > 0 { - l = label[0] - } - - values, err := m.labeler.extractValues(l) - if err != nil { - m.logger.Errorf("error extracting values from label: %v", err) - } - - observer := m.vec.WithLabelValues(values...) - - nanoseconds := float64(latency.Nanoseconds()) - milliseconds := nanoseconds / float64(time.Millisecond) - observer.Observe(milliseconds) -} diff --git a/common/metrics/metrics.go b/common/metrics/metrics.go deleted file mode 100644 index 4fbf5d0547..0000000000 --- a/common/metrics/metrics.go +++ /dev/null @@ -1,148 +0,0 @@ -package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - "time" -) - -// Metrics provides a convenient interface for reporting metrics. -type Metrics interface { - // Start starts the metrics server. - Start() error - - // Stop stops the metrics server. - Stop() error - - // GenerateMetricsDocumentation generates documentation for all currently registered metrics. - // Documentation is returned as a string in markdown format. - GenerateMetricsDocumentation() string - - // WriteMetricsDocumentation writes documentation for all currently registered metrics to a file. - // Documentation is written in markdown format. - WriteMetricsDocumentation(fileName string) error - - // NewLatencyMetric creates a new LatencyMetric instance. Useful for reporting the latency of an operation. - // Metric name and label may only contain alphanumeric characters and underscores. - // - // The labelTemplate parameter is the label type that will be used for this metric. Each field becomes a label for - // the metric. Each field type must be a string. If no labels are needed, pass nil. - NewLatencyMetric( - name string, - description string, - labelTemplate any, - quantiles ...*Quantile) (LatencyMetric, error) - - // NewCountMetric creates a new CountMetric instance. Useful for tracking the count of a type of event. - // Metric name and label may only contain alphanumeric characters and underscores. - // - // The labelTemplate parameter is the label type that will be used for this metric. Each field becomes a label for - // the metric. Each field type must be a string. If no labels are needed, pass nil. - NewCountMetric( - name string, - description string, - labelTemplate any) (CountMetric, error) - - // NewGaugeMetric creates a new GaugeMetric instance. Useful for reporting specific values. - // Metric name and label may only contain alphanumeric characters and underscores. - // - // The labelTemplate parameter is the label type that will be used for this metric. Each field becomes a label for - // the metric. Each field type must be a string. If no labels are needed, pass nil. - NewGaugeMetric( - name string, - unit string, - description string, - labelTemplate any) (GaugeMetric, error) - - // NewAutoGauge creates a new GaugeMetric instance that is automatically updated by the given source function. - // The function is polled at the given period. This produces a gauge type metric internally. - // Metric name and label may only contain alphanumeric characters and underscores. - // - // The label parameter accepts zero or one label. - NewAutoGauge( - name string, - unit string, - description string, - pollPeriod time.Duration, - source func() float64, - label ...any) error - - // RegisterExternalMetrics registers prometheus collectors created outside the metrics framework. - RegisterExternalMetrics(collectors ...prometheus.Collector) -} - -// Metric represents a metric that can be reported. -type Metric interface { - - // Name returns the name of the metric. - Name() string - - // Unit returns the unit of the metric. - Unit() string - - // Description returns the description of the metric. Should be a one or two sentence human-readable description. - Description() string - - // Type returns the type of the metric. - Type() string - - // LabelFields returns the fields of the label template. - LabelFields() []string -} - -// GaugeMetric allows specific values to be reported. -type GaugeMetric interface { - Metric - - // Set sets the value of a gauge metric. - // - // The label parameter accepts zero or one label. If the label type does not match the template label type provided - // when creating the metric, an error will be returned. - Set(value float64, label ...any) -} - -// CountMetric allows the count of a type of event to be tracked. -type CountMetric interface { - Metric - - // Increment increments the count by 1. - // - // The label parameter accepts zero or one label. If the label type does not match the template label type provided - // when creating the metric, an error will be returned. - Increment(label ...any) - - // Add increments the count by the given value. - // - // The label parameter accepts zero or one label. If the label type does not match the template label type provided - // when creating the metric, an error will be returned. - Add(value float64, label ...any) -} - -// Quantile describes a quantile of a latency metric that should be reported. For a description of how -// to interpret a quantile, see the prometheus documentation -// https://github.com/prometheus/client_golang/blob/v1.20.5/prometheus/summary.go#L126 -type Quantile struct { - Quantile float64 - Error float64 -} - -// NewQuantile creates a new Quantile instance. Error is set to 1% of the quantile. -func NewQuantile(quantile float64) *Quantile { - return &Quantile{ - Quantile: quantile, - Error: quantile / 100.0, - } -} - -// LatencyMetric allows the latency of an operation to be tracked. Similar to a gauge metric, but specialized for time. -// -// The label parameter accepts zero or one label. If the label type does not match the template label type provided -// when creating the metric, an error will be returned. -type LatencyMetric interface { - Metric - - // ReportLatency reports a latency value. - // - // The label parameter accepts zero or one label. If the label type does not match the template label type provided - // when creating the metric, an error will be returned. - ReportLatency(latency time.Duration, label ...any) -} diff --git a/common/metrics/metrics_server.go b/common/metrics/metrics_server.go deleted file mode 100644 index 7a5868dd0d..0000000000 --- a/common/metrics/metrics_server.go +++ /dev/null @@ -1,434 +0,0 @@ -package metrics - -import ( - "errors" - "fmt" - "github.com/Layr-Labs/eigensdk-go/logging" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/collectors" - "github.com/prometheus/client_golang/prometheus/promhttp" - "net/http" - "os" - "regexp" - "slices" - "strings" - "sync" - "sync/atomic" - "time" -) - -var _ Metrics = &metrics{} - -// metrics is a standard implementation of the Metrics interface via prometheus. -type metrics struct { - // logger is the logger used to log messages. - logger logging.Logger - - // namespace is prepended to all metric names. - namespace string - - // registry is the prometheus registry used to report metrics. - registry *prometheus.Registry - - // A map from metricID to Metric instance. If a metric is requested but that metric - // already exists, the existing metric will be returned instead of a new one being created. - metricMap map[metricID]Metric - - // autoGaugesToStart is a list of functions that will start auto-gauges. If an auto-gauge is created - // before the metrics server is started, we don't actually start the goroutine until the server is started. - autoGaugesToStart []func() - - // lock is a lock used to ensure that metrics are not created concurrently. - lock sync.Mutex - - // started is true if the metrics server has been started. - started bool - - // isAlize is true if the metrics server has not been stopped. - isAlive atomic.Bool - - // server is the metrics server - server *http.Server - - // quantilesMap contains a string describing the quantiles for each latency metric. Used to generate documentation. - quantilesMap map[metricID]string -} - -// NewMetrics creates a new Metrics instance. -func NewMetrics(logger logging.Logger, namespace string, port int) Metrics { - reg := prometheus.NewRegistry() - reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) - reg.MustRegister(collectors.NewGoCollector()) - - logger.Infof("Starting metrics server at port %d", port) - addr := fmt.Sprintf(":%d", port) - mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.HandlerFor( - reg, - promhttp.HandlerOpts{}, - )) - server := &http.Server{ - Addr: addr, - Handler: mux, - } - - m := &metrics{ - logger: logger, - namespace: namespace, - registry: reg, - metricMap: make(map[metricID]Metric), - isAlive: atomic.Bool{}, - server: server, - quantilesMap: make(map[metricID]string), - } - m.isAlive.Store(true) - return m -} - -// metricID is a unique identifier for a metric. -type metricID struct { - name string - unit string -} - -var legalCharactersRegex = regexp.MustCompile(`^[a-zA-Z0-9_]+$`) - -// containsLegalCharacters returns true if the string contains only legal characters (alphanumeric and underscore). -func containsLegalCharacters(s string) bool { - return legalCharactersRegex.MatchString(s) -} - -// newMetricID creates a new metricID instance. -func newMetricID(name string, unit string) (metricID, error) { - if !containsLegalCharacters(name) { - return metricID{}, fmt.Errorf("invalid metric name: %s", name) - } - if !containsLegalCharacters(unit) { - return metricID{}, fmt.Errorf("invalid metric unit: %s", unit) - } - return metricID{ - name: name, - unit: unit, - }, nil -} - -// NameWithUnit returns the name of the metric with the unit appended. -func (i *metricID) NameWithUnit() string { - return fmt.Sprintf("%s_%s", i.name, i.unit) -} - -// Start starts the metrics server. -func (m *metrics) Start() error { - m.lock.Lock() - defer m.lock.Unlock() - - if m.started { - return errors.New("metrics server already started") - } - m.started = true - - go func() { - err := m.server.ListenAndServe() - if err != nil && !strings.Contains(err.Error(), "http: Server closed") { - m.logger.Errorf("metrics server error: %v", err) - } - }() - - // start the auto-gauges that were created before the server was started - for _, autoGauge := range m.autoGaugesToStart { - go autoGauge() - } - - return nil -} - -// Stop stops the metrics server. -func (m *metrics) Stop() error { - m.lock.Lock() - defer m.lock.Unlock() - - if !m.started { - return errors.New("metrics server not started") - } - - if !m.isAlive.Load() { - return errors.New("metrics server already stopped") - } - - m.isAlive.Store(false) - return m.server.Close() -} - -// NewLatencyMetric creates a new LatencyMetric instance. -func (m *metrics) NewLatencyMetric( - name string, - description string, - labelTemplate any, - quantiles ...*Quantile) (LatencyMetric, error) { - - m.lock.Lock() - defer m.lock.Unlock() - - if !m.isAlive.Load() { - return nil, errors.New("metrics server is not alive") - } - - id, err := newMetricID(name, "ms") - if err != nil { - return nil, err - } - - preExistingMetric, ok := m.metricMap[id] - if ok { - return preExistingMetric.(LatencyMetric), nil - } - - quantilesString := "" - objectives := make(map[float64]float64, len(quantiles)) - for i, q := range quantiles { - objectives[q.Quantile] = q.Error - - quantilesString += fmt.Sprintf("`%.3f`", q.Quantile) - if i < len(quantiles)-1 { - quantilesString += ", " - } - } - m.quantilesMap[id] = quantilesString - - metric, err := newLatencyMetric( - m.logger, - m.registry, - m.namespace, - name, - description, - objectives, - labelTemplate) - - if err != nil { - return nil, err - } - - m.metricMap[id] = metric - return metric, nil -} - -// NewCountMetric creates a new CountMetric instance. -func (m *metrics) NewCountMetric( - name string, - description string, - labelTemplate any) (CountMetric, error) { - - m.lock.Lock() - defer m.lock.Unlock() - - if !m.isAlive.Load() { - return nil, errors.New("metrics server is not alive") - } - - id, err := newMetricID(name, "count") - if err != nil { - return nil, err - } - - preExistingMetric, ok := m.metricMap[id] - if ok { - return preExistingMetric.(CountMetric), nil - } - - metric, err := newCountMetric( - m.logger, - m.registry, - m.namespace, - name, description, - labelTemplate) - - if err != nil { - return nil, err - } - - m.metricMap[id] = metric - - return metric, nil -} - -// NewGaugeMetric creates a new GaugeMetric instance. -func (m *metrics) NewGaugeMetric( - name string, - unit string, - description string, - labelTemplate any) (GaugeMetric, error) { - - m.lock.Lock() - defer m.lock.Unlock() - return m.newGaugeMetricUnsafe(name, unit, description, labelTemplate) -} - -// newGaugeMetricUnsafe creates a new GaugeMetric instance without locking. -func (m *metrics) newGaugeMetricUnsafe( - name string, - unit string, - description string, - labelTemplate any) (GaugeMetric, error) { - - if !m.isAlive.Load() { - return nil, errors.New("metrics server is not alive") - } - - id, err := newMetricID(name, unit) - if err != nil { - return nil, err - } - - preExistingMetric, ok := m.metricMap[id] - if ok { - return preExistingMetric.(GaugeMetric), nil - } - - metric, err := newGaugeMetric( - m.logger, - m.registry, - m.namespace, - name, - unit, - description, - labelTemplate) - - if err != nil { - return nil, err - } - - m.metricMap[id] = metric - return metric, nil -} - -func (m *metrics) NewAutoGauge( - name string, - unit string, - description string, - pollPeriod time.Duration, - source func() float64, - label ...any) error { - - m.lock.Lock() - defer m.lock.Unlock() - - if !m.isAlive.Load() { - return errors.New("metrics server is not alive") - } - - if len(label) > 1 { - return fmt.Errorf("too many labels provided, expected 1, got %d", len(label)) - } - var l any - if len(label) == 1 { - l = label[0] - } - - gauge, err := m.newGaugeMetricUnsafe(name, unit, description, l) - if err != nil { - return err - } - - pollingAgent := func() { - ticker := time.NewTicker(pollPeriod) - for m.isAlive.Load() { - value := source() - - gauge.Set(value, l) - <-ticker.C - } - } - - if m.started { - // start the polling agent immediately - go pollingAgent() - } else { - // the polling agent will be started when the metrics server is started - m.autoGaugesToStart = append(m.autoGaugesToStart, pollingAgent) - } - - return nil -} - -func (m *metrics) GenerateMetricsDocumentation() string { - sb := &strings.Builder{} - - metricIDs := make([]*metricID, 0, len(m.metricMap)) - for id := range m.metricMap { - boundID := id - metricIDs = append(metricIDs, &boundID) - } - - // sort the metric IDs alphabetically - sortFunc := func(a *metricID, b *metricID) int { - if a.name != b.name { - return strings.Compare(a.name, b.name) - } - return strings.Compare(a.unit, b.unit) - } - slices.SortFunc(metricIDs, sortFunc) - - sb.Write([]byte(fmt.Sprintf("# Metrics Documentation for namespace '%s'\n\n", m.namespace))) - sb.Write([]byte(fmt.Sprintf("This documentation was automatically generated at time `%s`\n\n", - time.Now().Format(time.RFC3339)))) - - sb.Write([]byte(fmt.Sprintf("There are a total of `%d` registered metrics.\n\n", len(m.metricMap)))) - - for _, id := range metricIDs { - metric := m.metricMap[*id] - - sb.Write([]byte("---\n\n")) - sb.Write([]byte(fmt.Sprintf("## %s\n\n", id.NameWithUnit()))) - - sb.Write([]byte(fmt.Sprintf("%s\n\n", metric.Description()))) - - sb.Write([]byte("| | |\n")) - sb.Write([]byte("|---|---|\n")) - sb.Write([]byte(fmt.Sprintf("| **Name** | `%s` |\n", metric.Name()))) - sb.Write([]byte(fmt.Sprintf("| **Unit** | `%s` |\n", metric.Unit()))) - labels := metric.LabelFields() - if len(labels) > 0 { - sb.Write([]byte("| **Labels** | ")) - for i, label := range labels { - sb.Write([]byte(fmt.Sprintf("`%s`", label))) - if i < len(labels)-1 { - sb.Write([]byte(", ")) - } - } - sb.Write([]byte(" |\n")) - } - sb.Write([]byte(fmt.Sprintf("| **Type** | `%s` |\n", metric.Type()))) - if metric.Type() == "latency" { - sb.Write([]byte(fmt.Sprintf("| **Quantiles** | %s |\n", m.quantilesMap[*id]))) - } - sb.Write([]byte(fmt.Sprintf("| **Fully Qualified Name** | `%s_%s_%s` |\n", - m.namespace, id.name, id.unit))) - } - - return sb.String() -} - -func (m *metrics) WriteMetricsDocumentation(fileName string) error { - doc := m.GenerateMetricsDocumentation() - - file, err := os.Create(fileName) - if err != nil { - return fmt.Errorf("error creating file: %v", err) - } - - _, err = file.Write([]byte(doc)) - if err != nil { - return fmt.Errorf("error writing to file: %v", err) - } - - err = file.Close() - if err != nil { - return fmt.Errorf("error closing file: %v", err) - } - - return nil -} - -func (m *metrics) RegisterExternalMetrics(collectors ...prometheus.Collector) { - m.registry.MustRegister(collectors...) -} diff --git a/common/metrics/mock_metrics.go b/common/metrics/mock_metrics.go deleted file mode 100644 index 244f268c90..0000000000 --- a/common/metrics/mock_metrics.go +++ /dev/null @@ -1,159 +0,0 @@ -package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - "time" -) - -var _ Metrics = &mockMetrics{} - -// mockMetrics is a mock implementation of the Metrics interface. -type mockMetrics struct { -} - -// NewMockMetrics creates a new mock Metrics instance. -// Suitable for testing or for when you just want to disable all metrics. -func NewMockMetrics() Metrics { - return &mockMetrics{} -} - -func (m *mockMetrics) GenerateMetricsDocumentation() string { - return "" -} - -func (m *mockMetrics) WriteMetricsDocumentation(fileName string) error { - return nil -} - -func (m *mockMetrics) Start() error { - return nil -} - -func (m *mockMetrics) Stop() error { - return nil -} - -func (m *mockMetrics) NewLatencyMetric( - name string, - description string, - templateLabel any, - quantiles ...*Quantile) (LatencyMetric, error) { - return &mockLatencyMetric{}, nil -} - -func (m *mockMetrics) NewCountMetric(name string, description string, templateLabel any) (CountMetric, error) { - return &mockCountMetric{}, nil -} - -func (m *mockMetrics) NewGaugeMetric( - name string, - unit string, - description string, - labelTemplate any) (GaugeMetric, error) { - return &mockGaugeMetric{}, nil -} - -func (m *mockMetrics) NewAutoGauge( - name string, - unit string, - description string, - pollPeriod time.Duration, - source func() float64, - label ...any) error { - return nil -} - -func (m *mockMetrics) RegisterExternalMetrics(collectors ...prometheus.Collector) { - -} - -var _ CountMetric = &mockCountMetric{} - -type mockCountMetric struct { -} - -func (m *mockCountMetric) Name() string { - return "" -} - -func (m *mockCountMetric) Unit() string { - return "" -} - -func (m *mockCountMetric) Description() string { - return "" -} - -func (m *mockCountMetric) Type() string { - return "" -} - -func (m *mockCountMetric) LabelFields() []string { - return make([]string, 0) -} - -func (m *mockCountMetric) Increment(label ...any) { - -} - -func (m *mockCountMetric) Add(value float64, label ...any) { - -} - -var _ GaugeMetric = &mockGaugeMetric{} - -type mockGaugeMetric struct { -} - -func (m *mockGaugeMetric) Name() string { - return "" -} - -func (m *mockGaugeMetric) Unit() string { - return "" -} - -func (m *mockGaugeMetric) Description() string { - return "" -} - -func (m *mockGaugeMetric) Type() string { - return "" -} - -func (m *mockGaugeMetric) LabelFields() []string { - return make([]string, 0) -} - -func (m *mockGaugeMetric) Set(value float64, label ...any) { - -} - -var _ LatencyMetric = &mockLatencyMetric{} - -type mockLatencyMetric struct { -} - -func (m *mockLatencyMetric) Name() string { - return "" -} - -func (m *mockLatencyMetric) Unit() string { - return "" -} - -func (m *mockLatencyMetric) Description() string { - return "" -} - -func (m *mockLatencyMetric) Type() string { - return "" -} - -func (m *mockLatencyMetric) LabelFields() []string { - return make([]string, 0) -} - -func (m *mockLatencyMetric) ReportLatency(latency time.Duration, label ...any) { - -} diff --git a/common/metrics/test/main.go b/common/metrics/test/main.go deleted file mode 100644 index 430bfc75ba..0000000000 --- a/common/metrics/test/main.go +++ /dev/null @@ -1,131 +0,0 @@ -package main - -import ( - "fmt" - "github.com/Layr-Labs/eigenda/common" - "github.com/Layr-Labs/eigenda/common/metrics" - "math/rand" - "sync/atomic" - "time" -) - -// This is a simple test bed for validating the metrics server (since it's not straight forward to unit test). - -type LabelType1 struct { - foo string - bar string - baz string -} - -type LabelType2 struct { - X string - Y string - Z string -} - -func main() { - logger, err := common.NewLogger(common.DefaultLoggerConfig()) - if err != nil { - panic(err) - } - - metricsServer := metrics.NewMetrics(logger, "test", 9101) - - l1, err := metricsServer.NewLatencyMetric( - "l1", - "this metric shows the latency of the sleep cycle", - LabelType1{}, - metrics.NewQuantile(0.5), - metrics.NewQuantile(0.9), - metrics.NewQuantile(0.99)) - if err != nil { - panic(err) - } - - c1, err := metricsServer.NewCountMetric( - "c1", - "this metric shows the number of times the sleep cycle has been executed", - LabelType2{}) - if err != nil { - panic(err) - } - - c2, err := metricsServer.NewCountMetric( - "c2", - "the purpose of this counter is to test what happens if we don't provide a label template", - nil) - if err != nil { - panic(err) - } - - g1, err := metricsServer.NewGaugeMetric( - "g1", - "milliseconds", - "this metric shows the duration of the most recent sleep cycle", - LabelType1{}) - if err != nil { - panic(err) - } - - sum := atomic.Int64{} - err = metricsServer.NewAutoGauge( - "g2", - "milliseconds", - "this metric shows the sum of all sleep cycles", - 1*time.Second, - func() float64 { - return float64(sum.Load()) - }, - LabelType2{X: "sum"}) - if err != nil { - panic(err) - } - - err = metricsServer.WriteMetricsDocumentation("metrics.md") - if err != nil { - panic(err) - } - - err = metricsServer.Start() - if err != nil { - panic(err) - } - - prev := time.Now() - for i := 0; i < 100; i++ { - fmt.Printf("Iteration %d\n", i) - time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond) - now := time.Now() - elapsed := now.Sub(prev) - prev = now - - l1.ReportLatency(elapsed) - - l1.ReportLatency(elapsed/2, - LabelType1{ - foo: "half of the normal value", - bar: "42", - baz: "true", - }) - - c1.Increment() - c1.Add(2, LabelType2{ - X: "2x", - }) - c2.Increment() - - g1.Set(float64(elapsed.Milliseconds()), - LabelType1{ - foo: "bar", - bar: "baz", - baz: "foo", - }) - - sum.Store(sum.Load() + elapsed.Milliseconds()) - } - - err = metricsServer.Stop() - if err != nil { - panic(err) - } -} diff --git a/common/metrics/test/metrics.md b/common/metrics/test/metrics.md deleted file mode 100644 index dfb98804a5..0000000000 --- a/common/metrics/test/metrics.md +++ /dev/null @@ -1,71 +0,0 @@ -# Metrics Documentation for namespace 'test' - -This documentation was automatically generated at time `2024-11-25T12:46:49-06:00` - -There are a total of `5` registered metrics. - ---- - -## c1_count - -this metric shows the number of times the sleep cycle has been executed - -| | | -|---|---| -| **Name** | `c1` | -| **Unit** | `count` | -| **Labels** | `X`, `Y`, `Z` | -| **Type** | `counter` | -| **Fully Qualified Name** | `test_c1_count` | ---- - -## c2_count - -the purpose of this counter is to test what happens if we don't provide a label template - -| | | -|---|---| -| **Name** | `c2` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `test_c2_count` | ---- - -## g1_milliseconds - -this metric shows the duration of the most recent sleep cycle - -| | | -|---|---| -| **Name** | `g1` | -| **Unit** | `milliseconds` | -| **Labels** | `foo`, `bar`, `baz` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `test_g1_milliseconds` | ---- - -## g2_milliseconds - -this metric shows the sum of all sleep cycles - -| | | -|---|---| -| **Name** | `g2` | -| **Unit** | `milliseconds` | -| **Labels** | `X`, `Y`, `Z` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `test_g2_milliseconds` | ---- - -## l1_ms - -this metric shows the latency of the sleep cycle - -| | | -|---|---| -| **Name** | `l1` | -| **Unit** | `ms` | -| **Labels** | `foo`, `bar`, `baz` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `test_l1_ms` | diff --git a/relay/cache/cache_accessor.go b/relay/cache/cache_accessor.go index 83c1f74484..c9ad42611a 100644 --- a/relay/cache/cache_accessor.go +++ b/relay/cache/cache_accessor.go @@ -108,7 +108,7 @@ func (c *cacheAccessor[K, V]) Get(ctx context.Context, key K) (V, error) { c.cacheLock.Unlock() if c.metrics != nil { - c.metrics.cacheHits.Increment() + c.metrics.ReportCacheHit() } return v, nil } @@ -123,7 +123,7 @@ func (c *cacheAccessor[K, V]) Get(ctx context.Context, key K) (V, error) { c.cacheLock.Unlock() if c.metrics != nil { - c.metrics.cacheMisses.Increment() + c.metrics.ReportCacheMiss() } if alreadyLoading { @@ -170,7 +170,7 @@ func (c *cacheAccessor[K, V]) fetchResult(ctx context.Context, key K, result *ac if c.metrics != nil { start := time.Now() defer func() { - c.metrics.cacheMissLatency.ReportLatency(time.Since(start)) + c.metrics.ReportCacheMissLatency(time.Since(start)) }() } @@ -183,13 +183,13 @@ func (c *cacheAccessor[K, V]) fetchResult(ctx context.Context, key K, result *ac if c.metrics != nil { size := c.cache.Size() weight := c.cache.Weight() - c.metrics.size.Set(float64(size)) - c.metrics.weight.Set(float64(weight)) + c.metrics.ReportSize(size) + c.metrics.ReportWeight(weight) var averageWeight float64 if size > 0 { averageWeight = float64(weight) / float64(size) } - c.metrics.averageWeight.Set(averageWeight) + c.metrics.ReportAverageWeight(averageWeight) } } diff --git a/relay/cache/cache_accessor_metrics.go b/relay/cache/cache_accessor_metrics.go index b28c54261d..e5ef456218 100644 --- a/relay/cache/cache_accessor_metrics.go +++ b/relay/cache/cache_accessor_metrics.go @@ -2,77 +2,82 @@ package cache import ( "fmt" - "github.com/Layr-Labs/eigenda/common/metrics" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "time" ) +const namespace = "eigenda_relay" + // CacheAccessorMetrics provides metrics for a CacheAccessor. type CacheAccessorMetrics struct { - cacheHits metrics.CountMetric - cacheMisses metrics.CountMetric - size metrics.GaugeMetric - weight metrics.GaugeMetric - averageWeight metrics.GaugeMetric - cacheMissLatency metrics.LatencyMetric + cacheHits *prometheus.CounterVec + cacheMisses *prometheus.CounterVec + size *prometheus.GaugeVec + weight *prometheus.GaugeVec + averageWeight *prometheus.GaugeVec + cacheMissLatency *prometheus.SummaryVec } // NewCacheAccessorMetrics creates a new CacheAccessorMetrics. func NewCacheAccessorMetrics( - server metrics.Metrics, - cacheName string) (*CacheAccessorMetrics, error) { - - cacheHits, err := server.NewCountMetric( - fmt.Sprintf("%s_cache_hit", cacheName), - fmt.Sprintf("Number of cache hits in the %s cache", cacheName), - nil) - if err != nil { - return nil, err - } + registry *prometheus.Registry, + cacheName string) *CacheAccessorMetrics { - cacheMisses, err := server.NewCountMetric( - fmt.Sprintf("%s_cache_miss", cacheName), - fmt.Sprintf("Number of cache misses in the %s cache", cacheName), - nil) - if err != nil { - return nil, err - } + cacheHits := promauto.With(registry).NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_cache_hit_count", cacheName), + Help: "Number of cache hits", + }, + []string{}, + ) - size, err := server.NewGaugeMetric( - fmt.Sprintf("%s_cache", cacheName), - "size", - fmt.Sprintf("Number of items in the %s cache", cacheName), - nil) - if err != nil { - return nil, err - } + cacheMisses := promauto.With(registry).NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_cache_miss_count", cacheName), + Help: "Number of cache misses", + }, + []string{}, + ) - weight, err := server.NewGaugeMetric( - fmt.Sprintf("%s_cache", cacheName), - "weight", - fmt.Sprintf("Total weight of items in the %s cache", cacheName), - nil) - if err != nil { - return nil, err - } + size := promauto.With(registry).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_cache_size", cacheName), + Help: "Number of items in the cache", + }, + []string{}, + ) - averageWeight, err := server.NewGaugeMetric( - fmt.Sprintf("%s_cache_item", cacheName), - "weight", - fmt.Sprintf("Weight of each item currently in the %s cache", cacheName), - nil) - if err != nil { - return nil, err - } + weight := promauto.With(registry).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_cache_weight", cacheName), + Help: "Total weight of items in the cache", + }, + []string{}, + ) - cacheMissLatency, err := server.NewLatencyMetric( - fmt.Sprintf("%s_cache_miss_latency", cacheName), - fmt.Sprintf("Latency of cache misses in the %s cache", cacheName), - nil, - &metrics.Quantile{Quantile: 0.5, Error: 0.05}, - &metrics.Quantile{Quantile: 0.9, Error: 0.05}, - &metrics.Quantile{Quantile: 0.99, Error: 0.05}) - if err != nil { - return nil, err - } + averageWeight := promauto.With(registry).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_cache_average_weight", cacheName), + Help: "Weight of each item currently in the cache", + }, + []string{}, + ) + + cacheMissLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: fmt.Sprintf("%s_cache_miss_latency_ms", cacheName), + Help: "Latency of cache misses", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.05, 0.99: 0.01}, + }, + []string{}, + ) return &CacheAccessorMetrics{ cacheHits: cacheHits, @@ -81,5 +86,29 @@ func NewCacheAccessorMetrics( weight: weight, averageWeight: averageWeight, cacheMissLatency: cacheMissLatency, - }, nil + } +} + +func (m *CacheAccessorMetrics) ReportCacheHit() { + m.cacheHits.WithLabelValues().Inc() +} + +func (m *CacheAccessorMetrics) ReportCacheMiss() { + m.cacheMisses.WithLabelValues().Inc() +} + +func (m *CacheAccessorMetrics) ReportSize(size int) { + m.size.WithLabelValues().Set(float64(size)) +} + +func (m *CacheAccessorMetrics) ReportWeight(weight uint64) { + m.weight.WithLabelValues().Set(float64(weight)) +} + +func (m *CacheAccessorMetrics) ReportAverageWeight(averageWeight float64) { + m.averageWeight.WithLabelValues().Set(averageWeight) +} + +func (m *CacheAccessorMetrics) ReportCacheMissLatency(duration time.Duration) { + m.cacheMissLatency.WithLabelValues().Observe(float64(duration.Nanoseconds()) / float64(time.Millisecond)) } diff --git a/relay/limiter/blob_rate_limiter.go b/relay/limiter/blob_rate_limiter.go index 8766d9d870..07351834b0 100644 --- a/relay/limiter/blob_rate_limiter.go +++ b/relay/limiter/blob_rate_limiter.go @@ -63,14 +63,14 @@ func (l *BlobRateLimiter) BeginGetBlobOperation(now time.Time) error { if l.operationsInFlight >= l.config.MaxConcurrentGetBlobOps { if l.relayMetrics != nil { - l.relayMetrics.GetBlobRateLimited.Increment(metrics.RateLimitLabel{Reason: "global concurrency"}) + l.relayMetrics.ReportBlobRateLimited("global concurrency") } return fmt.Errorf("global concurrent request limit %d exceeded for getBlob operations, try again later", l.config.MaxConcurrentGetBlobOps) } if l.opLimiter.TokensAt(now) < 1 { if l.relayMetrics != nil { - l.relayMetrics.GetBlobRateLimited.Increment(metrics.RateLimitLabel{Reason: "global rate"}) + l.relayMetrics.ReportBlobRateLimited("global rate") } return fmt.Errorf("global rate limit %0.1fhz exceeded for getBlob operations, try again later", l.config.MaxGetBlobOpsPerSecond) @@ -110,7 +110,7 @@ func (l *BlobRateLimiter) RequestGetBlobBandwidth(now time.Time, bytes uint32) e allowed := l.bandwidthLimiter.AllowN(now, int(bytes)) if !allowed { if l.relayMetrics != nil { - l.relayMetrics.GetBlobRateLimited.Increment(metrics.RateLimitLabel{Reason: "global bandwidth"}) + l.relayMetrics.ReportBlobRateLimited("global bandwidth") } return fmt.Errorf("global rate limit %dMib/s exceeded for getBlob bandwidth, try again later", int(l.config.MaxGetBlobBytesPerSecond/1024/1024)) diff --git a/relay/limiter/chunk_rate_limiter.go b/relay/limiter/chunk_rate_limiter.go index d5de3e650b..9c8644e035 100644 --- a/relay/limiter/chunk_rate_limiter.go +++ b/relay/limiter/chunk_rate_limiter.go @@ -98,7 +98,7 @@ func (l *ChunkRateLimiter) BeginGetChunkOperation( if l.globalOperationsInFlight >= l.config.MaxConcurrentGetChunkOps { if l.relayMetrics != nil { - l.relayMetrics.GetChunksRateLimited.Increment(metrics.RateLimitLabel{Reason: "global concurrency"}) + l.relayMetrics.ReportChunkRateLimited("global concurrency") } return fmt.Errorf( "global concurrent request limit %d exceeded for GetChunks operations, try again later", @@ -106,21 +106,21 @@ func (l *ChunkRateLimiter) BeginGetChunkOperation( } if l.globalOpLimiter.TokensAt(now) < 1 { if l.relayMetrics != nil { - l.relayMetrics.GetChunksRateLimited.Increment(metrics.RateLimitLabel{Reason: "global rate"}) + l.relayMetrics.ReportChunkRateLimited("global rate") } return fmt.Errorf("global rate limit %0.1fhz exceeded for GetChunks operations, try again later", l.config.MaxGetChunkOpsPerSecond) } if l.perClientOperationsInFlight[requesterID] >= l.config.MaxConcurrentGetChunkOpsClient { if l.relayMetrics != nil { - l.relayMetrics.GetChunksRateLimited.Increment(metrics.RateLimitLabel{Reason: "client concurrency"}) + l.relayMetrics.ReportChunkRateLimited("client concurrency") } return fmt.Errorf("client concurrent request limit %d exceeded for GetChunks", l.config.MaxConcurrentGetChunkOpsClient) } if l.perClientOpLimiter[requesterID].TokensAt(now) < 1 { if l.relayMetrics != nil { - l.relayMetrics.GetChunksRateLimited.Increment(metrics.RateLimitLabel{Reason: "client rate"}) + l.relayMetrics.ReportChunkRateLimited("client rate") } return fmt.Errorf("client rate limit %0.1fhz exceeded for GetChunks, try again later", l.config.MaxGetChunkOpsPerSecondClient) @@ -159,7 +159,7 @@ func (l *ChunkRateLimiter) RequestGetChunkBandwidth(now time.Time, requesterID s allowed := l.globalBandwidthLimiter.AllowN(now, bytes) if !allowed { if l.relayMetrics != nil { - l.relayMetrics.GetChunksRateLimited.Increment(metrics.RateLimitLabel{Reason: "global bandwidth"}) + l.relayMetrics.ReportChunkRateLimited("global bandwidth") } return fmt.Errorf("global rate limit %dMiB exceeded for GetChunk bandwidth, try again later", int(l.config.MaxGetChunkBytesPerSecond/1024/1024)) @@ -173,7 +173,7 @@ func (l *ChunkRateLimiter) RequestGetChunkBandwidth(now time.Time, requesterID s if !allowed { l.globalBandwidthLimiter.AllowN(now, -bytes) if l.relayMetrics != nil { - l.relayMetrics.GetChunksRateLimited.Increment(metrics.RateLimitLabel{Reason: "client bandwidth"}) + l.relayMetrics.ReportChunkRateLimited("client bandwidth") } return fmt.Errorf("client rate limit %dMiB exceeded for GetChunk bandwidth, try again later", int(l.config.MaxGetChunkBytesPerSecondClient/1024/1024)) diff --git a/relay/mdoc/main.go b/relay/mdoc/main.go deleted file mode 100644 index 56c68999fd..0000000000 --- a/relay/mdoc/main.go +++ /dev/null @@ -1,24 +0,0 @@ -package main - -import ( - "github.com/Layr-Labs/eigenda/common" - "github.com/Layr-Labs/eigenda/relay/metrics" -) - -// main generates documentation for relay metrics. -func main() { - logger, err := common.NewLogger(common.DefaultLoggerConfig()) - if err != nil { - panic(err) - } - - metrics, err := metrics.NewRelayMetrics(logger, 0) - if err != nil { - panic(err) - } - - err = metrics.WriteMetricsDocumentation() - if err != nil { - panic(err) - } -} diff --git a/relay/mdoc/relay-metrics.md b/relay/mdoc/relay-metrics.md deleted file mode 100644 index 959c3a5493..0000000000 --- a/relay/mdoc/relay-metrics.md +++ /dev/null @@ -1,426 +0,0 @@ -# Metrics Documentation for namespace 'relay' - -This documentation was automatically generated at time `2024-12-03T10:26:19-06:00` - -There are a total of `34` registered metrics. - ---- - -## blob_cache_size - -Number of items in the blob cache - -| | | -|---|---| -| **Name** | `blob_cache` | -| **Unit** | `size` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_blob_cache_size` | ---- - -## blob_cache_weight - -Total weight of items in the blob cache - -| | | -|---|---| -| **Name** | `blob_cache` | -| **Unit** | `weight` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_blob_cache_weight` | ---- - -## blob_cache_average_weight - -Average weight of items currently in the blob cache - -| | | -|---|---| -| **Name** | `blob_cache_average` | -| **Unit** | `weight` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_blob_cache_average_weight` | ---- - -## blob_cache_hit_count - -Number of cache hits in the blob cache - -| | | -|---|---| -| **Name** | `blob_cache_hit` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_blob_cache_hit_count` | ---- - -## blob_cache_lifespan_ms - -Time an item remains in the blob cache before being evicted. - -| | | -|---|---| -| **Name** | `blob_cache_lifespan` | -| **Unit** | `ms` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_blob_cache_lifespan_ms` | ---- - -## blob_cache_miss_count - -Number of cache misses in the blob cache - -| | | -|---|---| -| **Name** | `blob_cache_miss` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_blob_cache_miss_count` | ---- - -## blob_cache_miss_latency_ms - -Latency of cache misses in the blob cache - -| | | -|---|---| -| **Name** | `blob_cache_miss_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_blob_cache_miss_latency_ms` | ---- - -## chunk_cache_size - -Number of items in the chunk cache - -| | | -|---|---| -| **Name** | `chunk_cache` | -| **Unit** | `size` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_chunk_cache_size` | ---- - -## chunk_cache_weight - -Total weight of items in the chunk cache - -| | | -|---|---| -| **Name** | `chunk_cache` | -| **Unit** | `weight` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_chunk_cache_weight` | ---- - -## chunk_cache_average_weight - -Average weight of items currently in the chunk cache - -| | | -|---|---| -| **Name** | `chunk_cache_average` | -| **Unit** | `weight` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_chunk_cache_average_weight` | ---- - -## chunk_cache_hit_count - -Number of cache hits in the chunk cache - -| | | -|---|---| -| **Name** | `chunk_cache_hit` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_chunk_cache_hit_count` | ---- - -## chunk_cache_lifespan_ms - -Time an item remains in the chunk cache before being evicted. - -| | | -|---|---| -| **Name** | `chunk_cache_lifespan` | -| **Unit** | `ms` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_chunk_cache_lifespan_ms` | ---- - -## chunk_cache_miss_count - -Number of cache misses in the chunk cache - -| | | -|---|---| -| **Name** | `chunk_cache_miss` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_chunk_cache_miss_count` | ---- - -## chunk_cache_miss_latency_ms - -Latency of cache misses in the chunk cache - -| | | -|---|---| -| **Name** | `chunk_cache_miss_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_chunk_cache_miss_latency_ms` | ---- - -## get_blob_data_latency_ms - -Latency of the GetBlob RPC data retrieval - -| | | -|---|---| -| **Name** | `get_blob_data_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_get_blob_data_latency_ms` | ---- - -## get_blob_data_size_bytes - -Data size of requested blobs. - -| | | -|---|---| -| **Name** | `get_blob_data_size` | -| **Unit** | `bytes` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_get_blob_data_size_bytes` | ---- - -## get_blob_latency_ms - -Latency of the GetBlob RPC - -| | | -|---|---| -| **Name** | `get_blob_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_get_blob_latency_ms` | ---- - -## get_blob_metadata_latency_ms - -Latency of the GetBlob RPC metadata retrieval - -| | | -|---|---| -| **Name** | `get_blob_metadata_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_get_blob_metadata_latency_ms` | ---- - -## get_blob_rate_limited_count - -Number of GetBlob RPC rate limited - -| | | -|---|---| -| **Name** | `get_blob_rate_limited` | -| **Unit** | `count` | -| **Labels** | `reason` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_get_blob_rate_limited_count` | ---- - -## get_chunks_auth_failure_count - -Number of GetChunks RPC authentication failures - -| | | -|---|---| -| **Name** | `get_chunks_auth_failure` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_get_chunks_auth_failure_count` | ---- - -## get_chunks_authentication_latency_ms - -Latency of the GetChunks RPC client authentication - -| | | -|---|---| -| **Name** | `get_chunks_authentication_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_get_chunks_authentication_latency_ms` | ---- - -## get_chunks_data_latency_ms - -Latency of the GetChunks RPC data retrieval - -| | | -|---|---| -| **Name** | `get_chunks_data_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_get_chunks_data_latency_ms` | ---- - -## get_chunks_data_size_bytes - -Data size in a GetChunks request. - -| | | -|---|---| -| **Name** | `get_chunks_data_size` | -| **Unit** | `bytes` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_get_chunks_data_size_bytes` | ---- - -## get_chunks_key_count - -Number of keys in a GetChunks request. - -| | | -|---|---| -| **Name** | `get_chunks_key` | -| **Unit** | `count` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_get_chunks_key_count` | ---- - -## get_chunks_latency_ms - -Latency of the GetChunks RPC - -| | | -|---|---| -| **Name** | `get_chunks_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_get_chunks_latency_ms` | ---- - -## get_chunks_metadata_latency_ms - -Latency of the GetChunks RPC metadata retrieval - -| | | -|---|---| -| **Name** | `get_chunks_metadata_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_get_chunks_metadata_latency_ms` | ---- - -## get_chunks_rate_limited_count - -Number of GetChunks RPC rate limited - -| | | -|---|---| -| **Name** | `get_chunks_rate_limited` | -| **Unit** | `count` | -| **Labels** | `reason` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_get_chunks_rate_limited_count` | ---- - -## metadata_cache_size - -Number of items in the metadata cache - -| | | -|---|---| -| **Name** | `metadata_cache` | -| **Unit** | `size` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_metadata_cache_size` | ---- - -## metadata_cache_weight - -Total weight of items in the metadata cache - -| | | -|---|---| -| **Name** | `metadata_cache` | -| **Unit** | `weight` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_metadata_cache_weight` | ---- - -## metadata_cache_average_weight - -Average weight of items currently in the metadata cache - -| | | -|---|---| -| **Name** | `metadata_cache_average` | -| **Unit** | `weight` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_metadata_cache_average_weight` | ---- - -## metadata_cache_hit_count - -Number of cache hits in the metadata cache - -| | | -|---|---| -| **Name** | `metadata_cache_hit` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_metadata_cache_hit_count` | ---- - -## metadata_cache_lifespan_ms - -Time an item remains in the metadata cache before being evicted. - -| | | -|---|---| -| **Name** | `metadata_cache_lifespan` | -| **Unit** | `ms` | -| **Type** | `gauge` | -| **Fully Qualified Name** | `relay_metadata_cache_lifespan_ms` | ---- - -## metadata_cache_miss_count - -Number of cache misses in the metadata cache - -| | | -|---|---| -| **Name** | `metadata_cache_miss` | -| **Unit** | `count` | -| **Type** | `counter` | -| **Fully Qualified Name** | `relay_metadata_cache_miss_count` | ---- - -## metadata_cache_miss_latency_ms - -Latency of cache misses in the metadata cache - -| | | -|---|---| -| **Name** | `metadata_cache_miss_latency` | -| **Unit** | `ms` | -| **Type** | `latency` | -| **Quantiles** | `0.500`, `0.900`, `0.990` | -| **Fully Qualified Name** | `relay_metadata_cache_miss_latency_ms` | diff --git a/relay/metrics/metrics.go b/relay/metrics/metrics.go index 93ba8da1e1..faa19d1f75 100644 --- a/relay/metrics/metrics.go +++ b/relay/metrics/metrics.go @@ -1,16 +1,26 @@ package metrics import ( - "github.com/Layr-Labs/eigenda/common/metrics" + "fmt" "github.com/Layr-Labs/eigenda/relay/cache" "github.com/Layr-Labs/eigensdk-go/logging" grpcprom "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/promhttp" "google.golang.org/grpc" + "net/http" + "strings" + "time" ) +const namespace = "eigenda_relay" + type RelayMetrics struct { - metricsServer metrics.Metrics + logger logging.Logger grpcServerOption grpc.ServerOption + server *http.Server // Cache metrics MetadataCacheMetrics *cache.CacheAccessorMetrics @@ -18,203 +28,214 @@ type RelayMetrics struct { BlobCacheMetrics *cache.CacheAccessorMetrics // GetChunks metrics - GetChunksLatency metrics.LatencyMetric - GetChunksAuthenticationLatency metrics.LatencyMetric - GetChunksMetadataLatency metrics.LatencyMetric - GetChunksDataLatency metrics.LatencyMetric - GetChunksAuthFailures metrics.CountMetric - GetChunksRateLimited metrics.CountMetric - GetChunksKeyCount metrics.GaugeMetric - GetChunksDataSize metrics.GaugeMetric + getChunksLatency *prometheus.SummaryVec + getChunksAuthenticationLatency *prometheus.SummaryVec + getChunksMetadataLatency *prometheus.SummaryVec + getChunksDataLatency *prometheus.SummaryVec + getChunksAuthFailures *prometheus.CounterVec + getChunksRateLimited *prometheus.CounterVec + getChunksKeyCount *prometheus.GaugeVec + getChunksDataSize *prometheus.GaugeVec // GetBlob metrics - GetBlobLatency metrics.LatencyMetric - GetBlobMetadataLatency metrics.LatencyMetric - GetBlobDataLatency metrics.LatencyMetric - GetBlobRateLimited metrics.CountMetric - GetBlobDataSize metrics.GaugeMetric -} - -type RateLimitLabel struct { - Reason string + getBlobLatency *prometheus.SummaryVec + getBlobMetadataLatency *prometheus.SummaryVec + getBlobDataLatency *prometheus.SummaryVec + getBlobRateLimited *prometheus.CounterVec + getBlobDataSize *prometheus.GaugeVec } // NewRelayMetrics creates a new RelayMetrics instance, which encapsulates all metrics related to the relay. -func NewRelayMetrics(logger logging.Logger, port int) (*RelayMetrics, error) { +func NewRelayMetrics(logger logging.Logger, port int) *RelayMetrics { + + registry := prometheus.NewRegistry() + registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) + registry.MustRegister(collectors.NewGoCollector()) - server := metrics.NewMetrics(logger, "relay", port) + logger.Infof("Starting metrics server at port %d", port) + addr := fmt.Sprintf(":%d", port) + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor( + registry, + promhttp.HandlerOpts{}, + )) + server := &http.Server{ + Addr: addr, + Handler: mux, + } grpcMetrics := grpcprom.NewServerMetrics() - server.RegisterExternalMetrics(grpcMetrics) + registry.MustRegister(grpcMetrics) grpcServerOption := grpc.UnaryInterceptor( grpcMetrics.UnaryServerInterceptor(), ) - standardQuantiles := []*metrics.Quantile{ - metrics.NewQuantile(0.5), - metrics.NewQuantile(0.9), - metrics.NewQuantile(0.99), - } - - metadataCacheMetrics, err := cache.NewCacheAccessorMetrics(server, "metadata") - if err != nil { - return nil, err - } - - chunkCacheMetrics, err := cache.NewCacheAccessorMetrics(server, "chunk") - if err != nil { - return nil, err - } + metadataCacheMetrics := cache.NewCacheAccessorMetrics(registry, "metadata") + chunkCacheMetrics := cache.NewCacheAccessorMetrics(registry, "chunk") + blobCacheMetrics := cache.NewCacheAccessorMetrics(registry, "blob") - blobCacheMetrics, err := cache.NewCacheAccessorMetrics(server, "blob") - if err != nil { - return nil, err - } + objectives := map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001} - getChunksLatencyMetric, err := server.NewLatencyMetric( - "get_chunks_latency", - "Latency of the GetChunks RPC", - nil, - standardQuantiles...) - if err != nil { - return nil, err - } + getChunksLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "get_chunks_latency_ms", + Help: "Latency of the GetChunks RPC", + Objectives: objectives, + }, + []string{}, + ) - getChunksAuthenticationLatencyMetric, err := server.NewLatencyMetric( - "get_chunks_authentication_latency", - "Latency of the GetChunks RPC client authentication", - nil, - standardQuantiles...) - if err != nil { - return nil, err - } + getChunksAuthenticationLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "get_chunks_authentication_latency_ms", + Help: "Latency of the GetChunks RPC client authentication", + Objectives: objectives, + }, + []string{}, + ) - getChunksMetadataLatencyMetric, err := server.NewLatencyMetric( - "get_chunks_metadata_latency", - "Latency of the GetChunks RPC metadata retrieval", - nil, - standardQuantiles...) - if err != nil { - return nil, err - } + getChunksMetadataLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "get_chunks_metadata_latency_ms", + Help: "Latency of the GetChunks RPC metadata retrieval", + Objectives: objectives, + }, + []string{}, + ) - getChunksDataLatencyMetric, err := server.NewLatencyMetric( - "get_chunks_data_latency", - "Latency of the GetChunks RPC data retrieval", - nil, - standardQuantiles...) - if err != nil { - return nil, err - } + getChunksDataLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "get_chunks_data_latency_ms", + Help: "Latency of the GetChunks RPC data retrieval", + Objectives: objectives, + }, + []string{}, + ) - getChunksAuthFailures, err := server.NewCountMetric( - "get_chunks_auth_failure", - "Number of GetChunks RPC authentication failures", - nil) - if err != nil { - return nil, err - } + getChunksAuthFailures := promauto.With(registry).NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "get_chunks_auth_failure_count", + Help: "Number of GetChunks RPC authentication failures", + }, + []string{}, + ) - getChunksRateLimited, err := server.NewCountMetric( - "get_chunks_rate_limited", - "Number of GetChunks RPC rate limited", - RateLimitLabel{}) - if err != nil { - return nil, err - } + getChunksRateLimited := promauto.With(registry).NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "get_chunks_rate_limited_count", + Help: "Number of GetChunks RPC rate limited", + }, + []string{"reason"}, + ) - getChunksKeyCount, err := server.NewGaugeMetric( - "get_chunks_key", - "count", - "Number of keys in a GetChunks request.", - nil) - if err != nil { - return nil, err - } + getChunksKeyCount := promauto.With(registry).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "get_chunks_key_count", + Help: "Number of keys in a GetChunks request.", + }, + []string{}, + ) - getChunksDataSize, err := server.NewGaugeMetric( - "get_chunks_data_size", - "bytes", - "Data size in a GetChunks request.", - nil) - if err != nil { - return nil, err - } + getChunksDataSize := promauto.With(registry).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "get_chunks_data_size_bytes", + Help: "Data size in a GetChunks request.", + }, + []string{}, + ) - getBlobLatencyMetric, err := server.NewLatencyMetric( - "get_blob_latency", - "Latency of the GetBlob RPC", - nil, - standardQuantiles...) - if err != nil { - return nil, err - } + getBlobLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "get_blob_latency_ms", + Help: "Latency of the GetBlob RPC", + Objectives: objectives, + }, + []string{}, + ) - getBlobMetadataLatencyMetric, err := server.NewLatencyMetric( - "get_blob_metadata_latency", - "Latency of the GetBlob RPC metadata retrieval", - nil, - standardQuantiles...) - if err != nil { - return nil, err - } + getBlobMetadataLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "get_blob_metadata_latency_ms", + Help: "Latency of the GetBlob RPC metadata retrieval", + Objectives: objectives, + }, + []string{}, + ) - getBlobDataLatencyMetric, err := server.NewLatencyMetric( - "get_blob_data_latency", - "Latency of the GetBlob RPC data retrieval", - nil, - standardQuantiles...) - if err != nil { - return nil, err - } + getBlobDataLatency := promauto.With(registry).NewSummaryVec( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "get_blob_data_latency_ms", + Help: "Latency of the GetBlob RPC data retrieval", + Objectives: objectives, + }, + []string{}, + ) - getBlobRateLimited, err := server.NewCountMetric( - "get_blob_rate_limited", - "Number of GetBlob RPC rate limited", - RateLimitLabel{}) - if err != nil { - return nil, err - } + getBlobRateLimited := promauto.With(registry).NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "get_blob_rate_limited_count", + Help: "Number of GetBlob RPC rate limited", + }, + []string{"reason"}, + ) - getBlobDataSize, err := server.NewGaugeMetric( - "get_blob_data_size", - "bytes", - "Data size of requested blobs.", - nil) - if err != nil { - return nil, err - } + getBlobDataSize := promauto.With(registry).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "get_blob_data_size_bytes", + Help: "Data size of requested blobs.", + }, + []string{}, + ) return &RelayMetrics{ - metricsServer: server, + logger: logger, + grpcServerOption: grpcServerOption, + server: server, MetadataCacheMetrics: metadataCacheMetrics, ChunkCacheMetrics: chunkCacheMetrics, BlobCacheMetrics: blobCacheMetrics, - grpcServerOption: grpcServerOption, - GetChunksLatency: getChunksLatencyMetric, - GetChunksAuthenticationLatency: getChunksAuthenticationLatencyMetric, - GetChunksMetadataLatency: getChunksMetadataLatencyMetric, - GetChunksDataLatency: getChunksDataLatencyMetric, - GetChunksAuthFailures: getChunksAuthFailures, - GetChunksRateLimited: getChunksRateLimited, - GetChunksKeyCount: getChunksKeyCount, - GetChunksDataSize: getChunksDataSize, - GetBlobLatency: getBlobLatencyMetric, - GetBlobMetadataLatency: getBlobMetadataLatencyMetric, - GetBlobDataLatency: getBlobDataLatencyMetric, - GetBlobRateLimited: getBlobRateLimited, - GetBlobDataSize: getBlobDataSize, - }, nil + getChunksLatency: getChunksLatency, + getChunksAuthenticationLatency: getChunksAuthenticationLatency, + getChunksMetadataLatency: getChunksMetadataLatency, + getChunksDataLatency: getChunksDataLatency, + getChunksAuthFailures: getChunksAuthFailures, + getChunksRateLimited: getChunksRateLimited, + getChunksKeyCount: getChunksKeyCount, + getChunksDataSize: getChunksDataSize, + getBlobLatency: getBlobLatency, + getBlobMetadataLatency: getBlobMetadataLatency, + getBlobDataLatency: getBlobDataLatency, + getBlobRateLimited: getBlobRateLimited, + getBlobDataSize: getBlobDataSize, + } } // Start starts the metrics server. -func (m *RelayMetrics) Start() error { - return m.metricsServer.Start() +func (m *RelayMetrics) Start() { + go func() { + err := m.server.ListenAndServe() + if err != nil && !strings.Contains(err.Error(), "http: Server closed") { + m.logger.Errorf("metrics server error: %v", err) + } + }() } // Stop stops the metrics server. func (m *RelayMetrics) Stop() error { - return m.metricsServer.Stop() + return m.server.Close() } // GetGRPCServerOption returns the gRPC server option that enables automatic GRPC metrics collection. @@ -222,7 +243,55 @@ func (m *RelayMetrics) GetGRPCServerOption() grpc.ServerOption { return m.grpcServerOption } -// WriteMetricsDocumentation writes the metrics for the churner to a markdown file. -func (m *RelayMetrics) WriteMetricsDocumentation() error { - return m.metricsServer.WriteMetricsDocumentation("relay/mdoc/relay-metrics.md") +func (m *RelayMetrics) ReportChunkLatency(duration time.Duration) { + m.getChunksLatency.WithLabelValues().Observe(float64(duration.Nanoseconds()) / float64(time.Millisecond)) +} + +func (m *RelayMetrics) ReportChunkAuthenticationLatency(duration time.Duration) { + m.getChunksAuthenticationLatency.WithLabelValues().Observe( + float64(duration.Nanoseconds()) / float64(time.Millisecond)) +} + +func (m *RelayMetrics) ReportChunkMetadataLatency(duration time.Duration) { + m.getChunksMetadataLatency.WithLabelValues().Observe(float64(duration.Nanoseconds()) / float64(time.Millisecond)) +} + +func (m *RelayMetrics) ReportChunkDataLatency(duration time.Duration) { + m.getChunksDataLatency.WithLabelValues().Observe(float64(duration.Nanoseconds()) / float64(time.Millisecond)) +} + +func (m *RelayMetrics) ReportChunkAuthFailure() { + m.getChunksAuthFailures.WithLabelValues().Inc() +} + +func (m *RelayMetrics) ReportChunkRateLimited(reason string) { + m.getChunksRateLimited.WithLabelValues(reason).Inc() +} + +func (m *RelayMetrics) ReportChunkKeyCount(count int) { + m.getChunksKeyCount.WithLabelValues().Set(float64(count)) +} + +func (m *RelayMetrics) ReportChunkDataSize(size int) { + m.getChunksDataSize.WithLabelValues().Set(float64(size)) +} + +func (m *RelayMetrics) ReportBlobLatency(duration time.Duration) { + m.getBlobLatency.WithLabelValues().Observe(float64(duration.Nanoseconds()) / float64(time.Millisecond)) +} + +func (m *RelayMetrics) ReportBlobMetadataLatency(duration time.Duration) { + m.getBlobMetadataLatency.WithLabelValues().Observe(float64(duration.Nanoseconds()) / float64(time.Millisecond)) +} + +func (m *RelayMetrics) ReportBlobDataLatency(duration time.Duration) { + m.getBlobDataLatency.WithLabelValues().Observe(float64(duration.Nanoseconds()) / float64(time.Millisecond)) +} + +func (m *RelayMetrics) ReportBlobRateLimited(reason string) { + m.getBlobRateLimited.WithLabelValues(reason).Inc() +} + +func (m *RelayMetrics) ReportBlobDataSize(size int) { + m.getBlobDataSize.WithLabelValues().Set(float64(size)) } diff --git a/relay/server.go b/relay/server.go index 4a300762ed..117127de15 100644 --- a/relay/server.go +++ b/relay/server.go @@ -143,10 +143,7 @@ func NewServer( return nil, fmt.Errorf("error fetching blob params: %w", err) } - relayMetrics, err := metrics.NewRelayMetrics(logger, config.MetricsPort) - if err != nil { - return nil, fmt.Errorf("error creating relayMetrics: %w", err) - } + relayMetrics := metrics.NewRelayMetrics(logger, config.MetricsPort) mp, err := newMetadataProvider( ctx, @@ -246,7 +243,7 @@ func (s *Server) GetBlob(ctx context.Context, request *pb.GetBlobRequest) (*pb.G } finishedFetchingMetadata := time.Now() - s.metrics.GetBlobMetadataLatency.ReportLatency(finishedFetchingMetadata.Sub(start)) + s.metrics.ReportBlobMetadataLatency(finishedFetchingMetadata.Sub(start)) err = s.blobRateLimiter.RequestGetBlobBandwidth(time.Now(), metadata.blobSizeBytes) if err != nil { @@ -258,9 +255,9 @@ func (s *Server) GetBlob(ctx context.Context, request *pb.GetBlobRequest) (*pb.G return nil, fmt.Errorf("error fetching blob %s: %w", key.Hex(), err) } - s.metrics.GetBlobDataSize.Set(float64(len(data))) - s.metrics.GetBlobDataLatency.ReportLatency(time.Since(finishedFetchingMetadata)) - s.metrics.GetBlobLatency.ReportLatency(time.Since(start)) + s.metrics.ReportBlobDataSize(len(data)) + s.metrics.ReportBlobDataLatency(time.Since(finishedFetchingMetadata)) + s.metrics.ReportBlobLatency(time.Since(start)) reply := &pb.GetBlobReply{ Blob: data, @@ -285,7 +282,7 @@ func (s *Server) GetChunks(ctx context.Context, request *pb.GetChunksRequest) (* return nil, fmt.Errorf( "too many chunk requests provided, max is %d", s.config.MaxKeysPerGetChunksRequest) } - s.metrics.GetChunksKeyCount.Set(float64(len(request.ChunkRequests))) + s.metrics.ReportChunkKeyCount(len(request.ChunkRequests)) if s.authenticator != nil { client, ok := peer.FromContext(ctx) @@ -296,14 +293,14 @@ func (s *Server) GetChunks(ctx context.Context, request *pb.GetChunksRequest) (* err := s.authenticator.AuthenticateGetChunksRequest(ctx, clientAddress, request, time.Now()) if err != nil { - s.metrics.GetChunksAuthFailures.Increment() + s.metrics.ReportChunkAuthFailure() return nil, fmt.Errorf("auth failed: %w", err) } } finishedAuthenticating := time.Now() if s.authenticator != nil { - s.metrics.GetChunksAuthenticationLatency.ReportLatency(finishedAuthenticating.Sub(start)) + s.metrics.ReportChunkAuthenticationLatency(finishedAuthenticating.Sub(start)) } clientID := string(request.OperatorId) @@ -326,7 +323,7 @@ func (s *Server) GetChunks(ctx context.Context, request *pb.GetChunksRequest) (* } finishedFetchingMetadata := time.Now() - s.metrics.GetChunksMetadataLatency.ReportLatency(finishedFetchingMetadata.Sub(finishedAuthenticating)) + s.metrics.ReportChunkMetadataLatency(finishedFetchingMetadata.Sub(finishedAuthenticating)) requiredBandwidth, err := computeChunkRequestRequiredBandwidth(request, mMap) if err != nil { @@ -336,7 +333,7 @@ func (s *Server) GetChunks(ctx context.Context, request *pb.GetChunksRequest) (* if err != nil { return nil, err } - s.metrics.GetChunksDataSize.Set(float64(requiredBandwidth)) + s.metrics.ReportChunkDataSize(requiredBandwidth) frames, err := s.chunkProvider.GetFrames(ctx, mMap) if err != nil { @@ -348,8 +345,8 @@ func (s *Server) GetChunks(ctx context.Context, request *pb.GetChunksRequest) (* return nil, fmt.Errorf("error gathering chunk data: %w", err) } - s.metrics.GetChunksDataLatency.ReportLatency(time.Since(finishedFetchingMetadata)) - s.metrics.GetChunksLatency.ReportLatency(time.Since(start)) + s.metrics.ReportChunkDataLatency(time.Since(finishedFetchingMetadata)) + s.metrics.ReportChunkLatency(time.Since(start)) return &pb.GetChunksReply{ Data: bytesToSend, @@ -470,10 +467,7 @@ func computeChunkRequestRequiredBandwidth(request *pb.GetChunksRequest, mMap met // Start starts the server listening for requests. This method will block until the server is stopped. func (s *Server) Start(ctx context.Context) error { - err := s.metrics.Start() - if err != nil { - return fmt.Errorf("error starting metrics server: %w", err) - } + s.metrics.Start() if s.chainReader != nil && s.metadataProvider != nil { go func() {