refactor: Optimizer Refactor Part 2: Add prometheus metrics for refactor optimizer #1802

Open · wants to merge 5 commits into base: CNS-1008-score-store-refactor
7 changes: 4 additions & 3 deletions protocol/common/cobra_common.go
@@ -40,9 +40,10 @@ const (
SetProviderOptimizerNumberOfTiersToCreate = "set-provider-optimizer-number-of-tiers-to-create"

// optimizer qos server flags
OptimizerQosServerAddressFlag = "optimizer-qos-server-address" // address of the optimizer qos server to send the qos reports
OptimizerQosServerAddressRefactorFlag = "optimizer-qos-server-address-refactor" // address of the refactored optimizer qos server to send the qos reports
OptimizerQosServerPushIntervalFlag = "optimizer-qos-push-interval" // interval to push the qos reports to the optimizer qos server
OptimizerQosServerSamplingIntervalFlag = "optimizer-qos-sampling-interval" // interval to sample the qos reports
// websocket flags
RateLimitWebSocketFlag = "rate-limit-websocket-requests-per-connection"
BanDurationForWebsocketRateLimitExceededFlag = "ban-duration-for-websocket-rate-limit-exceeded"
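For context on how these constants get consumed: a minimal sketch of registering and reading the new flag, assuming the usual cobra/viper pattern. The command name, defaults, and the protocol/common import alias are illustrative, not the actual rpcconsumer wiring:

    package main

    import (
    	"fmt"

    	"github.com/spf13/cobra"
    	"github.com/spf13/viper"

    	common "github.com/lavanet/lava/v4/protocol/common"
    )

    func main() {
    	cmd := &cobra.Command{
    		Use: "rpcconsumer",
    		RunE: func(cmd *cobra.Command, args []string) error {
    			// An empty refactor address can simply mean "refactored reporting disabled".
    			fmt.Println("qos server:", viper.GetString(common.OptimizerQosServerAddressFlag))
    			fmt.Println("refactor qos server:", viper.GetString(common.OptimizerQosServerAddressRefactorFlag))
    			return nil
    		},
    	}
    	cmd.Flags().String(common.OptimizerQosServerAddressFlag, "", "optimizer qos server address")
    	cmd.Flags().String(common.OptimizerQosServerAddressRefactorFlag, "", "refactored optimizer qos server address")
    	_ = viper.BindPFlags(cmd.Flags())
    	_ = cmd.Execute()
    }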
162 changes: 162 additions & 0 deletions protocol/metrics/consumer_metrics_manager.go
@@ -8,6 +8,7 @@ import (
"time"

"github.com/lavanet/lava/v4/utils"
scoreutils "github.com/lavanet/lava/v4/utils/score"
pairingtypes "github.com/lavanet/lava/v4/x/pairing/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
@@ -64,6 +65,22 @@ type ConsumerMetricsManager struct {
relayProcessingLatencyBeforeProvider *prometheus.GaugeVec
relayProcessingLatencyAfterProvider *prometheus.GaugeVec
averageProcessingLatency map[string]*LatencyTracker

// optimizer metrics
optimizerProviderScore *prometheus.GaugeVec
optimizerProviderLatency *prometheus.GaugeVec
optimizerProviderSync *prometheus.GaugeVec
optimizerProviderAvailability *prometheus.GaugeVec
optimizerProviderTier *prometheus.GaugeVec
optimizerTierChance *prometheus.GaugeVec

// refactored optimizer metrics
optimizerRefactorProviderScore *prometheus.GaugeVec
optimizerRefactorProviderLatency *prometheus.GaugeVec
optimizerRefactorProviderSync *prometheus.GaugeVec
optimizerRefactorProviderAvailability *prometheus.GaugeVec
optimizerRefactorProviderTier *prometheus.GaugeVec
optimizerRefactorTierChance *prometheus.GaugeVec
}
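Review note: the refactored block doubles every field, definition, and registration that follows. An alternative worth weighing (a sketch only, with an illustrative metric name) is one set of GaugeVecs carrying an optimizer_version label, which halves the bookkeeping; the trade-off is that existing dashboards select by metric name, and moving the distinction into a label would touch every query:

    package metrics

    import (
    	"fmt"

    	"github.com/prometheus/client_golang/prometheus"
    )

    // Sketch: one vector per measurement, with the optimizer version carried as
    // a label instead of a parallel set of *_refactor metrics.
    var optimizerProviderScoreAlt = prometheus.NewGaugeVec(prometheus.GaugeOpts{
    	Name: "lava_consumer_optimizer_provider_score_alt", // illustrative name
    	Help: "[Optimizer] The total score of a provider",
    }, []string{"spec", "api_interface", "provider_address", "epoch", "optimizer_version"})

    func setProviderScoreAlt(chainId, apiInterface, providerAddress string, epoch uint64, refactored bool, score float64) {
    	version := "baseline"
    	if refactored {
    		version = "refactor"
    	}
    	optimizerProviderScoreAlt.
    		WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch), version).
    		Set(score)
    }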

type ConsumerMetricsManagerOptions struct {
@@ -210,6 +227,68 @@ func NewConsumerMetricsManager(options ConsumerMetricsManagerOptions) *ConsumerMetricsManager {
Help: "average latency of processing a successful relay after it is received from the provider in µs (10^6)",
}, []string{"spec", "apiInterface"})

optimizerProviderScore := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_provider_score",
Help: "[Optimizer] The total score of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerProviderLatency := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_provider_latency",
Help: "[Optimizer] The latency of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerProviderSync := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_provider_sync",
Help: "[Optimizer] The sync of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerProviderAvailability := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_provider_availability",
Help: "[Optimizer] The availability of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerProviderTier := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_provider_tier",
Help: "[Optimizer] The tier of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerTierChance := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_provider_tiers_chances",
Help: "[Optimizer] The chances of a tier being selected by the optimizer",
}, []string{"spec", "api_interface", "tier", "epoch"})

// refactored optimizer metrics

optimizerRefactorProviderScore := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_refactor_provider_score",
Help: "[Optimizer Refactor] The total score of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerRefactorProviderLatency := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_refactor_provider_latency",
Help: "[Optimizer Refactor] The latency of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerRefactorProviderSync := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_refactor_provider_sync",
Help: "[Optimizer Refactor] The sync of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerRefactorProviderAvailability := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_refactor_provider_availability",
Help: "[Optimizer Refactor] The availability of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerRefactorProviderTier := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_refactor_provider_tier",
Help: "[Optimizer Refactor] The tier of a provider",
}, []string{"spec", "api_interface", "provider_address", "epoch"})

optimizerRefactorTierChance := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "lava_consumer_optimizer_refactor_provider_tiers_chances",
Help: "[Optimizer Refactor] The chances of a tier being selected by the optimizer",
}, []string{"spec", "api_interface", "tier", "epoch"})
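Review note: these vectors are easy to cover with a quick unit test before wiring them into the manager; a minimal sketch using client_golang's testutil package (the label values are invented):

    package metrics

    import (
    	"testing"

    	"github.com/prometheus/client_golang/prometheus"
    	"github.com/prometheus/client_golang/prometheus/testutil"
    )

    func TestOptimizerScoreGauge(t *testing.T) {
    	gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
    		Name: "lava_consumer_optimizer_refactor_provider_score",
    		Help: "[Optimizer Refactor] The total score of a provider",
    	}, []string{"spec", "api_interface", "provider_address", "epoch"})

    	gauge.WithLabelValues("LAV1", "rest", "lava@provider1", "100").Set(0.75)

    	got := testutil.ToFloat64(gauge.WithLabelValues("LAV1", "rest", "lava@provider1", "100"))
    	if got != 0.75 {
    		t.Fatalf("expected 0.75, got %f", got)
    	}
    }

Also worth noting: because epoch is a label, each new epoch mints a fresh set of series that is never deleted; if memory growth becomes a concern, GaugeVec.DeletePartialMatch can prune series for old epochs.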

// Register the metrics with the Prometheus registry.
prometheus.MustRegister(totalCURequestedMetric)
prometheus.MustRegister(totalRelaysRequestedMetric)
@@ -237,6 +316,18 @@ func NewConsumerMetricsManager(options ConsumerMetricsManagerOptions) *ConsumerMetricsManager {
prometheus.MustRegister(totalFailedWsSubscriptionRequestsMetric)
prometheus.MustRegister(totalDuplicatedWsSubscriptionRequestsMetric)
prometheus.MustRegister(totalWsSubscriptionDissconnectMetric)
prometheus.MustRegister(optimizerProviderScore)
prometheus.MustRegister(optimizerProviderLatency)
prometheus.MustRegister(optimizerProviderSync)
prometheus.MustRegister(optimizerProviderAvailability)
prometheus.MustRegister(optimizerProviderTier)
prometheus.MustRegister(optimizerTierChance)
prometheus.MustRegister(optimizerRefactorProviderScore)
prometheus.MustRegister(optimizerRefactorProviderLatency)
prometheus.MustRegister(optimizerRefactorProviderSync)
prometheus.MustRegister(optimizerRefactorProviderAvailability)
prometheus.MustRegister(optimizerRefactorProviderTier)
prometheus.MustRegister(optimizerRefactorTierChance)

consumerMetricsManager := &ConsumerMetricsManager{
totalCURequestedMetric: totalCURequestedMetric,
@@ -270,6 +361,20 @@ func NewConsumerMetricsManager(options ConsumerMetricsManagerOptions) *ConsumerMetricsManager {
relayProcessingLatencyBeforeProvider: relayProcessingLatencyBeforeProvider,
relayProcessingLatencyAfterProvider: relayProcessingLatencyAfterProvider,
averageProcessingLatency: map[string]*LatencyTracker{},

optimizerProviderScore: optimizerProviderScore,
optimizerProviderLatency: optimizerProviderLatency,
optimizerProviderSync: optimizerProviderSync,
optimizerProviderAvailability: optimizerProviderAvailability,
optimizerProviderTier: optimizerProviderTier,
optimizerTierChance: optimizerTierChance,

optimizerRefactorProviderScore: optimizerRefactorProviderScore,
optimizerRefactorProviderLatency: optimizerRefactorProviderLatency,
optimizerRefactorProviderSync: optimizerRefactorProviderSync,
optimizerRefactorProviderAvailability: optimizerRefactorProviderAvailability,
optimizerRefactorProviderTier: optimizerRefactorProviderTier,
optimizerRefactorTierChance: optimizerRefactorTierChance,
}

http.Handle("/metrics", promhttp.Handler())
@@ -545,3 +650,60 @@ func (pme *ConsumerMetricsManager) SetWsSubscriptioDisconnectRequestMetric(chainId string, apiInterface string, disconnectReason string) {
}
pme.totalWsSubscriptionDissconnectMetric.WithLabelValues(chainId, apiInterface, disconnectReason).Inc()
}

func (pme *ConsumerMetricsManager) SetOptimizerProviderScoreMetric(chainId string, apiInterface string, providerAddress string, epoch uint64, scoreType string, refactored bool, score float64) {
if pme == nil {
return
}

switch scoreType {
case scoreutils.LatencyScoreType_Refactor:
if refactored {
pme.optimizerRefactorProviderLatency.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
} else {
pme.optimizerProviderLatency.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
}
case scoreutils.SyncScoreType_Refactor:
if refactored {
pme.optimizerRefactorProviderSync.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
} else {
pme.optimizerProviderSync.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
}
case scoreutils.AvailabilityScoreType_Refactor:
if refactored {
pme.optimizerRefactorProviderAvailability.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
} else {
pme.optimizerProviderAvailability.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
}
case scoreutils.TotalScoreType_Refactor:
if refactored {
pme.optimizerRefactorProviderScore.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
} else {
pme.optimizerProviderScore.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(score)
}
default:
utils.LavaFormatError("Unknown score type", nil, utils.Attribute{Key: "scoreType", Value: scoreType})
}
}

func (pme *ConsumerMetricsManager) SetOptimizerProviderTierMetric(chainId string, apiInterface string, providerAddress string, epoch uint64, refactored bool, tier int) {
if pme == nil {
return
}
if refactored {
pme.optimizerRefactorProviderTier.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(float64(tier))
} else {
pme.optimizerProviderTier.WithLabelValues(chainId, apiInterface, providerAddress, fmt.Sprintf("%d", epoch)).Set(float64(tier))
}
}

func (pme *ConsumerMetricsManager) SetOptimizerTierChanceMetric(chainId string, apiInterface string, tier int, epoch uint64, refactored bool, chance float64) {
if pme == nil {
return
}
if refactored {
pme.optimizerRefactorTierChance.WithLabelValues(chainId, apiInterface, fmt.Sprintf("%d", tier), fmt.Sprintf("%d", epoch)).Set(chance)
} else {
pme.optimizerTierChance.WithLabelValues(chainId, apiInterface, fmt.Sprintf("%d", tier), fmt.Sprintf("%d", epoch)).Set(chance)
}
}
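Taken together, the three setters are presumably driven from the same sampling loop that builds the QoS reports. A hedged sketch of a call site follows; the loop shape is an assumption based on OptimizerQoSReport in the next file, not code in this PR, and treating EntryIndex as the provider's tier is also an assumption:

    package metrics

    import (
    	scoreutils "github.com/lavanet/lava/v4/utils/score"
    )

    // Hypothetical call site: pushing one optimizer report into the gauges.
    func publishOptimizerReport(m *ConsumerMetricsManager, report *OptimizerQoSReport, chainId, apiInterface string, epoch uint64, refactored bool) {
    	m.SetOptimizerProviderScoreMetric(chainId, apiInterface, report.ProviderAddress, epoch, scoreutils.TotalScoreType_Refactor, refactored, report.GenericScore)
    	m.SetOptimizerProviderScoreMetric(chainId, apiInterface, report.ProviderAddress, epoch, scoreutils.LatencyScoreType_Refactor, refactored, report.LatencyScore)
    	m.SetOptimizerProviderScoreMetric(chainId, apiInterface, report.ProviderAddress, epoch, scoreutils.SyncScoreType_Refactor, refactored, report.SyncScore)
    	m.SetOptimizerProviderScoreMetric(chainId, apiInterface, report.ProviderAddress, epoch, scoreutils.AvailabilityScoreType_Refactor, refactored, report.AvailabilityScore)
    	// Assumption: the report's EntryIndex is the selected tier.
    	m.SetOptimizerProviderTierMetric(chainId, apiInterface, report.ProviderAddress, epoch, refactored, report.EntryIndex)
    	for tier, chance := range report.TierChances {
    		m.SetOptimizerTierChanceMetric(chainId, apiInterface, tier, epoch, refactored, chance)
    	}
    }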
87 changes: 71 additions & 16 deletions protocol/metrics/consumer_optimizer_qos_client.go
@@ -21,9 +21,11 @@ var (
)

type ConsumerOptimizerQoSClient struct {
consumerOrigin string
queueSender *QueueSender
queueSenderRefactored *QueueSender
optimizers map[string]OptimizerInf // keys are chain ids
optimizersRefactored map[string]OptimizerInf // keys are chain ids
// keys are chain ids, values are maps with provider addresses as keys
chainIdToProviderToRelaysCount map[string]map[string]uint64
chainIdToProviderToNodeErrorsCount map[string]map[string]uint64
@@ -39,21 +41,23 @@ type OptimizerQoSReport struct {
LatencyScore float64
GenericScore float64
EntryIndex int
TierChances map[int]float64
}

type optimizerQoSReportToSend struct {
Timestamp time.Time `json:"timestamp"`
SyncScore float64 `json:"sync_score"`
AvailabilityScore float64 `json:"availability_score"`
LatencyScore float64 `json:"latency_score"`
GenericScore float64 `json:"generic_score"`
ProviderAddress string `json:"provider"`
ConsumerOrigin string `json:"consumer"`
ChainId string `json:"chain_id"`
NodeErrorRate float64 `json:"node_error_rate"`
Epoch uint64 `json:"epoch"`
ProviderStake int64 `json:"provider_stake"`
EntryIndex int `json:"entry_index"`
TierChances map[int]float64 `json:"tier_chances"`
}
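For reference, since tier_chances is the only wire-format change: Go's encoding/json renders int-keyed maps with string object keys, so a hypothetical trimmed report serializes as sketched below (the struct copy and all field values are invented for illustration):

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"time"
    )

    // Trimmed, hypothetical copy of optimizerQoSReportToSend, just to show the
    // wire shape of the new field.
    type reportSample struct {
    	Timestamp   time.Time       `json:"timestamp"`
    	ChainId     string          `json:"chain_id"`
    	TierChances map[int]float64 `json:"tier_chances"`
    }

    func main() {
    	b, _ := json.Marshal(reportSample{
    		Timestamp:   time.Date(2024, 12, 1, 0, 0, 0, 0, time.UTC),
    		ChainId:     "LAV1",
    		TierChances: map[int]float64{0: 0.6, 1: 0.25, 2: 0.15},
    	})
    	fmt.Println(string(b))
    	// {"timestamp":"2024-12-01T00:00:00Z","chain_id":"LAV1","tier_chances":{"0":0.6,"1":0.25,"2":0.15}}
    }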

func (oqosr optimizerQoSReportToSend) String() string {
@@ -68,7 +72,7 @@ type OptimizerInf interface {
CalculateQoSScoresForMetrics(allAddresses []string, ignoredProviders map[string]struct{}, cu uint64, requestedBlock int64) []*OptimizerQoSReport
}

-func NewConsumerOptimizerQoSClient(endpointAddress string, interval ...time.Duration) *ConsumerOptimizerQoSClient {
+func NewConsumerOptimizerQoSClient(endpointAddress string, endpointAddressRefactor string, interval ...time.Duration) *ConsumerOptimizerQoSClient {
hostname, err := os.Hostname()
if err != nil {
utils.LavaFormatWarning("Error while getting hostname for ConsumerOptimizerQoSClient", err)
@@ -78,7 +82,9 @@ func NewConsumerOptimizerQoSClient(endpointAddress string, interval ...time.Duration) *ConsumerOptimizerQoSClient {
return &ConsumerOptimizerQoSClient{
consumerOrigin: hostname,
queueSender: NewQueueSender(endpointAddress, "ConsumerOptimizerQoS", nil, interval...),
queueSenderRefactored: NewQueueSender(endpointAddressRefactor, "ConsumerOptimizerQoSRefactored", nil, interval...),
optimizers: map[string]OptimizerInf{},
optimizersRefactored: map[string]OptimizerInf{},
chainIdToProviderToRelaysCount: map[string]map[string]uint64{},
chainIdToProviderToNodeErrorsCount: map[string]map[string]uint64{},
chainIdToProviderToEpochToStake: map[string]map[string]map[uint64]int64{},
@@ -147,6 +153,27 @@ func (coqc *ConsumerOptimizerQoSClient) appendOptimizerQoSReport(report *OptimizerQoSReport, chainId string, epoch uint64) {
coqc.queueSender.appendQueue(optimizerQoSReportToSend)
}

func (coqc *ConsumerOptimizerQoSClient) appendOptimizerQoSReportRefactored(report *OptimizerQoSReport, chainId string, epoch uint64) {
// must be called under read lock

optimizerQoSReportToSend := optimizerQoSReportToSend{
Timestamp: time.Now(),
ConsumerOrigin: coqc.consumerOrigin,
SyncScore: report.SyncScore,
AvailabilityScore: report.AvailabilityScore,
LatencyScore: report.LatencyScore,
GenericScore: report.GenericScore,
ProviderAddress: report.ProviderAddress,
EntryIndex: report.EntryIndex,
ChainId: chainId,
Epoch: epoch,
NodeErrorRate: coqc.calculateNodeErrorRate(chainId, report.ProviderAddress),
ProviderStake: coqc.getProviderChainStake(chainId, report.ProviderAddress, epoch),
}

coqc.queueSenderRefactored.appendQueue(optimizerQoSReportToSend)
}
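Review note: this function is a line-for-line copy of appendOptimizerQoSReport above, differing only in the destination queue. A possible consolidation, sketched against the code in this diff (not part of the PR):

    func (coqc *ConsumerOptimizerQoSClient) buildReportToSend(report *OptimizerQoSReport, chainId string, epoch uint64) optimizerQoSReportToSend {
    	// must be called under read lock, same as the callers above
    	return optimizerQoSReportToSend{
    		Timestamp:         time.Now(),
    		ConsumerOrigin:    coqc.consumerOrigin,
    		SyncScore:         report.SyncScore,
    		AvailabilityScore: report.AvailabilityScore,
    		LatencyScore:      report.LatencyScore,
    		GenericScore:      report.GenericScore,
    		ProviderAddress:   report.ProviderAddress,
    		EntryIndex:        report.EntryIndex,
    		ChainId:           chainId,
    		Epoch:             epoch,
    		NodeErrorRate:     coqc.calculateNodeErrorRate(chainId, report.ProviderAddress),
    		ProviderStake:     coqc.getProviderChainStake(chainId, report.ProviderAddress, epoch),
    	}
    }

    func (coqc *ConsumerOptimizerQoSClient) appendReport(report *OptimizerQoSReport, chainId string, epoch uint64, refactored bool) {
    	toSend := coqc.buildReportToSend(report, chainId, epoch)
    	if refactored {
    		coqc.queueSenderRefactored.appendQueue(toSend)
    		return
    	}
    	coqc.queueSender.appendQueue(toSend)
    }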

func (coqc *ConsumerOptimizerQoSClient) getReportsFromOptimizers() {
coqc.lock.RLock() // we only read from the maps here
defer coqc.lock.RUnlock()
@@ -168,6 +195,18 @@ func (coqc *ConsumerOptimizerQoSClient) getReportsFromOptimizers() {
coqc.appendOptimizerQoSReport(report, chainId, currentEpoch)
}
}

for chainId, optimizer := range coqc.optimizersRefactored {
providersMap, ok := coqc.chainIdToProviderToEpochToStake[chainId]
if !ok {
continue
}

reports := optimizer.CalculateQoSScoresForMetrics(maps.Keys(providersMap), ignoredProviders, cu, requestedBlock)
for _, report := range reports {
coqc.appendOptimizerQoSReportRefactored(report, chainId, currentEpoch)
}
}
}
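One subtlety in both loops: maps.Keys(providersMap) is passed where a []string is expected, which matches golang.org/x/exp/maps (whose Keys returns a slice) but not the Go 1.23+ standard-library maps package (whose Keys returns an iterator). A small sketch of the distinction, in case the import is ever switched:

    package main

    import (
    	"fmt"

    	xmaps "golang.org/x/exp/maps" // Keys returns []K
    	// The standard-library "maps" (Go 1.23+) returns iter.Seq[K] instead;
    	// slices.Collect(maps.Keys(m)) would be the equivalent slice.
    )

    func main() {
    	m := map[string]struct{}{"providerA": {}, "providerB": {}}
    	keys := xmaps.Keys(m) // []string, iteration order not guaranteed
    	fmt.Println(keys)
    }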

func (coqc *ConsumerOptimizerQoSClient) StartOptimizersQoSReportsCollecting(ctx context.Context, samplingInterval time.Duration) {
@@ -205,6 +244,22 @@ func (coqc *ConsumerOptimizerQoSClient) RegisterOptimizer(optimizer OptimizerInf, chainId string) {
coqc.optimizers[chainId] = optimizer
}

func (coqc *ConsumerOptimizerQoSClient) RegisterOptimizerRefactored(optimizer OptimizerInf, chainId string) {
if coqc == nil {
return
}

coqc.lock.Lock()
defer coqc.lock.Unlock()

if _, found := coqc.optimizersRefactored[chainId]; found {
utils.LavaFormatWarning("Optimizer refactored already registered for chain", nil, utils.LogAttr("chainId", chainId))
return
}

coqc.optimizersRefactored[chainId] = optimizer
}
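At startup the consumer presumably registers each chain's optimizer with both paths and starts the sampling loop. A hedged sketch of that wiring, using only the constructor and methods from this diff; the endpoint addresses and chain id are illustrative, and the optimizer values are whatever implements OptimizerInf:

    package metrics

    import (
    	"context"
    	"time"
    )

    func wireOptimizerQoS(ctx context.Context, current, refactored OptimizerInf, samplingInterval time.Duration) *ConsumerOptimizerQoSClient {
    	// In practice the addresses would come from the cobra flags added in this PR.
    	client := NewConsumerOptimizerQoSClient("http://qos.example:8080", "http://qos-refactor.example:8080")
    	client.RegisterOptimizer(current, "LAV1")              // chain id illustrative
    	client.RegisterOptimizerRefactored(refactored, "LAV1") // same chain, refactored path
    	client.StartOptimizersQoSReportsCollecting(ctx, samplingInterval)
    	return client
    }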

func (coqc *ConsumerOptimizerQoSClient) incrementStoreCounter(store map[string]map[string]uint64, chainId, providerAddress string) {
// must be called under write lock
if coqc == nil {