Skip to content

Commit

Permalink
Merge pull request #156 from xmidt-org/update-metrics
Browse files Browse the repository at this point in the history
Update the expired metric to explicitly cover only messages that expi…
  • Loading branch information
schmidtw authored Aug 29, 2019
2 parents 58479b0 + 80d71df commit f96ad60
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 45 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Metrics to support debugging the problem found by GH Issue #145.

## [v0.1.5]
fixed build upload
Expand Down
74 changes: 62 additions & 12 deletions src/caduceus/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,24 @@ import (
)

const (
ErrorRequestBodyCounter = "error_request_body_count"
EmptyRequestBodyCounter = "empty_request_body_count"
DeliveryCounter = "delivery_count"
DeliveryRetryCounter = "delivery_retry_count"
SlowConsumerDroppedMsgCounter = "slow_consumer_dropped_message_count"
SlowConsumerCounter = "slow_consumer_cut_off_count"
IncomingQueueDepth = "incoming_queue_depth"
IncomingContentTypeCounter = "incoming_content_type_count"
IncomingEventTypeCounter = "incoming_event_type_count"
DropsDueToInvalidPayload = "drops_due_to_invalid_payload"
OutgoingQueueDepth = "outgoing_queue_depths"
DropsDueToPanic = "drops_due_to_panic"
ErrorRequestBodyCounter = "error_request_body_count"
EmptyRequestBodyCounter = "empty_request_body_count"
DeliveryCounter = "delivery_count"
DeliveryRetryCounter = "delivery_retry_count"
DeliveryRetryMaxGauge = "delivery_retry_max"
SlowConsumerDroppedMsgCounter = "slow_consumer_dropped_message_count"
SlowConsumerCounter = "slow_consumer_cut_off_count"
IncomingQueueDepth = "incoming_queue_depth"
IncomingContentTypeCounter = "incoming_content_type_count"
IncomingEventTypeCounter = "incoming_event_type_count"
DropsDueToInvalidPayload = "drops_due_to_invalid_payload"
OutgoingQueueDepth = "outgoing_queue_depths"
DropsDueToPanic = "drops_due_to_panic"
ConsumerRenewalTimeGauge = "consumer_renewal_time"
ConsumerDeliverUntilGauge = "consumer_deliver_until"
ConsumerDropUntilGauge = "consumer_drop_until"
ConsumerDeliveryWorkersGauge = "consumer_delivery_workers"
ConsumerMaxDeliveryWorkersGauge = "consumer_delivery_workers_max"
)

func Metrics() []xmetrics.Metric {
Expand Down Expand Up @@ -53,6 +59,12 @@ func Metrics() []xmetrics.Metric {
Type: "counter",
LabelNames: []string{"url", "event"},
},
{
Name: DeliveryRetryMaxGauge,
Help: "Maximum number of delivery retries attempted",
Type: "gauge",
LabelNames: []string{"url"},
},
{
Name: DeliveryCounter,
Help: "Count of delivered messages to a url with a status code",
Expand Down Expand Up @@ -89,19 +101,57 @@ func Metrics() []xmetrics.Metric {
Type: "counter",
LabelNames: []string{"url"},
},
{
Name: ConsumerRenewalTimeGauge,
Help: "Time when the consumer data was updated.",
Type: "gauge",
LabelNames: []string{"url"},
},
{
Name: ConsumerDeliverUntilGauge,
Help: "Time when the consumer's registration expires and events will be dropped.",
Type: "gauge",
LabelNames: []string{"url"},
},
{
Name: ConsumerDropUntilGauge,
Help: "The time after which events going to a customer will be delivered.",
Type: "gauge",
LabelNames: []string{"url"},
},
{
Name: ConsumerDeliveryWorkersGauge,
Help: "The number of active delivery workers for a particular customer.",
Type: "gauge",
LabelNames: []string{"url"},
},
{
Name: ConsumerMaxDeliveryWorkersGauge,
Help: "The maximum number of delivery workers available for a particular customer.",
Type: "gauge",
LabelNames: []string{"url"},
},
}
}

func CreateOutbounderMetrics(m CaduceusMetricsRegistry, c *CaduceusOutboundSender) {
c.deliveryCounter = m.NewCounter(DeliveryCounter)
c.deliveryRetryCounter = m.NewCounter(DeliveryRetryCounter)
c.deliveryRetryMaxGauge = m.NewGauge(DeliveryRetryMaxGauge)
c.cutOffCounter = m.NewCounter(SlowConsumerCounter).With("url", c.id)
c.droppedQueueFullCounter = m.NewCounter(SlowConsumerDroppedMsgCounter).With("url", c.id, "reason", "queue_full")
c.droppedExpiredCounter = m.NewCounter(SlowConsumerDroppedMsgCounter).With("url", c.id, "reason", "expired")
c.droppedExpiredBeforeQueueCounter = m.NewCounter(SlowConsumerDroppedMsgCounter).With("url", c.id, "reason", "expired_before_queueing")

c.droppedCutoffCounter = m.NewCounter(SlowConsumerDroppedMsgCounter).With("url", c.id, "reason", "cut_off")
c.droppedInvalidConfig = m.NewCounter(SlowConsumerDroppedMsgCounter).With("url", c.id, "reason", "invalid_config")
c.droppedNetworkErrCounter = m.NewCounter(SlowConsumerDroppedMsgCounter).With("url", c.id, "reason", "network_err")
c.droppedPanic = m.NewCounter(DropsDueToPanic).With("url", c.id)
c.queueDepthGauge = m.NewGauge(OutgoingQueueDepth).With("url", c.id)
c.contentTypeCounter = m.NewCounter(IncomingContentTypeCounter)
c.renewalTimeGauge = m.NewGauge(ConsumerRenewalTimeGauge).With("url", c.id)
c.deliverUntilGauge = m.NewGauge(ConsumerDeliverUntilGauge).With("url", c.id)
c.dropUntilGauge = m.NewGauge(ConsumerDropUntilGauge).With("url", c.id)
c.currentWorkersGauge = m.NewGauge(ConsumerDeliveryWorkersGauge).With("url", c.id)
c.maxWorkersGauge = m.NewGauge(ConsumerMaxDeliveryWorkersGauge).With("url", c.id)
}
4 changes: 3 additions & 1 deletion src/caduceus/mocks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ func (m *mockGauge) Add(delta float64) {
}

func (m *mockGauge) Set(value float64) {
m.Called(value)
// We're setting time values & the ROI seems pretty low with this level
// of validation...
//m.Called(value)
}

func (m *mockGauge) With(labelValues ...string) metrics.Gauge {
Expand Down
84 changes: 52 additions & 32 deletions src/caduceus/outboundSender.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,37 +115,44 @@ type OutboundSender interface {

// CaduceusOutboundSender is the outbound sender object.
type CaduceusOutboundSender struct {
id string
urls *ring.Ring
listener webhook.W
deliverUntil time.Time
dropUntil time.Time
sender func(*http.Request) (*http.Response, error)
events []*regexp.Regexp
matcher []*regexp.Regexp
queueSize int
queue chan *wrp.Message
deliveryRetries int
deliveryInterval time.Duration
deliveryCounter metrics.Counter
deliveryRetryCounter metrics.Counter
droppedQueueFullCounter metrics.Counter
droppedCutoffCounter metrics.Counter
droppedExpiredCounter metrics.Counter
droppedNetworkErrCounter metrics.Counter
droppedInvalidConfig metrics.Counter
droppedPanic metrics.Counter
cutOffCounter metrics.Counter
contentTypeCounter metrics.Counter
queueDepthGauge metrics.Gauge
eventType metrics.Counter
wg sync.WaitGroup
cutOffPeriod time.Duration
workers semaphore.Interface
maxWorkers int
failureMsg FailureMessage
logger log.Logger
mutex sync.RWMutex
id string
urls *ring.Ring
listener webhook.W
deliverUntil time.Time
dropUntil time.Time
sender func(*http.Request) (*http.Response, error)
events []*regexp.Regexp
matcher []*regexp.Regexp
queueSize int
queue chan *wrp.Message
deliveryRetries int
deliveryInterval time.Duration
deliveryCounter metrics.Counter
deliveryRetryCounter metrics.Counter
droppedQueueFullCounter metrics.Counter
droppedCutoffCounter metrics.Counter
droppedExpiredCounter metrics.Counter
droppedExpiredBeforeQueueCounter metrics.Counter
droppedNetworkErrCounter metrics.Counter
droppedInvalidConfig metrics.Counter
droppedPanic metrics.Counter
cutOffCounter metrics.Counter
contentTypeCounter metrics.Counter
queueDepthGauge metrics.Gauge
renewalTimeGauge metrics.Gauge
deliverUntilGauge metrics.Gauge
dropUntilGauge metrics.Gauge
maxWorkersGauge metrics.Gauge
currentWorkersGauge metrics.Gauge
deliveryRetryMaxGauge metrics.Gauge
eventType metrics.Counter
wg sync.WaitGroup
cutOffPeriod time.Duration
workers semaphore.Interface
maxWorkers int
failureMsg FailureMessage
logger log.Logger
mutex sync.RWMutex
}

// New creates a new OutboundSender object from the factory, or returns an error.
Expand Down Expand Up @@ -270,6 +277,8 @@ func (obs *CaduceusOutboundSender) Update(wh webhook.W) (err error) {
}
}

obs.renewalTimeGauge.Set(float64(time.Now().Unix()))

// write/update obs
obs.mutex.Lock()

Expand All @@ -281,12 +290,15 @@ func (obs *CaduceusOutboundSender) Update(wh webhook.W) (err error) {

obs.listener.FailureURL = wh.FailureURL
obs.deliverUntil = wh.Until
obs.deliverUntilGauge.Set(float64(obs.deliverUntil.Unix()))

obs.events = events

// update default deliver retry count for sender
if wh.Config.MaxRetryCount != 0 {
obs.deliveryRetries = wh.Config.MaxRetryCount
}
obs.deliveryRetryMaxGauge.Set(float64(obs.deliveryRetries))

// if matcher list is empty set it nil for Queue() logic
obs.matcher = nil
Expand Down Expand Up @@ -314,6 +326,9 @@ func (obs *CaduceusOutboundSender) Update(wh webhook.W) (err error) {
offset--
}

// Update this here in case we make this configurable later
obs.maxWorkersGauge.Set(float64(obs.maxWorkers))

obs.mutex.Unlock()

return
Expand All @@ -328,12 +343,14 @@ func (obs *CaduceusOutboundSender) Shutdown(gentle bool) {
obs.mutex.Lock()
if false == gentle {
obs.deliverUntil = time.Time{}
obs.deliverUntilGauge.Set(float64(obs.deliverUntil.Unix()))
}
obs.mutex.Unlock()
obs.wg.Wait()

obs.mutex.Lock()
obs.deliverUntil = time.Time{}
obs.deliverUntilGauge.Set(float64(obs.deliverUntil.Unix()))
obs.mutex.Unlock()
}

Expand Down Expand Up @@ -433,7 +450,7 @@ func (obs *CaduceusOutboundSender) isValidTimeWindow(now, dropUntil, deliverUnti
if false == now.Before(deliverUntil) {
debugLog.Log(logging.MessageKey(), "Outside delivery window",
"now", now, "before", deliverUntil, "after", dropUntil)
obs.droppedExpiredCounter.Add(1.0)
obs.droppedExpiredBeforeQueueCounter.Add(1.0)
return false
}

Expand Down Expand Up @@ -469,6 +486,7 @@ func (obs *CaduceusOutboundSender) dispatcher() {
continue
}
obs.workers.Acquire()
obs.currentWorkersGauge.Add(1.0)

go obs.send(urls, secret, accept, msg)
}
Expand All @@ -489,6 +507,7 @@ func (obs *CaduceusOutboundSender) send(urls *ring.Ring, secret, acceptType stri
"id", obs.id, "panic", r)
}
obs.workers.Release()
obs.currentWorkersGauge.Add(-1.0)
}()

payload := msg.Payload
Expand Down Expand Up @@ -594,6 +613,7 @@ func (obs *CaduceusOutboundSender) send(urls *ring.Ring, secret, acceptType stri
func (obs *CaduceusOutboundSender) queueOverflow() {
obs.mutex.Lock()
obs.dropUntil = time.Now().Add(obs.cutOffPeriod)
obs.dropUntilGauge.Set(float64(obs.dropUntil.Unix()))
secret := obs.listener.Config.Secret
failureMsg := obs.failureMsg
failureURL := obs.listener.FailureURL
Expand Down
7 changes: 7 additions & 0 deletions src/caduceus/outboundSender_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func simpleFactorySetup(trans *transport, cutOffPeriod time.Duration, matcher []
fakeDroppedSlow.On("With", []string{"url", w.Config.URL, "reason", "queue_full"}).Return(fakeDroppedSlow)
fakeDroppedSlow.On("With", []string{"url", w.Config.URL, "reason", "cut_off"}).Return(fakeDroppedSlow)
fakeDroppedSlow.On("With", []string{"url", w.Config.URL, "reason", "expired"}).Return(fakeDroppedSlow)
fakeDroppedSlow.On("With", []string{"url", w.Config.URL, "reason", "expired_before_queueing"}).Return(fakeDroppedSlow)
fakeDroppedSlow.On("With", []string{"url", w.Config.URL, "reason", "invalid_config"}).Return(fakeDroppedSlow)
fakeDroppedSlow.On("With", []string{"url", w.Config.URL, "reason", "network_err"}).Return(fakeDroppedSlow)
fakeDroppedSlow.On("Add", 1.0).Return()
Expand Down Expand Up @@ -155,6 +156,12 @@ func simpleFactorySetup(trans *transport, cutOffPeriod time.Duration, matcher []
fakeRegistry.On("NewCounter", IncomingContentTypeCounter).Return(fakeContentType)
fakeRegistry.On("NewCounter", DropsDueToPanic).Return(fakePanicDrop)
fakeRegistry.On("NewGauge", OutgoingQueueDepth).Return(fakeQdepth)
fakeRegistry.On("NewGauge", DeliveryRetryMaxGauge).Return(fakeQdepth)
fakeRegistry.On("NewGauge", ConsumerRenewalTimeGauge).Return(fakeQdepth)
fakeRegistry.On("NewGauge", ConsumerDeliverUntilGauge).Return(fakeQdepth)
fakeRegistry.On("NewGauge", ConsumerDropUntilGauge).Return(fakeQdepth)
fakeRegistry.On("NewGauge", ConsumerDeliveryWorkersGauge).Return(fakeQdepth)
fakeRegistry.On("NewGauge", ConsumerMaxDeliveryWorkersGauge).Return(fakeQdepth)

return &OutboundSenderFactory{
Listener: w,
Expand Down
8 changes: 8 additions & 0 deletions src/caduceus/senderWrapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,13 @@ func getFakeFactory() *SenderWrapperFactory {
On("With", []string{"url", "http://localhost:8888/foo", "reason", "cut_off"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:8888/foo", "reason", "queue_full"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:8888/foo", "reason", "expired"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:8888/foo", "reason", "expired_before_queueing"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:8888/foo", "reason", "network_err"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:8888/foo", "reason", "invalid_config"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:9999/foo", "reason", "cut_off"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:9999/foo", "reason", "queue_full"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:9999/foo", "reason", "expired"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:9999/foo", "reason", "expired_before_queueing"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:9999/foo", "reason", "network_err"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:9999/foo", "reason", "invalid_config"}).Return(fakeIgnore).
On("With", []string{"url", "http://localhost:8888/foo", "code", "200", "event", "unknown"}).Return(fakeIgnore).
Expand All @@ -113,6 +115,12 @@ func getFakeFactory() *SenderWrapperFactory {
fakeRegistry.On("NewCounter", IncomingEventTypeCounter).Return(fakeIgnore)
fakeRegistry.On("NewCounter", DropsDueToPanic).Return(fakeIgnore)
fakeRegistry.On("NewGauge", OutgoingQueueDepth).Return(fakeGauge)
fakeRegistry.On("NewGauge", DeliveryRetryMaxGauge).Return(fakeGauge)
fakeRegistry.On("NewGauge", ConsumerRenewalTimeGauge).Return(fakeGauge)
fakeRegistry.On("NewGauge", ConsumerDeliverUntilGauge).Return(fakeGauge)
fakeRegistry.On("NewGauge", ConsumerDropUntilGauge).Return(fakeGauge)
fakeRegistry.On("NewGauge", ConsumerDeliveryWorkersGauge).Return(fakeGauge)
fakeRegistry.On("NewGauge", ConsumerMaxDeliveryWorkersGauge).Return(fakeGauge)

return &SenderWrapperFactory{
NumWorkersPerSender: 10,
Expand Down

0 comments on commit f96ad60

Please sign in to comment.