Skip to content

Commit

Permalink
misc: remove more unused code for smoke and crow (#6121)
Browse files Browse the repository at this point in the history
Follow-up to #6118: I missed some extra bits of smoke/crow-related code.
  • Loading branch information
rfratto authored Jan 11, 2024
1 parent 951d490 commit 6b32f7b
Show file tree
Hide file tree
Showing 6 changed files with 6 additions and 766 deletions.
136 changes: 5 additions & 131 deletions operations/agent-static-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -65,132 +65,6 @@ local _config = config._config;
},
],
},
{
name: 'GrafanaAgentSmokeChecks',
rules: [
{
alert: 'GrafanaAgentDown',
expr: |||
up{
namespace="agent-smoke-test",
pod=~"grafana-agent-smoke-test-(0|cluster-0|cluster-1|cluster-2)",
} == 0
|||,
'for': '5m',
annotations: {
summary: '{{ $labels.job }} is down',
},
},
{
alert: 'GrafanaAgentFlapping',
expr: |||
avg_over_time(up{
namespace="agent-smoke-test",
pod=~"grafana-agent-smoke-test-(0|cluster-0|cluster-1|cluster-2)",
}[5m]) < 1
|||,
'for': '15m',
annotations: {
summary: '{{ $labels.job }} is flapping',
},
},

// Checks that the CPU usage doesn't go too high. This was generated from internal usage where
// every 1,000 active series used roughly 0.0013441% of CPU. This alert only fires if there is a
// minimum load threshold of at least 1000 active series.
{
alert: 'GrafanaAgentCPUHigh',
expr: |||
(sum by (pod) (rate(container_cpu_usage_seconds_total{cluster=~".+", namespace=~"agent-smoke-test", container=~".+", pod="grafana-agent-smoke-test-cluster-2"}[5m]))
/
(sum by (pod) (agent_wal_storage_active_series{cluster=~".+", namespace=~"agent-smoke-test", container=~".+", pod="grafana-agent-smoke-test-cluster-2"}) / 1000)
> 0.0013441)
and
sum by (pod) (agent_wal_storage_active_series{cluster=~".+", namespace=~"agent-smoke-test", container=~".+", pod="grafana-agent-smoke-test-cluster-2"}) > 1000
|||,
'for': '1h',
annotations: {
summary: '{{ $labels.pod }} is using more than 0.0013441 CPU per 1000 series over the last 5 minutes',
},
},

// We assume roughly ~8KB per series. Check that each deployment
// doesn't go too far above this.
//
// We aggregate the memory of the scraping service together since an individual
// node with a really small number of active series will throw this metric off.
{
alert: 'GrafanaAgentMemHigh',
expr: |||
sum without (pod, instance) (go_memstats_heap_inuse_bytes{job=~"agent-smoke-test/grafana-agent-smoke-test.*"}) /
sum without (pod, instance, instance_group_name) (agent_wal_storage_active_series{job=~"agent-smoke-test/grafana-agent-smoke-test.*"}) / 1e3 > 10
|||,
'for': '1h',
annotations: {
summary: '{{ $labels.job }} has used more than 10KB per series for more than 5 minutes',
},
},
{
alert: 'GrafanaAgentContainerRestarts',
expr: |||
sum by (pod) (rate(kube_pod_container_status_restarts_total{namespace="agent-smoke-test"}[10m])) > 0
|||,
annotations: {
summary: '{{ $labels.pod }} has a high rate of container restarts',
},
},
],
},
{
name: 'GrafanaAgentCrowChecks',
rules: [
{
alert: 'CrowDown',
expr: |||
up{job=~"agent-smoke-test/crow-.*"} == 0
|||,
'for': '5m',
annotations: {
summary: 'Crow {{ $labels.job }} is down.',
},
},
{
alert: 'CrowFlapping',
expr: |||
avg_over_time(up{job=~"agent-smoke-test/crow-.*"}[5m]) < 1
|||,
'for': '15m',
annotations: {
summary: 'Crow {{ $labels.job }} is flapping.',
},
},
{
alert: 'CrowNotScraped',
expr: |||
rate(crow_test_samples_total[5m]) == 0
|||,
'for': '15m',
annotations: {
summary: 'Crow {{ $labels.job }} is not being scraped.',
},
},
{
alert: 'CrowFailures',
expr: |||
(
rate(crow_test_sample_results_total{result="success"}[5m])
/
ignoring(result) sum without (result) (rate(crow_test_sample_results_total[5m]))
)
< 1
|||,
'for': '15m',
annotations: {
summary: 'Crow {{ $labels.job }} has had failures for at least 5m',
},
},
],
},
{
name: 'VultureChecks',
rules: [
Expand Down Expand Up @@ -358,7 +232,7 @@ local _config = config._config;
},
annotations: {
message: |||
Instance {{ $labels.instance }} failed to successfully reload the config.
Instance {{ $labels.instance }} failed to successfully reload the config.
|||,
},
},
Expand All @@ -373,7 +247,7 @@ local _config = config._config;
},
annotations: {
message: |||
Instance {{ $labels.instance }} failed to successfully reload the config.
Instance {{ $labels.instance }} failed to successfully reload the config.
|||,
},
},
Expand All @@ -388,7 +262,7 @@ local _config = config._config;
},
annotations: {
message: |||
Instance {{ $labels.instance }} fell back to empty configuration.
Instance {{ $labels.instance }} fell back to empty configuration.
|||,
},
},
Expand All @@ -403,12 +277,12 @@ local _config = config._config;
},
annotations: {
message: |||
Instance {{ $labels.instance }} fell back to empty configuration.
Instance {{ $labels.instance }} fell back to empty configuration.
|||,
},
},
],
},
},
],
},
}
Loading

0 comments on commit 6b32f7b

Please sign in to comment.