From d9bf7728486dc61faab9e85119d241b84d4e080e Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Sun, 8 Sep 2024 10:36:17 -0700 Subject: [PATCH 1/4] Update CortexProvisioningTooManyActiveSeries to 3.2M series per ingester (#59) * Update CortexProvisioningTooManyActiveSeries to 3.2M series per ingester Signed-off-by: Friedrich Gonzalez * Adjust more things Signed-off-by: Friedrich Gonzalez * Update CHANGELOG.md --------- Signed-off-by: Friedrich Gonzalez Signed-off-by: Narsing Metpally --- CHANGELOG.md | 1 + cortex-mixin/alerts/alerts.libsonnet | 6 +++--- cortex-mixin/docs/playbooks.md | 6 +++--- cortex-mixin/recording_rules.libsonnet | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e0206ac..e6895ec1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [CHANGE] Use cortex v1.17.1 * [CHANGE] Enable shuffle sharding in compactors * [CHANGE] Remove chunks support for dashboards +* [CHANGE] Target 3M memory series per ingester instead of 1.5M * [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index e67ef449..7145d028 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -389,11 +389,11 @@ rules: [ { alert: 'CortexProvisioningTooManyActiveSeries', - // We target each ingester to 1.5M in-memory series. This alert fires if the average - // number of series / ingester in a Cortex cluster is > 1.6M for 2h (we compact + // We target each ingester to 3.0M in-memory series. This alert fires if the average + // number of series / ingester in a Cortex cluster is > 3.2M for 2h (we compact // the TSDB head every 2h). expr: ||| - avg by (%s) (cortex_ingester_memory_series) > 1.6e6 + avg by (%s) (cortex_ingester_memory_series) > 3.2e6 ||| % [$._config.alert_aggregation_labels], 'for': '2h', labels: { diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index b5b68895..39586870 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -555,13 +555,13 @@ How to **investigate**: ### CortexProvisioningTooManyActiveSeries -This alert fires if the average number of in-memory series per ingester is above our target (1.5M). +This alert fires if the average number of in-memory series per ingester is above our target (3.0M). How to **fix**: - Scale up ingesters - To find out the Cortex clusters where ingesters should be scaled up and how many minimum replicas are expected: ``` - ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 1.5e6) > + ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 3.0e6) > count by(cluster, namespace) (cortex_ingester_memory_series) ``` - After the scale up, the in-memory series are expected to be reduced at the next TSDB head compaction (occurring every 2h) @@ -595,7 +595,7 @@ How to **fix**: kubectl -n delete pod ingester-XXX ``` - Restarting an ingester typically reduces the memory allocated by mmap-ed files. After the restart, ingester may allocate this memory again over time, but it may give more time while working on a longer term solution -- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (1.5M). If so: +- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (3.0M). If so: - Scale up ingesters - Memory is expected to be reclaimed at the next TSDB head compaction (occurring every 2h) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 03835247..86650fa5 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { local _config = { - max_series_per_ingester: 1.5e6, + max_series_per_ingester: 3.0e6, max_samples_per_sec_per_ingester: 80e3, max_samples_per_sec_per_distributor: 240e3, limit_utilisation_target: 0.6, @@ -148,7 +148,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % _config, }, { - // Ingester should have 1.5M series in memory + // Ingester should have 3.0M series in memory record: 'cluster_namespace_deployment_reason:required_replicas:count', labels: { deployment: 'ingester', @@ -167,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // We should be about to cover 60% of our limits, - // and ingester can have 1.5M series in memory + // and ingester can have 3.0M series in memory record: 'cluster_namespace_deployment_reason:required_replicas:count', labels: { deployment: 'ingester', From 727addc452387560290b9050b76c89b8d1c4bd46 Mon Sep 17 00:00:00 2001 From: Narsing Metpally Date: Tue, 24 Sep 2024 12:39:10 -0600 Subject: [PATCH 2/4] Increase CortexProvisioningTooManyWrites alert threshold to 160k Signed-off-by: Narsing Metpally --- cortex-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 7145d028..9abe1ba2 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -409,7 +409,7 @@ alert: 'CortexProvisioningTooManyWrites', // 80k writes / s per ingester max. expr: ||| - avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 + avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 160e3 ||| % $._config.alert_aggregation_labels, 'for': '15m', labels: { From 9acc4874af959928ed107d9c4a8481f908274a6f Mon Sep 17 00:00:00 2001 From: Narsing Metpally Date: Wed, 25 Sep 2024 12:35:12 -0600 Subject: [PATCH 3/4] updating changelog Signed-off-by: Narsing Metpally --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6895ec1..74252e88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * [CHANGE] Remove chunks support for dashboards * [CHANGE] Target 3M memory series per ingester instead of 1.5M * [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 +* [CHANGE] Increase CortexProvisioningTooManyWrites alert threshold to 160e3 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard From f3b61810674a0cecdf9c7edc9314ba5b3e7d2af7 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Wed, 25 Sep 2024 11:42:12 -0700 Subject: [PATCH 4/4] fix comment --- cortex-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 9abe1ba2..ec44565c 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -407,7 +407,7 @@ }, { alert: 'CortexProvisioningTooManyWrites', - // 80k writes / s per ingester max. + // 160k writes / s per ingester max. expr: ||| avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 160e3 ||| % $._config.alert_aggregation_labels,