Skip to content

Commit

Permalink
Merge branch 'main' into upgrade-to-cortex-v1.17.0
Browse files Browse the repository at this point in the history
  • Loading branch information
friedrichg authored Jun 8, 2024
2 parents 763518c + 1fcee6a commit 8731ac6
Show file tree
Hide file tree
Showing 18 changed files with 85 additions and 680 deletions.
14 changes: 9 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0
Expand All @@ -25,7 +25,7 @@ jobs:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0
Expand All @@ -36,10 +36,14 @@ jobs:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0

- name: "Test readme"
run: make test-readme
- name: "Test readme s3"
run: make test-readme/s3
- name: "Test readme azure"
run: make test-readme/azure
- name: "Test readme gcs"
run: make test-readme/gcs
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ cortex-mixin.zip
cortex-mixin/out
cortex-mixin/vendor
/test-readme/
.vscode
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

## master / unreleased
* [CHANGE] Use cortex v1.17.0
* [CHANGE] Enable shuffle sharding in compactors
* [CHANGE] Remove chunks support for dashboards
* [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block`
* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard

## 1.16.1
* [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2
Expand Down
17 changes: 8 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: lint build-image publish-build-image test-readme
.PHONY: lint build-image publish-build-image test-readme clean

JSONNET_FMT := jsonnetfmt

Expand Down Expand Up @@ -47,18 +47,17 @@ build-mixin:
test-readme: test-readme/azure test-readme/gcs test-readme/s3

test-readme/%:
rm -rf $@ && \
mkdir -p $@ && cd $@ && \
tk init --k8s=1.24 && \
jb install github.com/cortexproject/cortex-jsonnet/cortex@main && \
rm -fr ./vendor/cortex && \
cp -r ../../cortex ./vendor/ && \
cp vendor/cortex/$(notdir $@)/main.jsonnet.example environments/default/main.jsonnet && \
PAGER=cat tk show environments/default
@./scripts/test-readme.sh $@

clean-white-noise:
@$(FIND) . -type f -regextype posix-extended -regex '.*(md|libsonnet)' -print | \
SED_BIN="$(SED)" xargs ./scripts/cleanup-white-noise.sh

check-white-noise: clean-white-noise
@git diff --exit-code --quiet || (echo "Please remove trailing whitespaces running 'make clean-white-noise'" && false)

clean:
rm -rf cortex-mixin/out
rm -rf cortex-mixin/vendor
rm -f cortex-mixin/cortex-mixin.zip
rm -rf test-readme
161 changes: 0 additions & 161 deletions cortex-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,6 @@
|||,
},
},
{
// We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail
// and we will never trigger the alert.
// We also have a 3h grace-period for creation of tables, which means that we can fail for 3h before it's an outage.
alert: 'CortexTableSyncFailure',
expr: |||
100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m])
/
rate(cortex_table_manager_sync_duration_seconds_count[15m])
> 10
|||,
'for': '30m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables.
|||,
},
},
{
alert: 'CortexQueriesIncorrect',
expr: |||
Expand Down Expand Up @@ -206,41 +185,6 @@
|||,
},
},
{
alert: 'CortexTransferFailed',
expr: |||
max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m])
|||,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} transfer failed.
|||,
},
},
{
alert: 'CortexOldChunkInMemory',
// Even though we should flush chunks after 6h, we see that the 99th percentile of the age of flushed chunks
// is closer to 10 hours.
// Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors).
expr: |||
(time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000)
and
(cortex_oldest_unflushed_chunk_timestamp_seconds > 0)
|||,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory.
|||,
},
},
{
alert: 'CortexKVStoreFailure',
expr: |||
Expand Down Expand Up @@ -379,87 +323,6 @@
},
],
},
{
name: 'cortex_wal_alerts',
rules: [
{
// Alert immediately if WAL is corrupt.
alert: 'CortexWALCorruption',
expr: |||
increase(cortex_ingester_wal_corruptions_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint.
|||,
},
},
{
// One or more failed checkpoint creations is a warning.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint.
|||,
},
},
{
// Two or more failed checkpoint creations in 1h means something is wrong.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint.
|||,
},
},
{
// One or more failed checkpoint deletions is a warning.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint.
|||,
},
},
{
// Two or more failed checkpoint deletions in 2h means something is wrong.
// We give this more buffer than creation as this is a less critical operation.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.instance }} is failing to delete checkpoint.
|||,
},
},
],
},
{
name: 'cortex-rollout-alerts',
rules: [
Expand Down Expand Up @@ -524,30 +387,6 @@
{
name: 'cortex-provisioning',
rules: [
{
alert: 'CortexProvisioningMemcachedTooSmall',
// 4 x in-memory series size = 24hrs of data.
expr: |||
(
4 *
sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count)
/ 1e9
)
>
(
sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9
)
||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels],
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB.
||| % $._config,
},
},
{
alert: 'CortexProvisioningTooManyActiveSeries',
// We target each ingester to 1.5M in-memory series. This alert fires if the average
Expand Down
14 changes: 1 addition & 13 deletions cortex-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,7 @@
grafanaDashboardShards: 4,

_config+:: {
// Switch for overall storage engine.
// May contain 'chunks', 'blocks' or both.
// Enables chunks- or blocks- specific panels and dashboards.
storage_engine: ['blocks'],

// For chunks backend, switch for chunk index type.
// May contain 'bigtable', 'dynamodb' or 'cassandra'.
chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'],

// For chunks backend, switch for chunk store type.
// May contain 'bigtable', 'dynamodb', 'cassandra', 's3' or 'gcs'.
chunk_store_backend: ['bigtable', 'dynamodb', 'cassandra', 's3', 'gcs'],
storage_engine: ['blocks'], // TODO: Remove this option, it's not needed

// Tags for dashboards.
tags: ['cortex'],
Expand All @@ -32,7 +21,6 @@
ruler: '(ruler|cortex$)',
query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments.
query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments.
table_manager: '(table-manager|cortex$)',
ring_members: ['compactor', 'distributor', 'ingester.*', 'querier.*', 'ruler', 'store-gateway', 'cortex'],
store_gateway: '(store-gateway|cortex$)',
gateway: '(gateway|cortex-gw|cortex-gw-internal)',
Expand Down
19 changes: 3 additions & 16 deletions cortex-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,9 @@
(import 'dashboards/writes.libsonnet') +
(import 'dashboards/slow-queries.libsonnet') +
(import 'dashboards/rollout-progress.libsonnet') +

(if std.member($._config.storage_engine, 'blocks')
then
(import 'dashboards/compactor.libsonnet') +
(import 'dashboards/compactor-resources.libsonnet') +
(import 'dashboards/object-store.libsonnet')
else {}) +

(if std.member($._config.storage_engine, 'chunks')
then import 'dashboards/chunks.libsonnet'
else {}) +

(if std.member($._config.storage_engine, 'blocks')
&& std.member($._config.storage_engine, 'chunks')
then import 'dashboards/comparison.libsonnet'
else {}) +
(import 'dashboards/compactor.libsonnet') +
(import 'dashboards/compactor-resources.libsonnet') +
(import 'dashboards/object-store.libsonnet') +

(if !$._config.resources_dashboards_enabled then {} else
(import 'dashboards/reads-resources.libsonnet') +
Expand Down
Loading

0 comments on commit 8731ac6

Please sign in to comment.