Skip to content

Commit

Permalink
Merge branch 'main' into configure-head-chunks-write-queue-size
Browse files Browse the repository at this point in the history
  • Loading branch information
CharlieTLe authored Jul 21, 2024
2 parents f0ff5c2 + 5038e0c commit a5f5156
Show file tree
Hide file tree
Showing 24 changed files with 193 additions and 697 deletions.
56 changes: 56 additions & 0 deletions .github/workflows/build-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Workflow: build the cortex-jsonnet build image and, on main, push it to Quay.io.
# Triggered only when the Dockerfile or this workflow file changes.
name: Build Image

# NOTE: `on` is a YAML 1.1 boolean key; GitHub's loader handles it, but generic
# yamllint needs the `truthy` rule suppressed for this line.
on:
  push:
    branches: [ main ]
    paths:
      - 'build-image/Dockerfile'
      - '.github/workflows/build-image.yaml'
  pull_request:
    branches: [ main ]
    paths:
      - 'build-image/Dockerfile'
      - '.github/workflows/build-image.yaml'

jobs:
  # Build the image on every trigger and hand it to the push job as an artifact.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        name: Checkout

      - name: Build & save image
        run: make build-image save-build-image

      - name: Upload Docker Images Artifact
        uses: actions/upload-artifact@v4
        with:
          name: build-image
          path: ./build-image.tar
          # Fail loudly if `make save-build-image` did not produce the tarball.
          if-no-files-found: error

  # Publish only for pushes to main on the canonical repo (never for PRs or forks,
  # where the registry secrets are unavailable anyway).
  push:
    if: github.ref == 'refs/heads/main' && github.repository == 'cortexproject/cortex-jsonnet'
    needs: build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        name: Checkout

      - name: Download Docker Images Artifacts
        uses: actions/download-artifact@v4
        with:
          name: build-image

      - name: Load image
        run: make load-build-image

      - name: Login to Quay.io
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.QUAY_REGISTRY_USER }}
          password: ${{ secrets.QUAY_REGISTRY_PASSWORD }}

      - name: Push image
        run: make publish-build-image
38 changes: 26 additions & 12 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,57 @@ name: CI
on:
push:
branches: [ main ]
paths-ignore:
- 'build-image/Dockerfile'
- '.github/workflows/build-image.yaml'
pull_request:
branches: [ main ]
paths-ignore:
- 'build-image/Dockerfile'
- '.github/workflows/build-image.yaml'

jobs:
lint:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0

- name: "Lint mixin"
run: make lint-mixin

- name: "Lint playbooks"
run: make lint-playbooks

build:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0

- name: "Build mixin"
run: make build-mixin

readme:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0

- name: "Test readme"
run: make test-readme

- name: "Test readme s3"
run: make test-readme/s3

- name: "Test readme azure"
run: make test-readme/azure

- name: "Test readme gcs"
run: make test-readme/gcs
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ cortex-mixin.zip
cortex-mixin/out
cortex-mixin/vendor
/test-readme/
.vscode
build-image.tar
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
# Changelog

## master / unreleased
* [CHANGE] Use cortex v1.17.1
* [CHANGE] Enable shuffle sharding in compactors
* [CHANGE] Remove chunks support for dashboards
* [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block`
* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard

## 1.16.1
* [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2
* [CHANGE] Use cortex v1.16.0
* [CHANGE] Use cortex v1.16.1
* [ENHANCEMENT] Enable frontend query stats by default
* [ENHANCEMENT] Enable ruler query stats by default
* [ENHANCEMENT] Configure `-blocks-storage.bucket-store.ignore-blocks-within` in queriers, rulers and store-gateways

## 1.15.3 / 2023-11-24
* [CHANGE] Add default instance max series for ingesters
Expand Down
23 changes: 14 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: lint build-image publish-build-image test-readme
.PHONY: lint build-image publish-build-image test-readme clean

JSONNET_FMT := jsonnetfmt

Expand Down Expand Up @@ -34,6 +34,12 @@ fmt:
build-image:
docker build -t quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-image

save-build-image:
docker save quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) > build-image.tar

load-build-image:
docker load < build-image.tar

publish-build-image:
docker push quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD)

Expand All @@ -47,18 +53,17 @@ build-mixin:
test-readme: test-readme/azure test-readme/gcs test-readme/s3

test-readme/%:
rm -rf $@ && \
mkdir -p $@ && cd $@ && \
tk init --k8s=1.24 && \
jb install github.com/cortexproject/cortex-jsonnet/cortex@main && \
rm -fr ./vendor/cortex && \
cp -r ../../cortex ./vendor/ && \
cp vendor/cortex/$(notdir $@)/main.jsonnet.example environments/default/main.jsonnet && \
PAGER=cat tk show environments/default
@./scripts/test-readme.sh $@

clean-white-noise:
@$(FIND) . -type f -regextype posix-extended -regex '.*(md|libsonnet)' -print | \
SED_BIN="$(SED)" xargs ./scripts/cleanup-white-noise.sh

check-white-noise: clean-white-noise
@git diff --exit-code --quiet || (echo "Please remove trailing whitespaces running 'make clean-white-noise'" && false)

clean:
rm -rf cortex-mixin/out
rm -rf cortex-mixin/vendor
rm -f cortex-mixin/cortex-mixin.zip
rm -rf test-readme
161 changes: 0 additions & 161 deletions cortex-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,6 @@
|||,
},
},
{
// We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail
// and we will never trigger the alert.
// We also have a 3h grace-period for creation of tables, which means we can fail for 3h before it's an outage.
alert: 'CortexTableSyncFailure',
expr: |||
100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m])
/
rate(cortex_table_manager_sync_duration_seconds_count[15m])
> 10
|||,
'for': '30m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables.
|||,
},
},
{
alert: 'CortexQueriesIncorrect',
expr: |||
Expand Down Expand Up @@ -206,41 +185,6 @@
|||,
},
},
{
alert: 'CortexTransferFailed',
expr: |||
max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m])
|||,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} transfer failed.
|||,
},
},
{
alert: 'CortexOldChunkInMemory',
// Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer
// to 10 hours.
// Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors).
expr: |||
(time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000)
and
(cortex_oldest_unflushed_chunk_timestamp_seconds > 0)
|||,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory.
|||,
},
},
{
alert: 'CortexKVStoreFailure',
expr: |||
Expand Down Expand Up @@ -379,87 +323,6 @@
},
],
},
{
name: 'cortex_wal_alerts',
rules: [
{
// Alert immediately if WAL is corrupt.
alert: 'CortexWALCorruption',
expr: |||
increase(cortex_ingester_wal_corruptions_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint.
|||,
},
},
{
// One or more failed checkpoint creation is a warning.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint.
|||,
},
},
{
// Two or more failed checkpoint creation in 1h means something is wrong.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint.
|||,
},
},
{
// One or more failed checkpoint deletion is a warning.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint.
|||,
},
},
{
// Two or more failed checkpoint deletion in 2h means something is wrong.
// We give this more buffer than creation as this is a less critical operation.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.instance }} is failing to delete checkpoint.
|||,
},
},
],
},
{
name: 'cortex-rollout-alerts',
rules: [
Expand Down Expand Up @@ -524,30 +387,6 @@
{
name: 'cortex-provisioning',
rules: [
{
alert: 'CortexProvisioningMemcachedTooSmall',
// 4 x in-memory series size = 24hrs of data.
expr: |||
(
4 *
sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count)
/ 1e9
)
>
(
sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9
)
||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels],
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB.
||| % $._config,
},
},
{
alert: 'CortexProvisioningTooManyActiveSeries',
// We target each ingester to 1.5M in-memory series. This alert fires if the average
Expand Down
Loading

0 comments on commit a5f5156

Please sign in to comment.