Skip to content

Commit

Permalink
Merge branch 'main' into configure-head-chunks-write-queue-size
Browse files Browse the repository at this point in the history
  • Loading branch information
CharlieTLe authored Jul 21, 2024
2 parents f0ff5c2 + 5038e0c commit a5f5156
Show file tree
Hide file tree
Showing 24 changed files with 193 additions and 697 deletions.
56 changes: 56 additions & 0 deletions .github/workflows/build-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Workflow: build the cortex-jsonnet build image and, on main, push it to Quay.io.
# Triggered only when the Dockerfile or this workflow file changes.
name: Build Image

# NOTE: `on` is a YAML 1.1 boolean key; GitHub's loader handles it, but generic
# yamllint needs the `truthy` rule suppressed for this line.
on:
  push:
    branches: [ main ]
    paths:
      - 'build-image/Dockerfile'
      - '.github/workflows/build-image.yaml'
  pull_request:
    branches: [ main ]
    paths:
      - 'build-image/Dockerfile'
      - '.github/workflows/build-image.yaml'

jobs:
  # Build the image on every trigger and hand it to the push job as an artifact.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        name: Checkout

      - name: Build & save image
        run: make build-image save-build-image

      - name: Upload Docker Images Artifact
        uses: actions/upload-artifact@v4
        with:
          name: build-image
          path: ./build-image.tar
          # Fail loudly if `make save-build-image` did not produce the tarball.
          if-no-files-found: error

  # Publish only for pushes to main on the canonical repo (never for PRs or forks,
  # where the registry secrets are unavailable anyway).
  push:
    if: github.ref == 'refs/heads/main' && github.repository == 'cortexproject/cortex-jsonnet'
    needs: build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        name: Checkout

      - name: Download Docker Images Artifacts
        uses: actions/download-artifact@v4
        with:
          name: build-image

      - name: Load image
        run: make load-build-image

      - name: Login to Quay.io
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.QUAY_REGISTRY_USER }}
          password: ${{ secrets.QUAY_REGISTRY_PASSWORD }}

      - name: Push image
        run: make publish-build-image
38 changes: 26 additions & 12 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,57 @@ name: CI
on:
push:
branches: [ main ]
paths-ignore:
- 'build-image/Dockerfile'
- '.github/workflows/build-image.yaml'
pull_request:
branches: [ main ]
paths-ignore:
- 'build-image/Dockerfile'
- '.github/workflows/build-image.yaml'

jobs:
lint:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0

- name: "Lint mixin"
run: make lint-mixin

- name: "Lint playbooks"
run: make lint-playbooks

build:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0

- name: "Build mixin"
run: make build-mixin

readme:
runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda
container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
name: Checkout
with:
fetch-depth: 0

- name: "Test readme"
run: make test-readme

- name: "Test readme s3"
run: make test-readme/s3

- name: "Test readme azure"
run: make test-readme/azure

- name: "Test readme gcs"
run: make test-readme/gcs
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ cortex-mixin.zip
cortex-mixin/out
cortex-mixin/vendor
/test-readme/
.vscode
build-image.tar
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
# Changelog

## master / unreleased
* [CHANGE] Use cortex v1.17.1
* [CHANGE] Enable shuffle sharding in compactors
* [CHANGE] Remove chunks support for dashboards
* [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block`
* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard

## 1.16.1
* [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2
* [CHANGE] Use cortex v1.16.0
* [CHANGE] Use cortex v1.16.1
* [ENHANCEMENT] Enable frontend query stats by default
* [ENHANCEMENT] Enable ruler query stats by default
* [ENHANCEMENT] Configure `-blocks-storage.bucket-store.ignore-blocks-within` in queriers, rulers and store-gateways

## 1.15.3 / 2023-11-24
* [CHANGE] Add default instance max series for ingesters
Expand Down
23 changes: 14 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: lint build-image publish-build-image test-readme
.PHONY: lint build-image publish-build-image test-readme clean

JSONNET_FMT := jsonnetfmt

Expand Down Expand Up @@ -34,6 +34,12 @@ fmt:
build-image:
docker build -t quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-image

save-build-image:
docker save quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) > build-image.tar

load-build-image:
docker load < build-image.tar

publish-build-image:
docker push quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD)

Expand All @@ -47,18 +53,17 @@ build-mixin:
test-readme: test-readme/azure test-readme/gcs test-readme/s3

test-readme/%:
rm -rf $@ && \
mkdir -p $@ && cd $@ && \
tk init --k8s=1.24 && \
jb install github.com/cortexproject/cortex-jsonnet/cortex@main && \
rm -fr ./vendor/cortex && \
cp -r ../../cortex ./vendor/ && \
cp vendor/cortex/$(notdir $@)/main.jsonnet.example environments/default/main.jsonnet && \
PAGER=cat tk show environments/default
@./scripts/test-readme.sh $@

clean-white-noise:
@$(FIND) . -type f -regextype posix-extended -regex '.*(md|libsonnet)' -print | \
SED_BIN="$(SED)" xargs ./scripts/cleanup-white-noise.sh

check-white-noise: clean-white-noise
@git diff --exit-code --quiet || (echo "Please remove trailing whitespaces running 'make clean-white-noise'" && false)

clean:
rm -rf cortex-mixin/out
rm -rf cortex-mixin/vendor
rm -f cortex-mixin/cortex-mixin.zip
rm -rf test-readme
161 changes: 0 additions & 161 deletions cortex-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,6 @@
|||,
},
},
{
// We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail
// and we will never trigger the alert.
// We also have a 3h grace-period for creation of tables, which means we can fail for 3h before it's an outage.
alert: 'CortexTableSyncFailure',
expr: |||
100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m])
/
rate(cortex_table_manager_sync_duration_seconds_count[15m])
> 10
|||,
'for': '30m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables.
|||,
},
},
{
alert: 'CortexQueriesIncorrect',
expr: |||
Expand Down Expand Up @@ -206,41 +185,6 @@
|||,
},
},
{
alert: 'CortexTransferFailed',
expr: |||
max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m])
|||,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} transfer failed.
|||,
},
},
{
alert: 'CortexOldChunkInMemory',
// Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer
// to 10 hours.
// Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors).
expr: |||
(time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000)
and
(cortex_oldest_unflushed_chunk_timestamp_seconds > 0)
|||,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory.
|||,
},
},
{
alert: 'CortexKVStoreFailure',
expr: |||
Expand Down Expand Up @@ -379,87 +323,6 @@
},
],
},
{
name: 'cortex_wal_alerts',
rules: [
{
// Alert immediately if WAL is corrupt.
alert: 'CortexWALCorruption',
expr: |||
increase(cortex_ingester_wal_corruptions_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint.
|||,
},
},
{
// One or more failed checkpoint creation is a warning.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint.
|||,
},
},
{
// Two or more failed checkpoint creation in 1h means something is wrong.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint.
|||,
},
},
{
// One or more failed checkpoint deletion is a warning.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint.
|||,
},
},
{
// Two or more failed checkpoint deletion in 2h means something is wrong.
// We give this more buffer than creation as this is a less critical operation.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.instance }} is failing to delete checkpoint.
|||,
},
},
],
},
{
name: 'cortex-rollout-alerts',
rules: [
Expand Down Expand Up @@ -524,30 +387,6 @@
{
name: 'cortex-provisioning',
rules: [
{
alert: 'CortexProvisioningMemcachedTooSmall',
// 4 x in-memory series size = 24hrs of data.
expr: |||
(
4 *
sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count)
/ 1e9
)
>
(
sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9
)
||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels],
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB.
||| % $._config,
},
},
{
alert: 'CortexProvisioningTooManyActiveSeries',
// We target each ingester to 1.5M in-memory series. This alert fires if the average
Expand Down
Loading

0 comments on commit a5f5156

Please sign in to comment.