From 2f919df267254e54e0d75af09d87ebbc235f9fa1 Mon Sep 17 00:00:00 2001 From: Steven Kreitzer Date: Wed, 11 Dec 2024 15:02:04 -0600 Subject: [PATCH] feat(kps): move to alertmanager spec --- kubernetes/apps/monitoring/karma/ks.yaml | 2 +- .../app/alertmanagerconfig.yaml | 77 +++++++++++++++++++ .../app/externalsecret.yaml | 11 ++- .../app/helmrelease.yaml | 6 +- .../app/kustomization.yaml | 7 +- .../app/resources/alertmanager.yaml | 71 ----------------- .../monitoring/kube-prometheus-stack/ks.yaml | 3 +- kubernetes/apps/monitoring/kustomization.yaml | 1 + .../apps/monitoring/loki/app/helmrelease.yaml | 1 - kubernetes/apps/monitoring/loki/ks.yaml | 2 +- .../app/helmrelease.yaml | 22 ++++++ .../app/kustomization.yaml | 5 ++ .../prometheus-operator-crds/ks.yaml | 22 ++++++ kubernetes/apps/networking/nginx/ks.yaml | 4 +- 14 files changed, 143 insertions(+), 91 deletions(-) create mode 100644 kubernetes/apps/monitoring/kube-prometheus-stack/app/alertmanagerconfig.yaml delete mode 100644 kubernetes/apps/monitoring/kube-prometheus-stack/app/resources/alertmanager.yaml create mode 100644 kubernetes/apps/monitoring/prometheus-operator-crds/app/helmrelease.yaml create mode 100644 kubernetes/apps/monitoring/prometheus-operator-crds/app/kustomization.yaml create mode 100644 kubernetes/apps/monitoring/prometheus-operator-crds/ks.yaml diff --git a/kubernetes/apps/monitoring/karma/ks.yaml b/kubernetes/apps/monitoring/karma/ks.yaml index 6add3b84df..b30e1afed5 100644 --- a/kubernetes/apps/monitoring/karma/ks.yaml +++ b/kubernetes/apps/monitoring/karma/ks.yaml @@ -19,4 +19,4 @@ spec: wait: true interval: 30m retryInterval: 1m - timeout: 15m + timeout: 5m diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/alertmanagerconfig.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/alertmanagerconfig.yaml new file mode 100644 index 0000000000..b06279be8f --- /dev/null +++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/alertmanagerconfig.yaml @@ -0,0 +1,77 @@ +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: alertmanager +spec: + route: + groupBy: ["alertname", "job"] + groupInterval: 10m + groupWait: 1m + receiver: pushover + repeatInterval: 12h + routes: + - receiver: "null" + matchers: [{name: alertname, value: InfoInhibitor, matchType: =}] + - receiver: heartbeat + groupInterval: 15s + groupWait: 0s + repeatInterval: 5m + matchers: [{name: alertname, value: Watchdog, matchType: =}] + - receiver: pushover + matchers: [{name: severity, value: critical, matchType: =}] + + inhibitRules: + - equal: ["alertname", "namespace"] + sourceMatch: + - name: severity + value: critical + matchType: = + - name: severity + value: warning + matchType: =~ + + receivers: + - name: "null" + - name: heartbeat + webhookConfigs: + - urlSecret: + name: &secret alertmanager-secret + key: ALERTMANAGER_HEARTBEAT_URL + - name: pushover + pushoverConfigs: + - html: true + message: |- + {{- range .Alerts }} + {{- if ne .Annotations.description "" }} + {{ .Annotations.description }} + {{- else if ne .Annotations.summary "" }} + {{ .Annotations.summary }} + {{- else if ne .Annotations.message "" }} + {{ .Annotations.message }} + {{- else }} + Alert description not available + {{- end }} + {{- if gt (len .Labels.SortedPairs) 0 }} + + {{- range .Labels.SortedPairs }} + {{ .Name }}: {{ .Value }} + {{- end }} + + {{- end }} + {{- end }} + priority: |- + {{ if eq .Status "firing" }}1{{ else }}0{{ end }} + sendResolved: true + sound: gamelan + title: >- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] + {{ .CommonLabels.alertname }} + ttl: 3600s + token: + name: *secret + key: ALERTMANAGER_PUSHOVER_APP_TOKEN + userKey: + name: *secret + key: ALERTMANAGER_PUSHOVER_USER_KEY + urlTitle: View in Alertmanager diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/externalsecret.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/externalsecret.yaml index 5b44b3c860..c431723ba0 100644 --- a/kubernetes/apps/monitoring/kube-prometheus-stack/app/externalsecret.yaml +++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/externalsecret.yaml @@ -4,7 +4,6 @@ kind: ExternalSecret metadata: name: alertmanager spec: - refreshInterval: 5m secretStoreRef: kind: ClusterSecretStore name: onepassword-connect @@ -12,11 +11,11 @@ spec: name: alertmanager-secret creationPolicy: Owner template: - templateFrom: - - configMap: - name: alertmanager-config-tpl - items: - - key: alertmanager.yaml + engineVersion: v2 + data: + ALERTMANAGER_HEARTBEAT_URL: "{{ .ALERTMANAGER_HEARTBEAT_URL }}" + ALERTMANAGER_PUSHOVER_APP_TOKEN: "{{ .ALERTMANAGER_PUSHOVER_APP_TOKEN }}" + ALERTMANAGER_PUSHOVER_USER_KEY: "{{ .ALERTMANAGER_PUSHOVER_USER_KEY }}" dataFrom: - extract: key: alertmanager diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml index 7550251b65..c21a66c0e4 100644 --- a/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml +++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/helmrelease.yaml @@ -32,8 +32,10 @@ spec: ingressClassName: internal hosts: ["am.ktwo.io"] alertmanagerSpec: - useExistingSecret: true - configSecret: alertmanager-secret + alertmanagerConfiguration: + name: alertmanager + global: + resolveTimeout: 5m externalUrl: https://am.ktwo.io storage: volumeClaimTemplate: diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml index 004b793183..bf698149df 100644 --- a/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml +++ b/kubernetes/apps/monitoring/kube-prometheus-stack/app/kustomization.yaml @@ -2,11 +2,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - ./alertmanagerconfig.yaml - ./externalsecret.yaml - ./helmrelease.yaml -configMapGenerator: - - name: alertmanager-config-tpl - files: - - ./resources/alertmanager.yaml -generatorOptions: - disableNameSuffixHash: true diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/app/resources/alertmanager.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/app/resources/alertmanager.yaml deleted file mode 100644 index fd5990ce5a..0000000000 --- a/kubernetes/apps/monitoring/kube-prometheus-stack/app/resources/alertmanager.yaml +++ /dev/null @@ -1,71 +0,0 @@ ---- -global: - resolve_timeout: 5m - -route: - group_by: ["alertname", "job"] - group_interval: 10m - group_wait: 1m - receiver: pushover - repeat_interval: 12h - routes: - - receiver: heartbeat - group_interval: 15s - group_wait: 0s - repeat_interval: 5m - matchers: - - alertname =~ "Watchdog" - - receiver: "null" - matchers: - - alertname =~ "InfoInhibitor" - - receiver: pushover - continue: true - matchers: - - severity = "critical" - -inhibit_rules: - - equal: ["alertname", "namespace"] - source_matchers: - - severity = "critical" - target_matchers: - - severity = "warning" - -receivers: - - name: heartbeat - webhook_configs: - - send_resolved: true - url: "{{ .ALERTMANAGER_HEARTBEAT_URL }}" - - name: "null" - - name: pushover - pushover_configs: - - html: true - message: |- - {{ "{{-" }} range .Alerts {{ "}}" }} - {{ "{{-" }} if ne .Annotations.description "" {{ "}}" }} - {{ "{{" }} .Annotations.description {{ "}}" }} - {{ "{{-" }} else if ne .Annotations.summary "" {{ "}}" }} - {{ "{{" }} .Annotations.summary {{ "}}" }} - {{ "{{-" }} else if ne .Annotations.message "" {{ "}}" }} - {{ "{{" }} .Annotations.message {{ "}}" }} - {{ "{{-" }} else {{ "}}" }} - Alert description not available - {{ "{{-" }} end {{ "}}" }} - {{ "{{-" }} if gt (len .Labels.SortedPairs) 0 {{ "}}" }} - - {{ "{{-" }} range .Labels.SortedPairs {{ "}}" }} - {{ "{{" }} .Name {{ "}}" }}: {{ "{{" }} .Value {{ "}}" }} - {{ "{{-" }} end {{ "}}" }} - - {{ "{{-" }} end {{ "}}" }} - {{ "{{-" }} end {{ "}}" }} - priority: |- - {{ "{{" }} if eq .Status "firing" {{ "}}" }}1{{ "{{" }} else {{ "}}" }}0{{ "{{" }} end {{ "}}" }} - send_resolved: true - sound: gamelan - title: >- - [{{ "{{" }} .Status | toUpper {{ "}}" }}{{ "{{" }} if eq .Status "firing" {{ "}}" }}:{{ "{{" }} .Alerts.Firing | len {{ "}}" }}{{ "{{" }} end {{ "}}" }}] - {{ "{{" }} .CommonLabels.alertname {{ "}}" }} - token: "{{ .ALERTMANAGER_PUSHOVER_APP_TOKEN }}" - # ttl: "{{ .ALERTMANAGER_PUSHOVER_TTL }}" - url_title: View in Alertmanager - user_key: "{{ .ALERTMANAGER_PUSHOVER_USER_KEY }}" diff --git a/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml b/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml index bcaae4b300..bdb72acf8b 100644 --- a/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml +++ b/kubernetes/apps/monitoring/kube-prometheus-stack/ks.yaml @@ -11,6 +11,7 @@ spec: app.kubernetes.io/name: *app dependsOn: - name: external-secrets-stores + - name: prometheus-operator-crds - name: rook-ceph-cluster path: ./kubernetes/apps/monitoring/kube-prometheus-stack/app prune: true @@ -42,4 +43,4 @@ spec: wait: true interval: 30m retryInterval: 1m - timeout: 15m + timeout: 5m diff --git a/kubernetes/apps/monitoring/kustomization.yaml b/kubernetes/apps/monitoring/kustomization.yaml index 2cf5c5cb08..d6a0686e26 100644 --- a/kubernetes/apps/monitoring/kustomization.yaml +++ b/kubernetes/apps/monitoring/kustomization.yaml @@ -9,6 +9,7 @@ resources: - ./kromgo/ks.yaml - ./kube-prometheus-stack/ks.yaml - ./loki/ks.yaml + - ./prometheus-operator-crds/ks.yaml - ./promtail/ks.yaml - ./unpoller/ks.yaml - ./exporters diff --git a/kubernetes/apps/monitoring/loki/app/helmrelease.yaml b/kubernetes/apps/monitoring/loki/app/helmrelease.yaml index b837f91091..52534b389e 100644 --- a/kubernetes/apps/monitoring/loki/app/helmrelease.yaml +++ b/kubernetes/apps/monitoring/loki/app/helmrelease.yaml @@ -5,7 +5,6 @@ metadata: name: loki spec: interval: 30m - timeout: 15m chart: spec: chart: loki diff --git a/kubernetes/apps/monitoring/loki/ks.yaml b/kubernetes/apps/monitoring/loki/ks.yaml index f507ea0569..8011172d8a 100644 --- a/kubernetes/apps/monitoring/loki/ks.yaml +++ b/kubernetes/apps/monitoring/loki/ks.yaml @@ -19,4 +19,4 @@ spec: wait: true interval: 30m retryInterval: 1m - timeout: 15m + timeout: 5m diff --git a/kubernetes/apps/monitoring/prometheus-operator-crds/app/helmrelease.yaml b/kubernetes/apps/monitoring/prometheus-operator-crds/app/helmrelease.yaml new file mode 100644 index 0000000000..991c72a42f --- /dev/null +++ b/kubernetes/apps/monitoring/prometheus-operator-crds/app/helmrelease.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: prometheus-operator-crds +spec: + interval: 30m + chart: + spec: + chart: prometheus-operator-crds + version: 16.0.1 + sourceRef: + kind: HelmRepository + name: prometheus-community + namespace: flux-system + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 diff --git a/kubernetes/apps/monitoring/prometheus-operator-crds/app/kustomization.yaml b/kubernetes/apps/monitoring/prometheus-operator-crds/app/kustomization.yaml new file mode 100644 index 0000000000..5dd7baca73 --- /dev/null +++ b/kubernetes/apps/monitoring/prometheus-operator-crds/app/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./helmrelease.yaml diff --git a/kubernetes/apps/monitoring/prometheus-operator-crds/ks.yaml b/kubernetes/apps/monitoring/prometheus-operator-crds/ks.yaml new file mode 100644 index 0000000000..623fe5f0a2 --- /dev/null +++ b/kubernetes/apps/monitoring/prometheus-operator-crds/ks.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app prometheus-operator-crds + namespace: flux-system +spec: + targetNamespace: monitoring + commonMetadata: + labels: + app.kubernetes.io/name: *app + dependsOn: + - name: rook-ceph-cluster + path: ./kubernetes/apps/monitoring/prometheus-operator-crds/app + prune: false # never should be deleted + sourceRef: + kind: GitRepository + name: k8s-gitops + wait: true + interval: 30m + retryInterval: 1m + timeout: 5m diff --git a/kubernetes/apps/networking/nginx/ks.yaml b/kubernetes/apps/networking/nginx/ks.yaml index 7bba661eee..6c19b583a0 100644 --- a/kubernetes/apps/networking/nginx/ks.yaml +++ b/kubernetes/apps/networking/nginx/ks.yaml @@ -41,7 +41,7 @@ spec: wait: true interval: 30m retryInterval: 1m - timeout: 5m + timeout: 15m --- apiVersion: kustomize.toolkit.fluxcd.io/v1 kind: Kustomization @@ -63,4 +63,4 @@ spec: wait: true interval: 30m retryInterval: 1m - timeout: 5m + timeout: 15m