From 252437882e669b3df254266992bbf84142cf79b9 Mon Sep 17 00:00:00 2001 From: shehbaz-pathan <46107507+shehbaz-pathan@users.noreply.github.com> Date: Thu, 14 Nov 2024 17:27:49 +0530 Subject: [PATCH] Updated alerts, slo and dashboard template to use certain labels in metrics if istio's operational mode is sidecar (#36) --- tools/nopo11y-operator/main.py | 5 +++++ .../nopo11y-operator/templates/nopo11y-op-alerts.yaml | 4 ++-- .../templates/nopo11y-op-dashboard.yaml | 10 +++++----- .../templates/nopo11y-op-slo-availability.yaml | 4 ++-- .../templates/nopo11y-op-slo-latency.yaml | 4 ++-- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/tools/nopo11y-operator/main.py b/tools/nopo11y-operator/main.py index 75acc36..8f4625c 100644 --- a/tools/nopo11y-operator/main.py +++ b/tools/nopo11y-operator/main.py @@ -21,6 +21,7 @@ LOGGING_LEVEL = str(os.getenv("LOG_LEVEL", "INFO")) API_GATEWAY = str(os.getenv("API_GATEWAY", "istio")) GRAFANA_URL = str(os.getenv("GRAFANA_EXTERNAL_URL", "")) +ISTIO_MODE = str(os.getenv("ISTIO_OPERATION_MODE", "")) DEFAULT_CONFIG = { "slo": { "availability": float(os.getenv("AVAILABILITY_SLO", "99")), @@ -184,6 +185,7 @@ def generate_dashboard_alerts(spec, namespace, old, new, **kwargs): namespace=O11Y_NAMEPSACE, apiGateway=API_GATEWAY, grafanaUrl=GRAFANA_URL, + istioMode=ISTIO_MODE, service=service_name, serviceNamespace=service_namespace, cluster=cluster_name, @@ -200,6 +202,7 @@ def generate_dashboard_alerts(spec, namespace, old, new, **kwargs): namespace=O11Y_NAMEPSACE, apiGateway=API_GATEWAY, grafanaUrl=GRAFANA_URL, + istioMode=ISTIO_MODE, service=service_name, serviceNamespace=service_namespace, cluster=cluster_name, @@ -215,6 +218,7 @@ def generate_dashboard_alerts(spec, namespace, old, new, **kwargs): namespace=O11Y_NAMEPSACE, apiGateway=API_GATEWAY, grafanaUrl=GRAFANA_URL, + istioMode=ISTIO_MODE, service=service_name, serviceNamespace=service_namespace, cluster=cluster_name, @@ -229,6 +233,7 @@ def 
generate_dashboard_alerts(spec, namespace, old, new, **kwargs): dashboard_manifest = dashboard_template.render( namespace=O11Y_NAMEPSACE, apiGateway=API_GATEWAY, + istioMode=ISTIO_MODE, service=service_name, cluster=cluster_name, deployment=deployment_name, diff --git a/tools/nopo11y-operator/templates/nopo11y-op-alerts.yaml b/tools/nopo11y-operator/templates/nopo11y-op-alerts.yaml index 64bec2d..02d8875 100644 --- a/tools/nopo11y-operator/templates/nopo11y-op-alerts.yaml +++ b/tools/nopo11y-operator/templates/nopo11y-op-alerts.yaml @@ -11,7 +11,7 @@ spec: rules: - alert: {{ service }}High5xxErrorRate {%- if apiGateway == "istio" %} - expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate5xx }} + expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate5xx }} {%- elif apiGateway == "nginx" %} expr: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}status=~"5..", 
exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[5m])) * 100 > {{ rate5xx }} {%- endif %} @@ -25,7 +25,7 @@ spec: severity: critical - alert: {{ service }}High4xxErrorRate {%- if apiGateway == "istio" %} - expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate4xx }} + expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate4xx }} {%- elif apiGateway == "nginx" %} expr: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}status=~"4..", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}exported_service="{{ service }}", 
exported_namespace="{{ serviceNamespace }}"}[5m])) * 100 > {{ rate4xx }} {%- endif %} diff --git a/tools/nopo11y-operator/templates/nopo11y-op-dashboard.yaml b/tools/nopo11y-operator/templates/nopo11y-op-dashboard.yaml index cfeba8f..983949a 100644 --- a/tools/nopo11y-operator/templates/nopo11y-op-dashboard.yaml +++ b/tools/nopo11y-operator/templates/nopo11y-op-dashboard.yaml @@ -646,7 +646,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100", "legendFormat": "5xx Error Rate", "range": true, "refId": "A" @@ -657,7 +657,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service 
}}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100", + "expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100", "hide": false, "instant": false, "legendFormat": "4xx Error Rate", @@ -875,7 +875,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(irate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))", + "expr": "sum(irate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))", "legendFormat": "Requests/sec", "range": true, "refId": "A" @@ -886,7 +886,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_request_duration_milliseconds_sum{ {%- if cluster %}cluster=\"{{ 
cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))", + "expr": "sum(rate(istio_request_duration_milliseconds_sum{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))", "hide": false, "instant": false, "legendFormat": "Response Time", @@ -985,7 +985,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(istio_response_bytes_sum{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))", + "expr": "sum(rate(istio_response_bytes_sum{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" 
%}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))", "legendFormat": "Response Size", "range": true, "refId": "A" diff --git a/tools/nopo11y-operator/templates/nopo11y-op-slo-availability.yaml b/tools/nopo11y-operator/templates/nopo11y-op-slo-availability.yaml index c139f33..ccfe5cc 100644 --- a/tools/nopo11y-operator/templates/nopo11y-op-slo-availability.yaml +++ b/tools/nopo11y-operator/templates/nopo11y-op-slo-availability.yaml @@ -43,8 +43,8 @@ spec: sli: events: {%- if apiGateway == "istio" %} - errorQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[{%- raw %}{{.window}}{%- endraw %}])) - totalQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}])) + errorQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[{%- raw %}{{.window}}{%- endraw %}])) + totalQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if 
istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}])) {%- elif apiGateway == "nginx" %} errorQuery: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}status=~"5..", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}])) totalQuery: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}])) diff --git a/tools/nopo11y-operator/templates/nopo11y-op-slo-latency.yaml b/tools/nopo11y-operator/templates/nopo11y-op-slo-latency.yaml index ba56c71..5acc450 100644 --- a/tools/nopo11y-operator/templates/nopo11y-op-slo-latency.yaml +++ b/tools/nopo11y-operator/templates/nopo11y-op-slo-latency.yaml @@ -43,8 +43,8 @@ spec: sli: events: {%- if apiGateway == "istio" %} - errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}])) - sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="{{ latencyThreshold }}"}[{%- raw %}{{.window}}{%- endraw %}]))) - totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", 
le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}])) + errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}])) - sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="{{ latencyThreshold }}"}[{%- raw %}{{.window}}{%- endraw %}]))) + totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}])) {%- elif apiGateway == "nginx" %} errorQuery: (sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}le="+Inf", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}])) - sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}le="{{ latencyThreshold / 1000 }}", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}]))) totalQuery: sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}le="+Inf", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}]))