Skip to content

Commit

Permalink
Updated alerts, slo and dashboard template to use certain labels in m…
Browse files Browse the repository at this point in the history
…etrics if istio's operational mode is sidecar (#36)
  • Loading branch information
shehbaz-pathan authored Nov 14, 2024
1 parent d45b673 commit 2524378
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 11 deletions.
5 changes: 5 additions & 0 deletions tools/nopo11y-operator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
LOGGING_LEVEL = str(os.getenv("LOG_LEVEL", "INFO"))
API_GATEWAY = str(os.getenv("API_GATEWAY", "istio"))
GRAFANA_URL = str(os.getenv("GRAFANA_EXTERNAL_URL", ""))
ISTIO_MODE = str(os.getenv("ISTIO_OPERATION_MODE", ""))
DEFAULT_CONFIG = {
"slo": {
"availability": float(os.getenv("AVAILABILITY_SLO", "99")),
Expand Down Expand Up @@ -184,6 +185,7 @@ def generate_dashboard_alerts(spec, namespace, old, new, **kwargs):
namespace=O11Y_NAMEPSACE,
apiGateway=API_GATEWAY,
grafanaUrl=GRAFANA_URL,
istioMode=ISTIO_MODE,
service=service_name,
serviceNamespace=service_namespace,
cluster=cluster_name,
Expand All @@ -200,6 +202,7 @@ def generate_dashboard_alerts(spec, namespace, old, new, **kwargs):
namespace=O11Y_NAMEPSACE,
apiGateway=API_GATEWAY,
grafanaUrl=GRAFANA_URL,
istioMode=ISTIO_MODE,
service=service_name,
serviceNamespace=service_namespace,
cluster=cluster_name,
Expand All @@ -215,6 +218,7 @@ def generate_dashboard_alerts(spec, namespace, old, new, **kwargs):
namespace=O11Y_NAMEPSACE,
apiGateway=API_GATEWAY,
grafanaUrl=GRAFANA_URL,
istioMode=ISTIO_MODE,
service=service_name,
serviceNamespace=service_namespace,
cluster=cluster_name,
Expand All @@ -229,6 +233,7 @@ def generate_dashboard_alerts(spec, namespace, old, new, **kwargs):
dashboard_manifest = dashboard_template.render(
namespace=O11Y_NAMEPSACE,
apiGateway=API_GATEWAY,
istioMode=ISTIO_MODE,
service=service_name,
cluster=cluster_name,
deployment=deployment_name,
Expand Down
4 changes: 2 additions & 2 deletions tools/nopo11y-operator/templates/nopo11y-op-alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
rules:
- alert: {{ service }}High5xxErrorRate
{%- if apiGateway == "istio" %}
expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate5xx }}
expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate5xx }}
{%- elif apiGateway == "nginx" %}
expr: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}status=~"5..", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[5m])) * 100 > {{ rate5xx }}
{%- endif %}
Expand All @@ -25,7 +25,7 @@ spec:
severity: critical
- alert: {{ service }}High4xxErrorRate
{%- if apiGateway == "istio" %}
expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate4xx }}
expr: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"4.."}[5m])) by (instance) / sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[5m])) by (instance) * 100 > {{ rate4xx }}
{%- elif apiGateway == "nginx" %}
expr: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}status=~"4..", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[5m])) / sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[5m])) * 100 > {{ rate4xx }}
{%- endif %}
Expand Down
10 changes: 5 additions & 5 deletions tools/nopo11y-operator/templates/nopo11y-op-dashboard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,7 @@ data:
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100",
"expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"5..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100",
"legendFormat": "5xx Error Rate",
"range": true,
"refId": "A"
Expand All @@ -657,7 +657,7 @@ data:
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100",
"expr": "sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\", response_code=~\"4..\"}[$__rate_interval])) / sum(rate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) * 100",
"hide": false,
"instant": false,
"legendFormat": "4xx Error Rate",
Expand Down Expand Up @@ -875,7 +875,7 @@ data:
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(irate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))",
"expr": "sum(irate(istio_requests_total{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))",
"legendFormat": "Requests/sec",
"range": true,
"refId": "A"
Expand All @@ -886,7 +886,7 @@ data:
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(istio_request_duration_milliseconds_sum{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))",
"expr": "sum(rate(istio_request_duration_milliseconds_sum{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_request_duration_milliseconds_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "Response Time",
Expand Down Expand Up @@ -985,7 +985,7 @@ data:
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(istio_response_bytes_sum{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}source_workload!~\"unknown\", reporter=\"source\", destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))",
"expr": "sum(rate(istio_response_bytes_sum{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval])) / sum(rate(istio_response_bytes_count{ {%- if cluster %}cluster=\"{{ cluster }}\", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~\"unknown\", reporter=\"source\", {% endif %}destination_service_name=\"{{ service }}\", destination_service_namespace=\"{{ deploymentNamespace }}\"}[$__rate_interval]))",
"legendFormat": "Response Size",
"range": true,
"refId": "A"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ spec:
sli:
events:
{%- if apiGateway == "istio" %}
errorQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[{%- raw %}{{.window}}{%- endraw %}]))
totalQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}]))
errorQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", response_code=~"5.."}[{%- raw %}{{.window}}{%- endraw %}]))
totalQuery: sum(rate(istio_requests_total{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}]))
{%- elif apiGateway == "nginx" %}
errorQuery: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}status=~"5..", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}]))
totalQuery: sum(rate(nginx_ingress_controller_requests{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}]))
Expand Down
4 changes: 2 additions & 2 deletions tools/nopo11y-operator/templates/nopo11y-op-slo-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ spec:
sli:
events:
{%- if apiGateway == "istio" %}
errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}])) - sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="{{ latencyThreshold }}"}[{%- raw %}{{.window}}{%- endraw %}])))
totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}source_workload!~"unknown", reporter="source", destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}]))
errorQuery: (sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}])) - sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="{{ latencyThreshold }}"}[{%- raw %}{{.window}}{%- endraw %}])))
totalQuery: sum(rate(istio_request_duration_milliseconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}{% if istioMode == "sidecar" %}source_workload!~"unknown", reporter="source", {% endif %}destination_service_name="{{ service }}", destination_service_namespace="{{ serviceNamespace }}", le="+Inf"}[{%- raw %}{{.window}}{%- endraw %}]))
{%- elif apiGateway == "nginx" %}
errorQuery: (sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}le="+Inf", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}])) - sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}le="{{ latencyThreshold / 1000 }}", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}])))
totalQuery: sum(rate(nginx_ingress_controller_request_duration_seconds_bucket{ {%- if cluster %}cluster="{{ cluster }}", {%- endif %}le="+Inf", exported_service="{{ service }}", exported_namespace="{{ serviceNamespace }}"}[{%- raw %}{{.window}}{%- endraw %}]))
Expand Down

0 comments on commit 2524378

Please sign in to comment.