From 3c6428cf163b21183e4153e9d1301e792aa9df6c Mon Sep 17 00:00:00 2001 From: Ashwani Singh Date: Thu, 8 Aug 2024 15:27:17 +0530 Subject: [PATCH] Add Alerts and Dashboards --- charts/pga/examples/thanos/pga-simple.yaml | 48 + .../pga/grafana/dashboards/kustomization.yaml | 7 + .../dashboards/opentelemetry-apm/apm.json | 1174 +++++++++++++++++ .../opentelemetry-apm/kustomization.yaml | 13 + charts/pga/grafana/datasources/pga/loki.yaml | 33 + .../grafana/datasources/pga/prometheus.yaml | 28 + charts/pga/grafana/datasources/pga/tempo.yaml | 34 + .../datasources/thanos/kustomization.yaml | 6 + .../grafana/datasources/thanos/thanos.yaml | 27 + charts/pga/values.yaml | 7 +- 10 files changed, 1376 insertions(+), 1 deletion(-) create mode 100644 charts/pga/examples/thanos/pga-simple.yaml create mode 100644 charts/pga/grafana/dashboards/kustomization.yaml create mode 100644 charts/pga/grafana/dashboards/opentelemetry-apm/apm.json create mode 100644 charts/pga/grafana/dashboards/opentelemetry-apm/kustomization.yaml create mode 100644 charts/pga/grafana/datasources/pga/loki.yaml create mode 100644 charts/pga/grafana/datasources/pga/prometheus.yaml create mode 100644 charts/pga/grafana/datasources/pga/tempo.yaml create mode 100644 charts/pga/grafana/datasources/thanos/kustomization.yaml create mode 100644 charts/pga/grafana/datasources/thanos/thanos.yaml diff --git a/charts/pga/examples/thanos/pga-simple.yaml b/charts/pga/examples/thanos/pga-simple.yaml new file mode 100644 index 00000000..7480fdb8 --- /dev/null +++ b/charts/pga/examples/thanos/pga-simple.yaml @@ -0,0 +1,48 @@ +app: + enabled: false + +kube: + enabled: true + grafana: + enabled: true + testFramework: + enabled: false + sidecar: + datasources: + defaultDatasourceEnabled: false + alertmanager: + alertmanagerSpec: + storage: + volumeClaimTemplate: + spec: + storageClassName: buildpiper-storage + prometheus: + enabled: true + prometheusSpec: + retention: 7d + resources: + requests: + cpu: 1 + memory: 1Gi + limits: 
+ cpu: 2 + memory: 2Gi + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: buildpiper-storage + resources: + requests: + storage: 15Gi + +pushgateway: + enabled: false + +blackbox: + enabled: false + +adapter: + enabled: false + +thanos: + enabled: false \ No newline at end of file diff --git a/charts/pga/grafana/dashboards/kustomization.yaml b/charts/pga/grafana/dashboards/kustomization.yaml new file mode 100644 index 00000000..b3780909 --- /dev/null +++ b/charts/pga/grafana/dashboards/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring +nameSuffix: -grafana-dashboard + +bases: + - opentelemetry-apm diff --git a/charts/pga/grafana/dashboards/opentelemetry-apm/apm.json b/charts/pga/grafana/dashboards/opentelemetry-apm/apm.json new file mode 100644 index 00000000..01ff4729 --- /dev/null +++ b/charts/pga/grafana/dashboards/opentelemetry-apm/apm.json @@ -0,0 +1,1174 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "A language-agnostic application performance management(APM) with OpenTelemetry, Grafana, and Prometheus.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 19419, + "graphTooltip": 0, + "id": 29, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + 
"justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}) by(service_name)", + "instant": false, + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}) by(service_name)", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "B" + } + ], + "title": "Total Request", + "transformations": [ + { + "id": "seriesToRows", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Time" + } + ] + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.1.0", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^2.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "instant": false, + "legendFormat": "Http Status 2XX", + "range": true, + "refId": "2XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^3.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 3XX", + "range": true, + "refId": "3XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^4.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 4XX", + "range": true, + "refId": "4XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^5.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 5XX", + "range": true, + "refId": "5XX" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^2.*\", http_route=~\"$route\"}) by(http_status_code)", + 
"format": "time_series", + "hide": false, + "instant": true, + "legendFormat": "Http Status 2XX", + "range": false, + "refId": "2XX-instant" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^3.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 3XX", + "range": true, + "refId": "3XX-instant" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^4.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 4XX", + "range": true, + "refId": "4XX-instant" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_status_code=~\"^5.*\", http_route=~\"$route\"}) by(http_status_code)", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "Http Status 5XX", + "range": true, + "refId": "5XX-instant" + } + ], + "title": "Requests Count", + "transformations": [ + { + "id": "seriesToRows", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Time" + } + ] + } + }, + { + "id": "partitionByValues", + "options": { + "fields": [ + "Metric" + ], + "keepFields": false + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "request amount distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}) by(span_name)", + "instant": false, + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Request Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "cumulative latency distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 7, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_sum{span_kind=\"SPAN_KIND_SERVER\", 
service_name=\"$app\", http_route=~\"$route\"}) by(span_name)", + "instant": true, + "legendFormat": "{{label_name}}", + "range": false, + "refId": "A" + } + ], + "title": "Loading Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "overall request rate per minute over last 3 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 4 + }, + "id": 8, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}[3m])*60)", + "hide": false, + "instant": false, + "range": true, + "refId": "B" + } + ], + "title": "Overall Request Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "percentage of HTTP status 5xx in all requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 4 + }, + "id": 9, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_status_code=~\"5.*|\", http_route=~\"$route\"})/sum(duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Overall Error Rate", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "request rate per minute over last 3 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(duration_milliseconds_count{service_name=\"$app\", span_kind=\"SPAN_KIND_SERVER\", http_route=~\"$route\"}[3m])*60) by(span_name)", + "hide": false, + "instant": false, + "legendFormat": "{{label_name}}", + "range": true, + "refId": "B" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "PR95 latency over last 3 minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(duration_milliseconds_bucket{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_route=~\"$route\"}[3m])) by (le, span_name))", + "instant": false, + 
"legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "PR95 Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "by route and http status code", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 6, + "options": { + "displayMode": "lcd", + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sort_desc(duration_milliseconds_sum{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_status_code!=\"\", http_route=~\"$route\"} / duration_milliseconds_count{span_kind=\"SPAN_KIND_SERVER\", service_name=\"$app\", http_status_code!=\"\", http_route=~\"$route\"})", + "instant": true, + "legendFormat": "[{{http_status_code}}] {{span_name}}", + "range": false, + "refId": "A" + } + ], + "title": "Average Latency", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Details of each API", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + 
"value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Request Rate" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "type": "sparkline" + } + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + }, + { + "id": "unit", + "value": "reqpm" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Error Rate" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "spanNulls": false, + "type": "sparkline" + } + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + }, + { + "id": "unit", + "value": "percentunit" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PR95" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "hideValue": false, + "type": "sparkline" + } + }, + { + "id": "color", + "value": { + "mode": "palette-classic" + } + }, + { + "id": "unit", + "value": "ms" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 10, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(duration_milliseconds_count{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\"}[3m])*60) by(service_name, http_method, http_route)", + "format": "time_series", + "hide": false, + "instant": false, + "range": true, + "refId": "Request Rate" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(duration_milliseconds_bucket{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\"}[3m])) by (le, service_name, http_method, http_route))", 
+ "hide": false, + "instant": false, + "range": true, + "refId": "PR95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(duration_milliseconds_count{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\", http_status_code!~\"2.*|3.*\"}) by(service_name, http_method, http_route) / sum(duration_milliseconds_count{service_name=\"$app\", http_route=~\"$route\", span_kind=\"SPAN_KIND_SERVER\"}) by(service_name, http_method, http_route)", + "format": "time_series", + "hide": false, + "instant": false, + "range": true, + "refId": "Error Rate" + } + ], + "title": "Details", + "transformations": [ + { + "id": "timeSeriesTable", + "options": {} + }, + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "includeByName": {}, + "indexByName": { + "Time": 0, + "Trend #PR95 Trend": 7, + "Trend #Request Rate Trend": 5, + "Value #PR95": 8, + "Value #Request Rate": 6, + "http_method": 3, + "http_route": 2, + "http_status_code": 4, + "service_name": 1 + }, + "renameByName": { + "Trend": "Request Rate Trend", + "Trend #Error Rate": "Error Rate", + "Trend #Error Rate Trend": "Error Rate Trend", + "Trend #PR95": "PR95", + "Trend #PR95 Trend": "PR95 Latency Trend", + "Trend #Request Rate": "Request Rate", + "Trend #Request Rate Trend": "Request Rate Trend", + "Value": "Request Rate", + "Value #A": "Error Rate", + "Value #Error Rate": "Error Rate", + "Value #PR95": "PR95 Latency", + "Value #Request Rate": "Request Rate", + "http_method": "Method", + "http_route": "Route", + "http_status_code": "Status Code", + "service_name": "Application" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "traya-api-server-dev-dev", + "value": "traya-api-server-dev-dev" + }, + "datasource": { + 
"type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(service_name)", + "hide": 0, + "includeAll": false, + "label": "Application", + "multi": false, + "name": "app", + "options": [], + "query": { + "query": "label_values(service_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(http_route)", + "hide": 0, + "includeAll": true, + "label": "Route", + "multi": true, + "name": "route", + "options": [], + "query": { + "query": "label_values(http_route)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "OpenTelemetry APM", + "uid": "opentelemetry-apm", + "version": 2, + "weekStart": "" + } + \ No newline at end of file diff --git a/charts/pga/grafana/dashboards/opentelemetry-apm/kustomization.yaml b/charts/pga/grafana/dashboards/opentelemetry-apm/kustomization.yaml new file mode 100644 index 00000000..ab7a51d2 --- /dev/null +++ b/charts/pga/grafana/dashboards/opentelemetry-apm/kustomization.yaml @@ -0,0 +1,13 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +generatorOptions: + labels: + grafana_dashboard: "1" + disableNameSuffixHash: true + annotations: + k8s-sidecar-target-directory: "/tmp/dashboards/otel-apm" + +configMapGenerator: + - name: apm + files: + - apm.json \ No newline at end of file diff --git a/charts/pga/grafana/datasources/pga/loki.yaml b/charts/pga/grafana/datasources/pga/loki.yaml new file mode 100644 index 00000000..66b839b4 --- /dev/null +++ b/charts/pga/grafana/datasources/pga/loki.yaml @@ 
-0,0 +1,33 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kube-loki-datasource + namespace: logging + labels: + grafana_datasource: "1" + app: kube-grafana + prometheus: kube +data: + kube-loki.yaml: |- + apiVersion: 1 + datasources: + - uid: logging + orgId: 1 + name: logging + type: loki + typeName: Loki + access: proxy + url: http://loki-logging-gateway.logging.svc + password: '' + user: '' + database: '' + basicAuth: false + isDefault: false + jsonData: + derivedFields: + - datasourceUid: tempo + matcherRegex: (?:trace_id)=(\w+) + name: TraceID + url: $${__value.raw} + readOnly: false + editable: true diff --git a/charts/pga/grafana/datasources/pga/prometheus.yaml b/charts/pga/grafana/datasources/pga/prometheus.yaml new file mode 100644 index 00000000..0c9d5fbb --- /dev/null +++ b/charts/pga/grafana/datasources/pga/prometheus.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kube-prometheus-datasource + namespace: monitoring + labels: + grafana_datasource: "1" + app: kube-grafana + prometheus: kube +data: + kube-prometheus.yaml: |- + apiVersion: 1 + datasources: + - name: "kube" + type: prometheus + uid: prometheus + url: http://kube-prometheus.monitoring:9090/ + access: proxy + isDefault: true + jsonData: + httpMethod: POST + timeInterval: 30s + exemplarTraceIdDestinations: + - datasourceUid: tempo + name: TraceID + httpMethod: POST + readOnly: false + editable: true diff --git a/charts/pga/grafana/datasources/pga/tempo.yaml b/charts/pga/grafana/datasources/pga/tempo.yaml new file mode 100644 index 00000000..dc45b768 --- /dev/null +++ b/charts/pga/grafana/datasources/pga/tempo.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-datasource + namespace: monitoring + labels: + grafana_datasource: "1" + app: kube-grafana + prometheus: kube +data: + tempo.yaml: |- + apiVersion: 1 + datasources: + - name: "tempo" + type: tempo + uid: tempo + url: http://tempo.observability.svc.cluster.local:3100/ + access: 
proxy + jsonData: + handleGrafanaManagedAlerts: false + implementation: prometheus + nodeGraph: + enabled: true + search: + hide: false + lokiSearch: + datasourceUid: logging + tracesToLogs: + datasourceUid: logging + filterBySpanID: false + filterByTraceID: true + mapTagNamesEnabled: false + tags: + - app diff --git a/charts/pga/grafana/datasources/thanos/kustomization.yaml b/charts/pga/grafana/datasources/thanos/kustomization.yaml new file mode 100644 index 00000000..f45d4c38 --- /dev/null +++ b/charts/pga/grafana/datasources/thanos/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +bases: + - thanos.yaml + - \ No newline at end of file diff --git a/charts/pga/grafana/datasources/thanos/thanos.yaml b/charts/pga/grafana/datasources/thanos/thanos.yaml new file mode 100644 index 00000000..00d2e40b --- /dev/null +++ b/charts/pga/grafana/datasources/thanos/thanos.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kube-thanos-datasource + namespace: monitoring + labels: + grafana_datasource: "1" + app: kube-grafana + prometheus: kube +data: + kube-thanos.yaml: |- + apiVersion: 1 + datasources: + - name: "kube-thanos" + type: prometheus + uid: thanos + url: http://monitoring-thanos-query-frontend.monitoring:9090/ + access: proxy + jsonData: + httpMethod: POST + timeInterval: 30s + exemplarTraceIdDestinations: + - datasourceUid: tempo + name: TraceID + httpMethod: POST + readOnly: false + editable: true diff --git a/charts/pga/values.yaml b/charts/pga/values.yaml index b436cc53..5aa399ed 100644 --- a/charts/pga/values.yaml +++ b/charts/pga/values.yaml @@ -144,7 +144,12 @@ kube: requests: storage: 1Gi grafana: - enabled: false + enabled: true + testFramework: + enabled: false + sidecar: + datasources: + defaultDatasourceEnabled: false kubeApiServer: enabled: true kubelet: