Commit
1eac6e9 (1 parent: e05d834)
Showing 16 changed files with 852 additions and 9 deletions.
@@ -1,14 +1,17 @@
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.9/config/prometheus/collector_alerts.yaml release-5.9/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.8/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.9/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.9/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.9/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml
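Each entry above maps an upstream source URL to the destination path under component/extracted_alerts/ where the extracted rules are stored; the 5.6/5.7 Fluentd entries point at a Go constant (fluentd.go.FluentdPrometheusAlert) rather than a plain YAML file. The following is a minimal sketch of how the plain-YAML entries could be refreshed; the sources file name is an assumption and the Go-constant entries would need a separate extraction step not shown here.

```python
# Hypothetical refresh helper: download every plain-YAML source listed in the
# sources file to its destination under component/extracted_alerts/.
# Entries whose URL ends in a Go symbol (e.g. fluentd.go.FluentdPrometheusAlert)
# need a Go-constant extraction step and are skipped here.
import os
import urllib.request

SOURCES = "component/extracted_alerts/sources"  # assumed file name
TARGET_DIR = "component/extracted_alerts"

with open(SOURCES) as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        url, dest = line.split()
        if not url.endswith((".yaml", ".yml")):
            print(f"skipping {url} (needs Go-constant extraction)")
            continue
        path = os.path.join(TARGET_DIR, dest)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        urllib.request.urlretrieve(url, path)
        print(f"fetched {dest}")
```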
component/extracted_alerts/release-5.9/collector_prometheus_alerts.yaml (115 additions, 0 deletions)
@@ -0,0 +1,115 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: collector
  namespace: openshift-logging
spec:
  groups:
  - name: logging_collector.alerts
    rules:
    - alert: CollectorNodeDown
      annotations:
        message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
        summary: "Collector cannot be scraped"
      expr: |
        up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
      for: 10m
      labels:
        service: collector
        severity: critical
    - alert: CollectorHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
      expr: |
        100 * (
            collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
            collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.001
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: CollectorVeryHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
      expr: |
        100 * (
            collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
            collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.05
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: FluentdQueueLengthIncreasing
      annotations:
        message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously."
        summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}."
      expr: |
        sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 )
      for: 1h
      labels:
        service: collector
        severity: Warning
    - alert: ElasticsearchDeprecation
      annotations:
        message: "The OpenShift Elasticsearch Operator is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to using the OpenShift Elasticsearch Operator to manage the default log storage, you can use the Loki Operator."
        summary: "Detected Elasticsearch as the in-cluster storage which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
      for: 5m
      labels:
        service: storage
        severity: Warning
        namespace: openshift-logging
    - alert: FluentdDeprecation
      annotations:
        message: "Fluentd is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to Fluentd, you can use Vector instead."
        summary: "Detected Fluentd as the collector which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
      for: 5m
      labels:
        service: collector
        severity: Warning
        namespace: openshift-logging
    - alert: KibanaDeprecation
      annotations:
        message: "The Kibana web console is now deprecated and is planned to be removed in a future logging release."
        summary: "Detected Kibana as the visualization which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
      for: 5m
      labels:
        service: visualization
        severity: Warning
        namespace: openshift-logging
    - alert: DiskBufferUsage
      annotations:
        message: "Collectors potentially consuming too much node disk, {{ $value }}% "
        summary: "Detected consuming too much node disk on $labels.hostname host"
      expr: |
        (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
        / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15
      for: 5m
      labels:
        service: collector
        severity: Warning
  - name: logging_clusterlogging_telemetry.rules
    rules:
    - expr: |
        sum by(cluster)(log_collected_bytes_total)
      record: cluster:log_collected_bytes_total:sum
    - expr: |
        sum by(cluster)(log_logged_bytes_total)
      record: cluster:log_logged_bytes_total:sum
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m]))
      record: collector:log_num_errors:sum_rate
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m]))
      record: collector:received_events:sum_rate
component/extracted_alerts/release-5.9/elasticsearch_operator_prometheus_alerts.yaml (224 additions, 0 deletions)
@@ -0,0 +1,224 @@
---
"groups":
- "name": logging_elasticsearch.alerts
  "rules":
  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
      "summary": "Cluster health status is RED"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red"
    "expr": |
      sum by (cluster) (es_cluster_status == 2)
    "for": 7m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
      "summary": "Cluster health status is YELLOW"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow"
    "expr": |
      sum by (cluster) (es_cluster_status == 1)
    "for": 20m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchWriteRequestsRejectionJumps
    "annotations":
      "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
      "summary": "High Write Rejection Ratio - {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps"
    "expr": |
      round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
      "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
      "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark."
      "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchJVMHeapUseHigh
    "annotations":
      "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "JVM Heap usage on the node is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": AggregatedLoggingSystemCPUHigh
    "annotations":
      "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "System CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_os_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchProcessCPUHigh
    "annotations":
      "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "ES process CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_process_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchDiskSpaceRunningLow
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h."
      "summary": "Cluster low on disk space"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low"
    "expr": |
      sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchHighFileDescriptorUsage
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour."
      "summary": "Cluster low on file descriptors"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high"
    "expr": |
      predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

- "alert": ElasticsearchOperatorCSVNotSuccessful | ||
"annotations": | ||
"message": "Elasticsearch Operator CSV has not reconciled succesfully." | ||
"summary": "Elasticsearch Operator CSV Not Successful" | ||
"expr": | | ||
csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 | ||
"for": 10m | ||
"labels": | ||
"namespace": openshift-logging | ||
"severity": warning | ||
|
||
- "alert": ElasticsearchNodeDiskWatermarkReached | ||
"annotations": | ||
"message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." | ||
"summary": "Disk Low Watermark is predicted to be reached within next 6h." | ||
"runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" | ||
"expr": | | ||
sum by (instance, pod) ( | ||
round( | ||
(1 - ( | ||
predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / | ||
predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) | ||
) | ||
) * 100, 0.001) | ||
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct | ||
"for": 1h | ||
"labels": | ||
"namespace": openshift-logging | ||
"severity": warning | ||
|
||
- "alert": ElasticsearchNodeDiskWatermarkReached | ||
"annotations": | ||
"message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." | ||
"summary": "Disk High Watermark is predicted to be reached within next 6h." | ||
"runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" | ||
"expr": | | ||
sum by (instance, pod) ( | ||
round( | ||
(1 - ( | ||
predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / | ||
predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) | ||
) | ||
) * 100, 0.001) | ||
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct | ||
"for": 1h | ||
"labels": | ||
"namespace": openshift-logging | ||
"severity": warning | ||
|
||
- "alert": ElasticsearchNodeDiskWatermarkReached | ||
"annotations": | ||
"message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." | ||
"summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." | ||
"runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" | ||
"expr": | | ||
sum by (instance, pod) ( | ||
round( | ||
(1 - ( | ||
predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / | ||
predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) | ||
) | ||
) * 100, 0.001) | ||
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct | ||
"for": 1h | ||
"labels": | ||
"namespace": openshift-logging | ||
"severity": warning |