
Update to OpenShift Logging 5.9
DebakelOrakel committed Jul 18, 2024
1 parent e05d834 commit 1eac6e9
Showing 16 changed files with 852 additions and 9 deletions.
3 changes: 3 additions & 0 deletions alerts.txt
@@ -1,14 +1,17 @@
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.9/config/prometheus/collector_alerts.yaml release-5.9/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.8/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.9/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.9/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.9/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml
2 changes: 1 addition & 1 deletion class/defaults.yml
@@ -8,7 +8,7 @@ parameters:
"False": {}

namespace: openshift-logging
version: '5.8'
version: '5.9'
channel: 'stable-${openshift4_logging:version}'
alerts: 'release-${openshift4_logging:version}'

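With this bump, clusters that follow the component default move to the stable-5.9 channel and the release-5.9 alert set automatically, because channel and alerts are derived from version through the ${openshift4_logging:version} references shown above. A cluster that needs to stay behind (or move ahead) therefore only has to override version in its configuration hierarchy. A minimal sketch of such an override (illustrative, not part of this commit):

parameters:
  openshift4_logging:
    version: '5.8'              # pin the previous release; channel and alerts follow automatically
    # channel: 'stable-5.8'     # only set explicitly to diverge from the derived default
    # alerts: 'release-5.8'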
8 changes: 7 additions & 1 deletion component/alertrules.libsonnet
@@ -125,7 +125,13 @@ local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos.

// Elasticstack alerts

local isVersion58 = if params.version == '5.8' || params.version == 'master' then true else false;
local isVersion58 =
  local major = std.split(params.version, '.')[0];
  local minor = std.split(params.version, '.')[1];
  if major == 'master' then true
  else if std.parseInt(major) >= 6 then true
  else if std.parseInt(major) == 5 && std.parseInt(minor) >= 8 then true
  else false;

local esStorageGroup = {
  name: 'elasticsearch_node_storage.alerts',
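The previous check matched only the literal strings '5.8' and 'master', so it would have returned false for 5.9 and every later release. The replacement splits the version into major and minor parts and compares them numerically; because Jsonnet locals are evaluated lazily, the 'master' branch returns before std.parseInt ever sees a non-numeric string. As an illustration (not part of the commit), the flag evaluates as follows for a few parameter values:

version: '5.7'    -> isVersion58 = false
version: '5.8'    -> isVersion58 = true
version: '5.9'    -> isVersion58 = true
version: '5.10'   -> isVersion58 = true   (the old string comparison would have returned false here)
version: '6.0'    -> isVersion58 = true
version: 'master' -> isVersion58 = true   (short-circuits before the parseInt calls)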
@@ -0,0 +1,115 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: collector
  namespace: openshift-logging
spec:
  groups:
  - name: logging_collector.alerts
    rules:
    - alert: CollectorNodeDown
      annotations:
        message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
        summary: "Collector cannot be scraped"
      expr: |
        up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
      for: 10m
      labels:
        service: collector
        severity: critical
    - alert: CollectorHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
      expr: |
        100 * (
          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.001
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: CollectorVeryHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
      expr: |
        100 * (
          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.05
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: FluentdQueueLengthIncreasing
      annotations:
        message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously."
        summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}."
      expr: |
        sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 )
      for: 1h
      labels:
        service: collector
        severity: Warning
    - alert: ElasticsearchDeprecation
      annotations:
        message: "The OpenShift Elasticsearch Operator is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to using the OpenShift Elasticsearch Operator to manage the default log storage, you can use the Loki Operator."
        summary: "Detected Elasticsearch as the in-cluster storage which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
      for: 5m
      labels:
        service: storage
        severity: Warning
        namespace: openshift-logging
    - alert: FluentdDeprecation
      annotations:
        message: "Fluentd is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to Fluentd, you can use Vector instead."
        summary: "Detected Fluentd as the collector which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
      for: 5m
      labels:
        service: collector
        severity: Warning
        namespace: openshift-logging
    - alert: KibanaDeprecation
      annotations:
        message: "The Kibana web console is now deprecated and is planned to be removed in a future logging release."
        summary: "Detected Kibana as the visualization which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
      for: 5m
      labels:
        service: visualization
        severity: Warning
        namespace: openshift-logging
    - alert: DiskBufferUsage
      annotations:
        message: "Collectors potentially consuming too much node disk, {{ $value }}%"
        summary: "Detected collectors consuming too much node disk on {{ $labels.hostname }} host"
      expr: |
        (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
        / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15
      for: 5m
      labels:
        service: collector
        severity: Warning
  - name: logging_clusterlogging_telemetry.rules
    rules:
    - expr: |
        sum by(cluster)(log_collected_bytes_total)
      record: cluster:log_collected_bytes_total:sum
    - expr: |
        sum by(cluster)(log_logged_bytes_total)
      record: cluster:log_logged_bytes_total:sum
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m]))
      record: collector:log_num_errors:sum_rate
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m]))
      record: collector:received_events:sum_rate
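The rules above are plain Prometheus alerting and recording rules wrapped in a PrometheusRule object, so they can be exercised offline with promtool once spec.groups is saved as a standalone rule file. The following unit test is a minimal sketch (file name, pod name and sample values are illustrative, not part of this commit); it checks that CollectorNodeDown fires after the collector target has been unscrapable for more than 10 minutes:

rule_files:
  - collector_rules.yaml        # hypothetical file holding the spec.groups section shown above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # the collector target reports down for the whole 20-minute window
      - series: 'up{app_kubernetes_io_component="collector", app_kubernetes_io_part_of="cluster-logging", namespace="openshift-logging", pod="collector-abc12"}'
        values: '0x20'
    alert_rule_test:
      - eval_time: 11m
        alertname: CollectorNodeDown
        exp_alerts:
          - exp_labels:
              app_kubernetes_io_component: collector
              app_kubernetes_io_part_of: cluster-logging
              namespace: openshift-logging
              pod: collector-abc12
              service: collector
              severity: critical
            exp_annotations:
              message: "Prometheus could not scrape openshift-logging/collector-abc12 collector component for more than 10m."
              summary: "Collector cannot be scraped"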
@@ -0,0 +1,224 @@
---
"groups":
- "name": logging_elasticsearch.alerts
  "rules":
  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
      "summary": "Cluster health status is RED"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red"
    "expr": |
      sum by (cluster) (es_cluster_status == 2)
    "for": 7m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
      "summary": "Cluster health status is YELLOW"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow"
    "expr": |
      sum by (cluster) (es_cluster_status == 1)
    "for": 20m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchWriteRequestsRejectionJumps
    "annotations":
      "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
      "summary": "High Write Rejection Ratio - {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps"
    "expr": |
      round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
      "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
      "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark."
      "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchJVMHeapUseHigh
    "annotations":
      "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "JVM Heap usage on the node is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": AggregatedLoggingSystemCPUHigh
    "annotations":
      "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "System CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_os_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchProcessCPUHigh
    "annotations":
      "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "ES process CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_process_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchDiskSpaceRunningLow
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h."
      "summary": "Cluster low on disk space"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low"
    "expr": |
      sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchHighFileDescriptorUsage
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour."
      "summary": "Cluster low on file descriptors"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high"
    "expr": |
      predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

- "alert": ElasticsearchOperatorCSVNotSuccessful
"annotations":
"message": "Elasticsearch Operator CSV has not reconciled succesfully."
"summary": "Elasticsearch Operator CSV Not Successful"
"expr": |
csv_succeeded{name =~ "elasticsearch-operator.*"} == 0
"for": 10m
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchNodeDiskWatermarkReached
"annotations":
"message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
"summary": "Disk Low Watermark is predicted to be reached within next 6h."
"runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached"
"expr": |
sum by (instance, pod) (
round(
(1 - (
predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
)
) * 100, 0.001)
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
"for": 1h
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchNodeDiskWatermarkReached
"annotations":
"message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
"summary": "Disk High Watermark is predicted to be reached within next 6h."
"runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached"
"expr": |
sum by (instance, pod) (
round(
(1 - (
predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
)
) * 100, 0.001)
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
"for": 1h
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchNodeDiskWatermarkReached
"annotations":
"message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark."
"summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h."
"runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached"
"expr": |
sum by (instance, pod) (
round(
(1 - (
predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
)
) * 100, 0.001)
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
"for": 1h
"labels":
"namespace": openshift-logging
"severity": warning
