
Commit

Merge pull request #9 from logzio/release/prometheus-alerts-migrator-v1.1.0

Release prometheus alerts migrator v1.1.0
ralongit authored Nov 11, 2024
2 parents 4c3c236 + e02f6ab commit 4d38cbb
Showing 11 changed files with 472 additions and 290 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
@@ -19,14 +19,14 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: '1.19'
go-version: '1.23'

- name: Check out code into the Go module directory
uses: actions/checkout@v4

- name: Install Kind
run: |
curl -Lo ./kind "https://kind.sigs.k8s.io/dl/v0.11.1/kind-Linux-amd64"
curl -Lo ./kind "https://kind.sigs.k8s.io/dl/v0.24.0/kind-Linux-amd64"
chmod +x ./kind
mv ./kind /usr/local/bin/kind
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@

/.idea
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
# Golang base image
FROM golang:1.19-alpine AS build
FROM golang:1.23-alpine AS build

# Set working directory
WORKDIR /app
4 changes: 2 additions & 2 deletions Makefile
@@ -1,5 +1,5 @@
IMAGE_NAME := prometheus-alerts-migrator
IMAGE_TAG ?= v1.0.2
IMAGE_TAG ?= v1.1.0
DOCKER_REPO := logzio/$(IMAGE_NAME):$(IMAGE_TAG)


@@ -18,4 +18,4 @@ install-tools:
go install golang.org/x/tools/cmd/goimports@latest
go install github.com/client9/misspell/cmd/misspell@latest
go install github.com/pavius/impi/cmd/impi@latest
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.47.3
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0
53 changes: 52 additions & 1 deletion README.md
@@ -45,7 +45,7 @@ The controller is designed to process ConfigMaps containing Prometheus alert rul

### Example rules configMap

Below is an example of how a rules configMap should be structured:
Below is an example of how a rules configMap should be structured per alert rule:

```yaml
apiVersion: v1
@@ -67,6 +67,45 @@ data:
description: "The OpenTelemetry collector has been down for more than 5 minutes."
summary: "Instance down"
```
Below is an example of how a rules configMap should be structured per alert rule group:
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: logzio-grouped-rules
namespace: monitoring
annotations:
prometheus.io/kube-rules: "true"
data:
high_latency_memory_usage_grouped: |
groups:
- name: high_latency
rules:
- alert: High_Latency
expr: histogram_quantile(0.95, sum(rate(otelcol_process_latency_seconds_bucket{app="test-otel-collector"}[5m])) by (le)) > 0.6
for: 5m
labels:
team: "sre"
severity: "critical"
purpose: "test"
annotations:
description: "95th percentile latency is above 600ms for the test OpenTelemetry collector test"
summary: "High 95th percentile latency observed in test environment"
- alert: High_Memory_Usage
expr: sum by (instance) (container_memory_usage_bytes{container="otel-collector-test"}) / sum by (instance) (container_spec_memory_limit_bytes{container="otel-collector-test"}) > 0.7
for: 5m
labels:
team: "sre"
severity: "warning"
purpose: "test"
annotations:
description: "Memory usage for the test OpenTelemetry collector is above 70% of the limit"
summary: "High memory usage detected for the test OpenTelemetry collector"
```
- Replace `prometheus.io/kube-rules` with the annotation you use to identify relevant ConfigMaps. The `data` section should contain your Prometheus alert rules in YAML format.
- Deploy the ConfigMap to your cluster: `kubectl apply -f <configmap-file>.yml` (a quick verification sketch follows below).
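
As a quick check after deployment, the following sketch (assuming the example `logzio-grouped-rules` ConfigMap above and the default `prometheus.io/kube-rules` annotation) confirms the annotation the controller watches for is present:

```shell
# Verify the ConfigMap carries the annotation the controller looks for
kubectl get configmap logzio-grouped-rules -n monitoring \
  -o jsonpath='{.metadata.annotations.prometheus\.io/kube-rules}'
# Expected output (from the example manifest above): true
```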

Expand Down Expand Up @@ -128,6 +167,18 @@ data:


## Changelog
- v1.1.0
- Add support for migrating alert rule groups
- Upgrade Go version to 1.23
- Upgrade dependencies
- `k8s.io/client-go`: `v0.28.3` -> `v0.31.2`
- `k8s.io/apimachinery`: `v0.28.3` -> `v0.31.2`
- `k8s.io/api`: `v0.28.3` -> `v0.31.2`
- `k8s.io/klog/v2`: `v2.110.1` -> `v2.130.1`
- `logzio_terraform_client`: `1.20.0` -> `1.22.0`
- `prometheus/common`: `v0.44.0` -> `v0.60.1`
- `prometheus/alertmanager`: `v0.26.0` -> `v0.27.0`
- `prometheus/prometheus`: `v0.47.2` -> `v0.55.0`
- v1.0.3
- Handle Prometheus alert manager configuration file
- Add CRUD operations for contact points and notification policies
98 changes: 60 additions & 38 deletions controller/controller.go
@@ -61,10 +61,7 @@ type Controller struct {
alertManagerAnnotation *string

configmapEventRecorderFunc func(cm *corev1.ConfigMap, eventtype, reason, msg string)
}

type MultiRuleGroups struct {
Values []rulefmt.RuleGroups
ruleGroupsName string
}

func NewController(
@@ -430,49 +427,77 @@ func (c *Controller) getClusterAlertRules(mapList *corev1.ConfigMapList) *[]rule
return &finalRules
}

// validateAndRecordRule validates a single rule, records events, and appends valid rules to finalRules.
func (c *Controller) validateAndRecordRule(rule rulefmt.RuleNode, cm *corev1.ConfigMap, fallbackNameStub, key string, finalRules *[]rulefmt.RuleNode) {
if len(rule.Expr.Value) == 0 {
klog.Warningf("0:0: field 'expr' must be set in rule")
}
if len(rule.Alert.Value) == 0 {
errorMsg := fmt.Sprintf("Configmap: %s key: %s does not conform to any of the legal format.", fallbackNameStub, key)
c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, errorMsg)
}
// Add unique name for the alert rule to prevent duplicate rules ([alert_name]-[configmap_name]-[configmap_namespace])
rule.Alert.Value = fmt.Sprintf("%s-%s-%s", cm.Name, cm.Namespace, key)

validationErrs := rule.Validate()
if len(validationErrs) > 0 {
for _, ruleErr := range validationErrs {
c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, ruleErr.Error())
}
failMessage := fmt.Sprintf("Configmap: %s key: %s Rejected, no valid rules.", fallbackNameStub, key)
c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, failMessage)
} else {
if rule.Annotations == nil {
rule.Annotations = make(map[string]string)
}

if (rule.Annotations["ruleGroupsName"] == rule.Alert.Value || rule.Annotations["ruleGroupsName"] == "") && c.ruleGroupsName != "" {
rule.Annotations["ruleGroupsName"] = c.ruleGroupsName
} else {
rule.Annotations["ruleGroupsName"] = rule.Alert.Value
}
*finalRules = append(*finalRules, rule)
}
}

// extractValues extracts the rules from the configmap, and validates them
func (c *Controller) extractValues(cm *corev1.ConfigMap) []rulefmt.RuleNode {
func (c *Controller) extractValues(cm *corev1.ConfigMap) (totalRules []rulefmt.RuleNode) {

fallbackNameStub := common.CreateNameStub(cm)
configmapData := cm.Data

for key, value := range configmapData {
// Check if the rule contains groups
var ruleGroups rulefmt.RuleGroups
err := yaml.Unmarshal([]byte(value), &ruleGroups)
if err == nil && len(ruleGroups.Groups) > 0 {
// Process rule groups and rules
for _, ruleGroup := range ruleGroups.Groups {
c.ruleGroupsName = ruleGroup.Name
for _, rule := range ruleGroup.Rules {
c.validateAndRecordRule(rule, cm, fallbackNameStub, key, &totalRules)
}
}
continue
}

var toalRules []rulefmt.RuleNode

for key, value := range cm.Data {
// try each encoding
// try to extract a rules
// Process single rules
var rule rulefmt.RuleNode
var err error
err, rule = c.extractRules(value)
err = yaml.Unmarshal([]byte(value), &rule)
if err != nil {
errorMsg := fmt.Sprintf("Configmap: %s key: %s Error during extraction.", fallbackNameStub, key)
errorMsg := fmt.Sprintf("Configmap: %s key: %s Error during extraction: %s.", fallbackNameStub, key, err)
c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, errorMsg)
continue
}

// Add unique name for the alert rule to prevent duplicate rules ([alert_name]-[configmap_name]-[configmap_namespace])
rule.Alert.Value = fmt.Sprintf("%s-%s-%s", cm.Name, cm.Namespace, key)

if len(rule.Alert.Value) == 0 {
errorMsg := fmt.Sprintf("Configmap: %s key: %s does not conform to any of the legal format Skipping.", fallbackNameStub, key)
c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, errorMsg)
} else {
// validate the rule
validationErrs := rule.Validate()
if len(validationErrs) > 0 {
for _, ruleErr := range validationErrs {
c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, ruleErr.Error())
}
failMessage := fmt.Sprintf("Configmap: %s key: %s Rejected, no valid rules.", fallbackNameStub, key)
c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, failMessage)
c.validateAndRecordRule(rule, cm, fallbackNameStub, key, &totalRules)
}

} else {
// add to the rulegroups
toalRules = append(toalRules, rule)
}
}
if len(totalRules) > 0 {
klog.Info(fmt.Sprintf("Found %d alert rules in %s configmap", len(totalRules), cm.Name))
}
klog.Info(fmt.Sprintf("Found %d rules in %s configmap", len(toalRules), cm.Name))

return toalRules
return totalRules
}

// extractRules extracts the rules from the configmap key
@@ -482,9 +507,6 @@ func (c *Controller) extractRules(value string) (error, rulefmt.RuleNode) {
if err != nil {
return err, rulefmt.RuleNode{}
}
if len(rule.Alert.Value) == 0 {
return fmt.Errorf("no Rules found"), rule
}
return nil, rule
}

148 changes: 148 additions & 0 deletions controller/controller_test.go
@@ -78,6 +78,154 @@ func TestExtractValues(t *testing.T) {
},
expectedRules: 1,
},
{
name: "configmap with grouped rules",
configMap: &v1.ConfigMap{
Data: map[string]string{
"group_rules1": `groups:
- name: high_latency_memory_usage_group
rules:
- alert: High_Latency
expr: histogram_quantile(0.95, sum(rate(otelcol_process_latency_seconds_bucket{app="test-otel-collector"}[5m])) by (le)) > 0.6
for: 5m
labels:
team: "sre"
severity: "critical"
purpose: "test"
annotations:
description: "95th percentile latency is above 600ms for the test OpenTelemetry collector test"
summary: "High 95th percentile latency observed in test environment"
- alert: High_Memory_Usage
expr: sum by (instance) (container_memory_usage_bytes{container="otel-collector-test"}) / sum by (instance) (container_spec_memory_limit_bytes{container="otel-collector-test"}) > 0.7
for: 5m
labels:
team: "sre"
severity: "warning"
purpose: "test"
annotations:
description: "Memory usage for the test OpenTelemetry collector is above 70% of the limit"
summary: "High memory usage detected for the test OpenTelemetry collector"`,
},
},
expectedRules: 2,
},
{
name: "configmap with grouped rules and single rule",
configMap: &v1.ConfigMap{
Data: map[string]string{
"rule1": "alert: HighLatency\nexpr: job:request_latency_seconds:mean5m{job=\"myjob\"} > 0.5\nfor: 10m\n",
"group_rules1": `groups:
- name: packet_loss_group
rules:
- alert: Packet_Loss
expr: rate(packet_loss_total{app="test-network"}[5m]) > 0.1
for: 5m
labels:
team: "network"
severity: "critical"
purpose: "test"
annotations:
description: "Packet loss rate is above 10% on the test network"
summary: "Significant packet loss detected in test network"
- alert: Disk_Usage
expr: (node_filesystem_size_bytes{mountpoint="/var/lib/docker"} - node_filesystem_free_bytes{mountpoint="/var/lib/docker"}) / node_filesystem_size_bytes{mountpoint="/var/lib/docker"} > 0.8
for: 5m
labels:
team: "ops"
severity: "warning"
purpose: "test"
annotations:
description: "Disk usage for /var/lib/docker is above 80%"
summary: "High disk usage detected on /var/lib/docker"`,
},
},
expectedRules: 3,
},
{ // Test case for grouped rules with invalid rule data
name: "configmap with grouped rules and invalid rule",
configMap: &v1.ConfigMap{
Data: map[string]string{
"invalid_rule": "this is not a valid prometheus rule data",
"group_rules1": `groups:
- name: packet_loss_group
rules:
- alert: Packet_Loss
expr: rate(packet_loss_total{app="test-network"}[5m]) > 0.1
for: 5m
labels:
team: "network"
severity: "critical"
purpose: "test"
annotations:
description: "Packet loss rate is above 10% on the test network"
summary: "Significant packet loss detected in test network"
- alert: Disk_Usage
expr: (node_filesystem_size_bytes{mountpoint="/var/lib/docker"} - node_filesystem_free_bytes{mountpoint="/var/lib/docker"}) / node_filesystem_size_bytes{mountpoint="/var/lib/docker"} > 0.8
for: 5m
labels:
team: "ops"
severity: "warning"
purpose: "test"
annotations:
description: "Disk usage for /var/lib/docker is above 80%"
summary: "High disk usage detected on /var/lib/docker"`,
},
},
expectedRules: 2,
},
{
name: "configmap with multiple groups of grouped rules",
configMap: &v1.ConfigMap{
Data: map[string]string{
"multiple_groups_rules1": `groups:
- name: cpu_usage_group
rules:
- alert: High_CPU_Usage
expr: sum(rate(container_cpu_usage_seconds_total{container="otel-collector-test"}[5m])) by (instance) > 0.9
for: 5m
labels:
team: "sre"
severity: "warning"
purpose: "test"
annotations:
description: "CPU usage for the test OpenTelemetry collector is above 90%"
summary: "High CPU usage detected for the test OpenTelemetry collector"
- alert: High_Memory_Usage
expr: sum by (instance) (container_memory_usage_bytes{container="otel-collector-test"}) / sum by (instance) (container_spec_memory_limit_bytes{container="otel-collector-test"}) > 0.7
for: 5m
labels:
team: "sre"
severity: "warning"
purpose: "test"
annotations:
description: "Memory usage for the test OpenTelemetry collector is above 70% of the limit"
summary: "High memory usage detected for the test OpenTelemetry collector"
- name: packet_loss_group
rules:
- alert: Packet_Loss
expr: rate(packet_loss_total{app="test-network"}[5m]) > 0.1
for: 5m
labels:
team: "network"
severity: "critical"
purpose: "test"
annotations:
description: "Packet loss rate is above 10% on the test network"
summary: "Significant packet loss detected in test network"
- alert: Disk_Usage
expr: (node_filesystem_size_bytes{mountpoint="/var/lib/docker"} - node_filesystem_free_bytes{mountpoint="/var/lib/docker"}) / node_filesystem_size_bytes{mountpoint="/var/lib/docker"} > 0.8
for: 5m
labels:
team: "ops"
severity: "warning"
purpose: "test"
annotations:
description: "Disk usage for /var/lib/docker is above 80%"
summary: "High disk usage detected on /var/lib/docker"`,
},
},
expectedRules: 4,
},
}

for _, tc := range testCases {
