Skip to content

Commit

Permalink
Allow automatically mapping alert to teams
Browse files Browse the repository at this point in the history
  • Loading branch information
bastjan committed Oct 2, 2023
1 parent 4a7adc7 commit 63c8ddb
Show file tree
Hide file tree
Showing 9 changed files with 589 additions and 1 deletion.
9 changes: 9 additions & 0 deletions class/defaults.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
parameters:
openshift4_monitoring:
team_fallback: null

=_metadata:
library_aliases:
prom.libsonnet: openshift4-monitoring-prom.libsonnet
Expand Down Expand Up @@ -99,6 +101,13 @@ parameters:
equal:
- namespace
- alertname
alertManagerAutoDiscovery:
enabled: true
debug_config_map: false
team_receiver_format: team_default_%s
additional_alert_matchers: []
prepend_routes: []
append_routes: []
alerts:
includeNamespaces:
- appuio.*
Expand Down
148 changes: 148 additions & 0 deletions component/alert-routing-discovery.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local prom = import 'lib/prom.libsonnet';

// The Kapitan inventory and its `parameters` dictionary; all lookups below go through `params`.
local inv = kap.inventory();
local params = inv.parameters;

// Settings of the auto-discovery feature (the `alertManagerAutoDiscovery` parameter of this component).
local adParams = params.openshift4_monitoring.alertManagerAutoDiscovery;

// Name of the receiver that is added to the Alertmanager config with no settings besides its name.
// It is the target of the catch-all route and of the final route when no owner/fallback team is known.
local nullReceiver = '__component_openshift4_monitoring_null';

// appKeys turns an entry of the applications list into its lookup keys.
// A plain entry (`app`) yields `[ app ]`; an aliased entry (`base as alias`) yields
// `[ alias, base ]`, i.e. the instance name first and the original name second.
// Dashes are replaced by underscores (kebab-case to snake_case) unless `raw` is true.
// Any other shape of `name` raises an error.
local appKeys = function(name, raw=false)
  local toKey = function(part) if raw then part else std.strReplace(part, '-', '_');
  // can be simplified with jsonnet > 0.19 which would support ' as ' as the substring
  local words = std.split(name, ' ');
  local count = std.length(words);
  if count == 3 && words[1] == 'as' then
    [ toKey(words[2]), toKey(words[0]) ]
  else if count == 1 then
    [ toKey(words[0]) ]
  else
    error 'invalid application name `%s`' % name;

// discoverNS returns the namespace of the given application, or null if none is found.
// It looks into the following places:
// - params.<app>.namespace (when it is a string)
// - params.<app>.namespace.name (when `namespace` is an object with a string `name`)
// For aliased applications the instance parameters are checked first,
// then the parameters of the base application.
local discoverNS = function(app)
  // namespaceOf yields the namespace configured under params[key], or null.
  local namespaceOf = function(key)
    if !std.objectHas(params, key) then
      null
    else
      local cfg = params[key];
      if !std.objectHas(cfg, 'namespace') then
        null
      else
        local ns = cfg.namespace;
        if std.isString(ns) then
          ns
        else if std.isObject(ns) && std.objectHas(ns, 'name') && std.isString(ns.name) then
          ns.name
        else
          null;

  local keys = appKeys(app);
  local fromInstance = namespaceOf(keys[0]);
  // Only fall through to the base application when the instance has no namespace of its own.
  if fromInstance == null && std.length(keys) == 2 then
    namespaceOf(keys[1])
  else
    fromInstance;


// ownerOrFallbackTeam is the team used when no team claims an application.
// It is `syn.owner` when that parameter exists, otherwise the fallback team
// configured on this component. The component defaults declare the fallback as
// `team_fallback`; the spelling `fallback_team` is accepted as well so that
// either key works. Evaluates to null when nothing is configured.
local ownerOrFallbackTeam =
  local monitoring = params.openshift4_monitoring;
  if std.objectHas(params, 'syn') && std.objectHas(params.syn, 'owner') then
    params.syn.owner
  else if std.objectHas(monitoring, 'team_fallback') && monitoring.team_fallback != null then
    monitoring.team_fallback
  else if std.objectHas(monitoring, 'fallback_team') then
    monitoring.fallback_team
  else
    // Guarded lookups: a missing key must not abort evaluation.
    null;

// teamsForApplication returns the list of teams responsible for the given application.
// Teams are read from the top level syn parameter, which should look roughly like this.
//
// syn:
//   owner: clumsy-donkeys
//   teams:
//     chubby-cockroaches:
//       instances:
//         - superb-visualization
//     lovable-lizards:
//       instances:
//         - apartment-cats
//
// The application is looked up in the `instances` of every team, using the instance (alias)
// name first and the base name second; the first key with at least one match wins.
// If no team lists the application, the owner (or fallback team) is returned as the only entry.
local teamsForApplication = function(app)
  // teamsListing returns every team whose `instances` contain `key`,
  // or null when there is no `syn.teams` parameter.
  local teamsListing = function(key)
    if std.objectHas(params, 'syn') && std.objectHas(params.syn, 'teams') then
      local allTeams = params.syn.teams;
      std.filter(
        function(team) std.objectHas(allTeams[team], 'instances') && std.member(allTeams[team].instances, key),
        std.objectFields(allTeams)
      );

  // Raw keys: team instances are written in kebab-case, exactly as in the applications list.
  // std.prune drops the null / empty results of keys without a match.
  local matches = std.prune([ teamsListing(key) for key in appKeys(app, true) ]);

  if std.length(matches) == 0 then
    [ ownerOrFallbackTeam ]
  else
    matches[0];

// teamToNS maps each team to the list of namespaces of the applications it is responsible for.
// The list holds one entry per application, in the order of the applications list;
// the entry is null when no namespace could be discovered for the application.
local teamToNS =
  // addApp appends the namespace of `app` to every team responsible for it.
  local addApp = function(acc, app)
    local ns = discoverNS(app);
    std.foldl(
      function(inner, team) inner { [team]+: [ ns ] },
      teamsForApplication(app),
      acc
    );
  std.foldl(addApp, inv.applications, {});

// teamBasedRouting contains the discovered routes for teams.
// The team routes are set up with `continue: true` so an alert can be routed to multiple teams.
// The last route catches all alerts already routed to a team and sends them to the null receiver,
// so they do not fall through to the routes that follow.
//
// Namespaces that could not be discovered are stored as null in teamToNS and are ignored here.
// Teams without any discovered namespace get no route, and the catch-all route is omitted when
// no namespace was discovered at all: an empty `namespace =~ ""` matcher would otherwise match
// every alert that carries no `namespace` label.
local teamBasedRouting =
  local knownNS = function(team) std.filter(function(ns) ns != null, teamToNS[team]);
  local routedTeams = std.filter(
    function(team) std.length(knownNS(team)) > 0,
    std.objectFields(teamToNS)
  );
  // Builds the matcher list shared by team routes and the catch-all route.
  local matchersFor = function(namespaces)
    adParams.additional_alert_matchers + [
      'namespace =~ "%s"' % std.join('|', namespaces),
    ];
  local teamRoutes = std.map(
    function(team) {
      receiver: adParams.team_receiver_format % team,
      matchers: matchersFor(knownNS(team)),
      continue: true,
    },
    routedTeams
  );

  if std.length(teamRoutes) == 0 then
    []
  else
    teamRoutes + [ {
      // catch all alerts already routed to a team
      receiver: nullReceiver,
      matchers: matchersFor(std.flattenArrays(std.map(knownNS, routedTeams))),
      continue: false,
    } ];

// alertmanagerConfig is the `alertManagerConfig` parameter extended with the null receiver
// and the generated routes. The resulting route order is:
// prepend_routes, team based routes, append_routes, the routes already present in
// `alertManagerConfig.route.routes`, and finally one route without matchers.
local alertmanagerConfig =
  // The final route sends everything left to the owner/fallback team,
  // or to the null receiver when no such team is known.
  local lastRoute =
    if ownerOrFallbackTeam != null then
      { receiver: adParams.team_receiver_format % ownerOrFallbackTeam }
    else
      { receiver: nullReceiver };

  params.openshift4_monitoring.alertManagerConfig {
    receivers+: [
      { name: nullReceiver },
    ],
    route+: {
      routes:
        adParams.prepend_routes
        + teamBasedRouting
        + adParams.append_routes
        + super.routes
        + [ lastRoute ],
    },
  };

// Exported fields:
// - debugConfigMap: a ConfigMap named `discovery-debug` showing the inputs and results of the discovery.
// - alertmanagerConfig: the Alertmanager configuration including the generated routes.
{
  // perApp builds an object with one field per application, holding the result of fn(app).
  local perApp = function(fn)
    std.foldl(function(acc, app) acc { [app]: fn(app) }, inv.applications, {}),
  local namespacesByApp = perApp(discoverNS),
  local teamsByApp = perApp(teamsForApplication),
  // Applications for which no namespace could be discovered.
  local appsMissingNS = std.filter(
    function(app) namespacesByApp[app] == null,
    std.objectFields(namespacesByApp)
  ),

  debugConfigMap: kube.ConfigMap('discovery-debug') {
    data: {
      applications: std.manifestJsonMinified(inv.applications),
      discovered_namespaces: std.manifestYamlDoc(namespacesByApp),
      apps_without_namespaces: std.manifestYamlDoc(appsMissingNS),
      discovered_teams: std.manifestYamlDoc(teamsByApp),
      proposed_routes: std.manifestYamlDoc(teamBasedRouting),
      alertmanager: std.manifestYamlDoc(alertmanagerConfig),
    },
  },
  alertmanagerConfig: alertmanagerConfig,
}
6 changes: 5 additions & 1 deletion component/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ local params = inv.parameters.openshift4_monitoring;
local rules = import 'rules.jsonnet';
local capacity = import 'capacity.libsonnet';

local alertDiscovery = import 'alert-routing-discovery.libsonnet';

local ns =
if params.namespace != 'openshift-monitoring' then
error 'Component openshift4-monitoring does not support values for parameter `namespace` other than "openshift-monitoring".'
Expand Down Expand Up @@ -105,9 +107,11 @@ local customRules =
namespace: ns,
},
stringData: {
'alertmanager.yaml': std.manifestYamlDoc(params.alertManagerConfig),
// Always serialise to a YAML string. With auto-discovery disabled the plain
// `alertManagerConfig` parameter is used, without any generated routes or receivers.
'alertmanager.yaml': std.manifestYamlDoc(if params.alertManagerAutoDiscovery.enabled then alertDiscovery.alertmanagerConfig else params.alertManagerConfig),
},
},
[if params.alertManagerAutoDiscovery.enabled && params.alertManagerAutoDiscovery.debug_config_map then '99_discovery_debug_cm']: alertDiscovery.debugConfigMap,

[if params.enableAlertmanagerIsolationNetworkPolicy then '20_networkpolicy']: std.map(function(p) com.namespaced('openshift-monitoring', p), import 'networkpolicy.libsonnet'),
[if params.enableUserWorkload && params.enableUserWorkloadAlertmanagerIsolationNetworkPolicy then '20_user_workload_networkpolicy']: std.map(function(p) com.namespaced('openshift-user-workload-monitoring', p), import 'networkpolicy.libsonnet'),
rbac: import 'rbac.libsonnet',
Expand Down
68 changes: 68 additions & 0 deletions docs/modules/ROOT/pages/references/parameters.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,74 @@ A dictionary holding the configuration for the AlertManager.

See the https://docs.openshift.com/container-platform/latest/monitoring/cluster_monitoring/configuring-the-monitoring-stack.html#configuring-alertmanager[OpenShift docs] for available parameters.


== `alertManagerAutoDiscovery`

[horizontal]
type:: dictionary
default::
+
[source,yaml]
----
alertManagerAutoDiscovery:
enabled: true
debug_config_map: false
team_receiver_format: team_default_%s
additional_alert_matchers: []
prepend_routes: []
append_routes: []
----

`alertManagerAutoDiscovery` holds the configuration for the Alertmanager auto-discovery feature.

The auto-discovery routes alerts to the configured teams based on their namespaces and the top-level `syn.teams[*].instances` and `syn.owner` parameters.
For every entry in the `applications` array (called instance in the `syn` configuration), it looks up the namespace in the `namespace` or `namespace.name` parameter of that Commodore application, and assigns the namespace to the teams that list the instance in the `syn.teams[*].instances` parameter.

.`syn` Team Example
[source,yaml]
----
syn:
owner: daring-donkeys
teams:
electric-elephants:
instances: [postgres]
----

The auto-discovery feature is enabled by default.
A ConfigMap can be enabled with `debug_config_map` to debug the auto-discovery feature.

The configuration is merged with the `alertManagerConfig` parameter.
Route receivers are generated for each team based on the `team_receiver_format` parameter.
The routes are ordered as follows:

[source]
----
alertManagerAutoDiscovery.prepend_routes + generated routes + alertManagerAutoDiscovery.append_routes + alertManagerConfig.route.routes + route all to syn.owner
----

`additional_alert_matchers` is a list of additional alert matchers to add to the generated routes.
This can be used to handle special cases where the auto-discovery feature does not work as expected.
For example, a label on the alert can indicate that it should go to a different team than the namespace suggests.

[source,yaml]
----
alertManagerAutoDiscovery:
additional_alert_matchers:
- 'syn_team = ""'
# becomes
- continue: true
matchers:
- syn_team = ""
- namespace =~ "my-ns"
receiver: team_default_lovable-lizards
- continue: false
matchers:
- syn_team = ""
- namespace =~ "my-ns"
receiver: __component_openshift4_monitoring_null
----


== `alerts`

[horizontal]
Expand Down
68 changes: 68 additions & 0 deletions tests/capacity-alerts.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
applications:
- non-existing
- no-ns
- ns-string
- ns-object
- base as ns-in-base
- base as ns-overridden
- non-existing as still-non-existing
- shared-responsibility
- same-ns-1
- same-ns-2

parameters:
kapitan:
dependencies:
Expand All @@ -13,3 +25,59 @@ parameters:
openshift4_monitoring:
capacityAlerts:
enabled: true
alertManagerConfig:
route:
routes:
- receiver: other
matchers:
- other = "true"
alertManagerAutoDiscovery:
debug_config_map: true
team_receiver_format: team_default_%s
additional_alert_matchers:
- 'syn_team = ""'
prepend_routes:
- matchers:
- alertname = Watchdog
repeat_interval: 60s
receiver: heartbeat

syn:
owner: clumsy-donkeys
teams:
chubby-cockroaches:
instances:
- ns-in-base
- ns-overridden
- shared-responsibility
lovable-lizards:
instances:
- ns-object
- shared-responsibility
- same-ns-2

no_ns: {}

ns_string:
namespace: "ns-string"

ns_object:
namespace:
name: "ns-object"

base:
namespace: base

ns_in_base: {}

ns_overridden:
namespace: overridden

shared_responsibility:
namespace: shared-responsibility

same_ns_1:
namespace: same-ns

same_ns_2:
namespace: same-ns
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,39 @@ stringData:
"severity": "warning"
"target_match_re":
"severity": "info"
"receivers":
- "name": "__component_openshift4_monitoring_null"
"route":
"group_interval": "5s"
"group_wait": "0s"
"repeat_interval": "10m"
"routes":
- "matchers":
- "alertname = Watchdog"
"receiver": "heartbeat"
"repeat_interval": "60s"
- "continue": true
"matchers":
- "syn_team = \"\""
- "namespace =~ \"base|overridden|shared-responsibility\""
"receiver": "team_default_chubby-cockroaches"
- "continue": true
"matchers":
- "syn_team = \"\""
- "namespace =~ \"ns-string|same-ns\""
"receiver": "team_default_clumsy-donkeys"
- "continue": true
"matchers":
- "syn_team = \"\""
- "namespace =~ \"ns-object|shared-responsibility|same-ns\""
"receiver": "team_default_lovable-lizards"
- "continue": false
"matchers":
- "syn_team = \"\""
- "namespace =~ \"base|overridden|shared-responsibility|ns-string|same-ns|ns-object|shared-responsibility|same-ns\""
"receiver": "__component_openshift4_monitoring_null"
- "matchers":
- "other = \"true\""
"receiver": "other"
- "receiver": "team_default_clumsy-donkeys"
type: Opaque
Loading

0 comments on commit 63c8ddb

Please sign in to comment.