Commit

Implement canary storage SLO

bastjan committed Jul 25, 2024
1 parent 939f5d3 commit 741d7f9
Showing 13 changed files with 1,127 additions and 53 deletions.
15 changes: 15 additions & 0 deletions class/defaults.yml
@@ -52,6 +52,19 @@ parameters:
_sli:
volume_plugin: "kubernetes.io/csi.+"
operation_name: ".+"
canary:
enabled: true
objective: 99.0
_sli:
volume_plugins_default_params:
size: 1Gi
accessMode: ReadWriteOnce
interval: 1m
maxPodCompletionTimeout: 3m

volume_plugins:
# Empty value for the default plugin
"": {}
ingress:
canary:
enabled: true
@@ -76,6 +89,8 @@ parameters:

specs: {}

secrets: {}

controller_node_affinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
83 changes: 83 additions & 0 deletions component/main.jsonnet
@@ -100,6 +100,87 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', '
},
};

local storageCanaries = std.flattenArrays(std.filterMap(
function(storageclass) params.slos.storage.canary._sli.volume_plugins[storageclass] != null,
function(storageclass)
local p = params.slos.storage.canary._sli.volume_plugins_default_params + com.makeMergeable(params.slos.storage.canary._sli.volume_plugins[storageclass]);
local manifestName = 'canary-%s' % if storageclass == '' then 'default' else storageclass;
[
kube.PersistentVolumeClaim(manifestName) {
metadata+: {
namespace: params.namespace,
},
spec+: {
accessModes: [ p.accessMode ],
resources: {
requests: {
storage: p.size,
},
},
[if storageclass != '' then 'storageClassName']: storageclass,
},
},
kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', manifestName) {
metadata+: {
namespace: params.namespace,
},
spec: {
interval: p.interval,
maxPodCompletionTimeout: p.maxPodCompletionTimeout,
forbidParallelRuns: true,
podTemplate: {
metadata: {},
spec: {
affinity: {
nodeAffinity: params.canary_node_affinity,
},
containers: [
{
command: [
'sh',
'-c',
],
args: [
std.join(';', [
'echo test > /testmount/test',
'rm -f /testmount/test',
]),
],
image: 'image-registry.openshift-image-registry.svc:5000/%s/%s:latest' % [ canaryImageStream.metadata.namespace, canaryImageStream.metadata.name ],
imagePullPolicy: 'Always',
name: 'storage',
resources: {},
terminationMessagePath: '/dev/termination-log',
terminationMessagePolicy: 'File',
volumeMounts: [
{
mountPath: '/testmount',
name: 'test',
},
],
},
],
volumes: [
{
name: 'test',
persistentVolumeClaim: {
claimName: manifestName,
},
},
],
restartPolicy: 'Never',
schedulerName: 'default-scheduler',
securityContext: {},
terminationGracePeriodSeconds: 10,
},
},
},
},
],
std.objectFields(params.slos.storage.canary._sli.volume_plugins)
));

{
'00_namespace': kube.Namespace(params.namespace) {
metadata+: {
@@ -112,8 +193,10 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', '
},
},
},
'10_secrets': com.generateResources(params.secrets, function(name) com.namespaced(params.namespace, kube.Secret(name))),
[if params.canary_scheduler_controller.enabled then '30_canaryImageStream']: canaryImageStream,
[if params.canary_scheduler_controller.enabled then '30_canary']: canary,
[if params.canary_scheduler_controller.enabled then '32_storageCanary']: storageCanaries,
}
+ blackbox.deployment
+ blackbox.probes
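For illustration, here's a sketch of what the `storageCanaries` loop above would render for a hypothetical `ssd` storage class with default parameters; `<namespace>` stands in for `params.namespace` and the container image reference is abbreviated:

----
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: canary-ssd
  namespace: <namespace>
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: ssd
---
apiVersion: monitoring.appuio.io/v1beta1
kind: SchedulerCanary
metadata:
  name: canary-ssd
  namespace: <namespace>
spec:
  interval: 1m
  maxPodCompletionTimeout: 3m
  forbidParallelRuns: true
  podTemplate:
    spec:
      containers:
        - name: storage
          command: ["sh", "-c"]
          # writes a file to the mounted PVC, then removes it
          args: ["echo test > /testmount/test;rm -f /testmount/test"]
          volumeMounts:
            - mountPath: /testmount
              name: test
      volumes:
        - name: test
          persistentVolumeClaim:
            claimName: canary-ssd
      restartPolicy: Never
----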
106 changes: 69 additions & 37 deletions component/slos.libsonnet
@@ -30,8 +30,8 @@ local defaultSlos = {
sli: {
events: {
local queryParams = { namespace: params.namespace },
error_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s",reason="timed_out"}[{{.window}}]))' % queryParams,
total_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s",reason!="completed"}[{{.window}}]))' % queryParams,
total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
},
},
alerting: {
@@ -52,43 +52,75 @@ local defaultSlos = {
sloth_input: {
version: 'prometheus/v1',
service: 'storage',
_slos: {
'csi-operations': {
description: 'SLO based on number of failed csi operations',
sli: {
events: {
// We use `or on() vector(0)` here to ensure we always have a
// value for the error query, even if there's 0 failing storage
// operations in a time window. This is necessary because the
// timeseries for status="fail-unknown" may not exist at all if
// there's no failures.
error_query:
'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)'
% [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
total_query:
// We use (sum() > 0) or on() vector(1)) to guard against time
// windows where we have 0 storage operations, which would
// otherwise result in a division by 0. We do this because,
// dividing by 0 results in the whole expression evaluating to
// NaN which breaks the SLO alert.
// Note that we can safely divide by 1, since there can't be
// any failed operations when there's no operations at all, so
// if the `vector(1)` is used, the expression will always
// reduce to 0/1.
'(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' %
[ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
},
_slos: std.foldl(
function(prev, plugin)
local storageClassName = if plugin == '' then 'default' else plugin;
local canaryName = 'canary-%s' % storageClassName;
prev {
[canaryName]: {
description: 'OpenShift workload schedulability SLO based on github.com/appuio/scheduler-canary-controller canary',
sli: {
events: {
local queryParams = { name: canaryName, namespace: params.namespace },
error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s",reason!="completed"}[{{.window}}]))' % queryParams,
total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
},
},
alerting: {
name: 'SLO_StorageCanaryWorkloadTimesOut',
annotations: {
summary: 'Storage canary workloads time out.',
},
labels: {
storageclass: storageClassName,
},
page_alert: {},
ticket_alert: {},
},
} + config.canary,
},
alerting: {
name: 'SLO_StorageOperationHighErrorRate',
annotations: {
summary: 'High storage operation error rate',
std.filter(
function(plugin) config.canary._sli.volume_plugins[plugin] != null,
std.objectFields(config.canary._sli.volume_plugins)
),
{
'csi-operations': {
description: 'SLO based on number of failed csi operations',
sli: {
events: {
// We use `or on() vector(0)` here to ensure we always have a
// value for the error query, even if there's 0 failing storage
// operations in a time window. This is necessary because the
// timeseries for status="fail-unknown" may not exist at all if
// there's no failures.
error_query:
'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)'
% [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
total_query:
// We use (sum() > 0) or on() vector(1)) to guard against time
// windows where we have 0 storage operations, which would
// otherwise result in a division by 0. We do this because,
// dividing by 0 results in the whole expression evaluating to
// NaN which breaks the SLO alert.
// Note that we can safely divide by 1, since there can't be
// any failed operations when there's no operations at all, so
// if the `vector(1)` is used, the expression will always
// reduce to 0/1.
'(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' %
[ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
},
},
page_alert: {},
ticket_alert: {},
},
} + config['csi-operations'],
},
alerting: {
name: 'SLO_StorageOperationHighErrorRate',
annotations: {
summary: 'High storage operation error rate',
},
page_alert: {},
ticket_alert: {},
},
} + config['csi-operations'],
}
),
},
},
ingress: {
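For a concrete picture of what the fold above generates, here's a hedged sketch of the rendered SLI queries for the default storage class (key `""`, hence `canary-default`), with `<namespace>` standing in for `params.namespace` and a 5m example window substituted for `{{.window}}`:

----
error_query: >-
  sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary-default",exported_namespace="<namespace>",reason!="completed"}[5m]))
total_query: >-
  sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary-default",exported_namespace="<namespace>"}[5m]))
----

The ratio of the two is the fraction of canary pods that didn't complete in the window, which sloth turns into burn-rate recording rules and the `SLO_StorageCanaryWorkloadTimesOut` alerts.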
78 changes: 77 additions & 1 deletion docs/modules/ROOT/pages/references/parameters.adoc
@@ -28,6 +28,28 @@ Sloth isn't actually deployed to the cluster, but used to render `PrometheusRule
The entry in `images` allows Renovate to create version upgrade PRs.
The Sloth version can be overridden by the `tag` parameter.


== `secrets`

[horizontal]
type:: dictionary
default:: `{}`
example::
+
[source,yaml]
----
secrets:
canary-ssd-encrypted-luks-key:
stringData:
luksKey: XXXXXX
----

This parameter allows creating arbitrary `Secret` resources.

The dictionary keys are used as `metadata.name` for the resulting `Secret` resources.
The secrets are created in the namespace given by the `namespace` parameter.
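
For illustration, the example above would render roughly the following `Secret` (a sketch, assuming standard `kube.Secret` defaults; `<namespace>` stands in for the `namespace` parameter):

[source,yaml]
----
apiVersion: v1
kind: Secret
metadata:
  name: canary-ssd-encrypted-luks-key
  namespace: <namespace>
type: Opaque
stringData:
  luksKey: XXXXXX
----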


== `slos`

[horizontal]
@@ -51,7 +73,7 @@ csi-operations:
operation_name: ".+"
----

The configuration for the csi-operations SLO.
The configuration for the csi-operations storage SLO.

The SLO can be disabled by setting `enabled` to false.

@@ -62,6 +84,60 @@ Any additional field is added directly to the `slo` input for sloth.

NOTE: Look at xref:runbooks/storage.adoc#csi-operations[the runbook] for an explanation of this SLO.

=== `slos.storage.canary`

[horizontal]
type:: dictionary
default::
+
[source,yaml]
----
canary:
enabled: true
objective: 99.0
_sli:
volume_plugins_default_params:
size: 1Gi
accessMode: ReadWriteOnce
interval: 1m
maxPodCompletionTimeout: 3m
volume_plugins:
# Empty value for the default plugin
"": {}
----
example::
+
[source,yaml]
----
canary:
enabled: true
objective: 99.0
_sli:
volume_plugins:
# Disable the canary for the default storage class
"": null
# Enable the canaries for ssd and bulk storage classes
ssd: {}
bulk:
size: 10Gi
----

The configuration for the canary storage SLO.

The SLO can be disabled by setting `enabled` to false.

The canary SLO is measured by creating a PVC for every configured storage class and periodically running a pod that writes, and then deletes, a file on the respective PVC.
You can configure which volume plugins are tested with `_sli.volume_plugins`.
Each key is a storage class name; each value is a dictionary that can override the default parameters from `volume_plugins_default_params`, as shown in the worked example below.
The empty key (`""`) stands for the default storage class.
Setting a value to `null` disables the canary for that storage class.
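
As a worked example, the `bulk` entry from the example above is merged over `volume_plugins_default_params`, giving these effective parameters:

[source,yaml]
----
size: 10Gi # overridden by the `bulk` entry
accessMode: ReadWriteOnce # inherited from the defaults
interval: 1m
maxPodCompletionTimeout: 3m
----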

Any additional field is added directly to the `slo` input for sloth.

NOTE: Look at xref:runbooks/storage.adoc#canaries[the runbook] for an explanation of this SLO.


=== `slos.kubernetes_api.requests`

[horizontal]