Skip to content

Commit

Permalink
Add tool to generate metrics documentation (#2043)
Browse files Browse the repository at this point in the history
Signed-off-by: João Vilaça <[email protected]>
  • Loading branch information
machadovilaca authored Jan 26, 2022
1 parent f3e54f7 commit 2b7f786
Show file tree
Hide file tree
Showing 4 changed files with 247 additions and 93 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ builder-push:
openshift-ci-image-push:
./hack/build/osci-image-builder.sh

# Regenerate doc/metrics.md from the output of the metricsdocs tool
# (the tool is built first through the build-docgen prerequisite).
generate-doc: build-docgen
_out/metricsdocs > doc/metrics.md

# Build the metrics documentation generator from ./tools/metricsdocs
# into _out/metricsdocs.
build-docgen:
go build -ldflags="-s -w" -o _out/metricsdocs ./tools/metricsdocs

help:
@echo "Usage: make [Targets ...]"
@echo " all "
Expand Down
19 changes: 19 additions & 0 deletions doc/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Containerized Data Importer metrics
This document aims to help users that are not familiar with metrics exposed by the Containerized Data Importer.
All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflect exactly what is being exposed.

## Containerized Data Importer Metrics List
### kubevirt_cdi_clone_dv_unusual_restartcount_total
Total restart count in CDI Data Volume cloner pod
### kubevirt_cdi_dataimportcron_outdated_total
Total count of outdated DataImportCron imports
### kubevirt_cdi_import_dv_unusual_restartcount_total
Total restart count in CDI Data Volume importer pod
### kubevirt_cdi_operator_up_total
CDI operator status
### kubevirt_cdi_upload_dv_unusual_restartcount_total
Total restart count in CDI Data Volume upload server pod
## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.

If the generated documentation does not suit the new metric, please adapt `tools/metricsdocs` to your needs.
218 changes: 125 additions & 93 deletions pkg/operator/controller/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,129 @@ func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string)
return true, nil
}

// RecordRulesDesc describes a single CDI Prometheus recording rule.
type RecordRulesDesc struct {
// Name is the name under which the rule's result is recorded.
Name string
// Expr is the expression string passed to generateRecordRule for this rule.
Expr string
// Description is the human-readable explanation of the metric; it is the
// text emitted by tools/metricsdocs into the generated documentation.
Description string
}

// GetRecordRulesDesc returns the descriptions of the Prometheus recording
// rules defined by CDI, in a fixed order. The namespace is only interpolated
// into the operator "up" expression; the other expressions do not use it.
func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
	// The restart threshold is rendered once and shared by the importer,
	// upload and cloner restart-count rules.
	threshold := strconv.Itoa(common.UnusualRestartCountThreshold)

	operatorUp := RecordRulesDesc{
		Name:        "kubevirt_cdi_operator_up_total",
		Expr:        fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace),
		Description: "CDI operator status",
	}

	importRestarts := RecordRulesDesc{
		Name:        "kubevirt_cdi_import_dv_unusual_restartcount_total",
		Expr:        fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.ImporterPodName, common.ImporterPodName, threshold),
		Description: "Total restart count in CDI Data Volume importer pod",
	}

	uploadRestarts := RecordRulesDesc{
		Name:        "kubevirt_cdi_upload_dv_unusual_restartcount_total",
		Expr:        fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.UploadPodName, common.UploadServerPodname, threshold),
		Description: "Total restart count in CDI Data Volume upload server pod",
	}

	// Unlike the two rules above, cloner pods are matched by name suffix.
	cloneRestarts := RecordRulesDesc{
		Name:        "kubevirt_cdi_clone_dv_unusual_restartcount_total",
		Expr:        fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, threshold),
		Description: "Total restart count in CDI Data Volume cloner pod",
	}

	cronOutdated := RecordRulesDesc{
		Name:        "kubevirt_cdi_dataimportcron_outdated_total",
		Expr:        "sum(kubevirt_cdi_dataimportcron_outdated or vector(0))",
		Description: "Total count of outdated DataImportCron imports",
	}

	return []RecordRulesDesc{
		operatorUp,
		importRestarts,
		uploadRestarts,
		cloneRestarts,
		cronOutdated,
	}
}

// getRecordRules converts every description returned by GetRecordRulesDesc
// for the given namespace into a Prometheus rule via generateRecordRule,
// preserving the description order.
func getRecordRules(namespace string) []promv1.Rule {
	descs := GetRecordRulesDesc(namespace)

	var rules []promv1.Rule
	for i := range descs {
		rules = append(rules, generateRecordRule(descs[i].Name, descs[i].Expr))
	}

	return rules
}

// getAlertRules returns the CDI Prometheus alerting rules in a fixed order.
// Every alert carries a "summary" and a "runbook_url" annotation (the latter
// is runbookURLBasePath followed by the alert name) and is labelled with its
// severity plus the fixed part-of and component labels.
func getAlertRules() []promv1.Rule {
	// annotations builds the annotation map; a fresh map is returned per call.
	annotations := func(summary, alertName string) map[string]string {
		return map[string]string{
			"summary":     summary,
			"runbook_url": runbookURLBasePath + alertName,
		}
	}

	// labels builds the label map; only the severity differs between alerts.
	labels := func(severity string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:  severity,
			partOfAlertLabelKey:    partOfAlertLabelValue,
			componentAlertLabelKey: componentAlertLabelValue,
		}
	}

	operatorDown := generateAlertRule(
		"CDIOperatorDown",
		"kubevirt_cdi_operator_up_total == 0",
		"5m",
		annotations("CDI operator is down", "CDIOperatorDown"),
		labels("warning"),
	)

	notReady := generateAlertRule(
		"CDINotReady",
		"kubevirt_cdi_cr_ready == 0",
		"5m",
		annotations("CDI is not available to use", "CDINotReady"),
		labels("warning"),
	)

	unusualRestarts := generateAlertRule(
		"CDIDataVolumeUnusualRestartCount",
		"kubevirt_cdi_import_dv_unusual_restartcount_total > 0 or kubevirt_cdi_upload_dv_unusual_restartcount_total > 0 or kubevirt_cdi_clone_dv_unusual_restartcount_total > 0",
		"5m",
		annotations("Cluster has DVs with an unusual restart count, meaning they are probably failing and need to be investigated", "CDIDataVolumeUnusualRestartCount"),
		labels("warning"),
	)

	profilesIncomplete := generateAlertRule(
		"CDIStorageProfilesIncomplete",
		"kubevirt_cdi_incomplete_storageprofiles_total > 0",
		"5m",
		annotations("StorageProfiles are incomplete, accessMode/volumeMode cannot be inferred by CDI", "CDIStorageProfilesIncomplete"),
		labels("info"),
	)

	// The only alert that waits 15m instead of 5m before firing.
	cronOutdated := generateAlertRule(
		"CDIDataImportCronOutdated",
		"kubevirt_cdi_dataimportcron_outdated_total > 0",
		"15m",
		annotations("DataImportCron latest imports are outdated", "CDIDataImportCronOutdated"),
		labels("info"),
	)

	return []promv1.Rule{
		operatorDown,
		notReady,
		unusualRestarts,
		profilesIncomplete,
		cronOutdated,
	}
}

func newPrometheusRule(namespace string) *promv1.PrometheusRule {
return &promv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -141,99 +264,8 @@ func newPrometheusRule(namespace string) *promv1.PrometheusRule {
Spec: promv1.PrometheusRuleSpec{
Groups: []promv1.RuleGroup{
{
Name: "cdi.rules",
Rules: []promv1.Rule{
generateRecordRule(
"kubevirt_cdi_operator_up_total",
fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace),
),
generateAlertRule(
"CDIOperatorDown",
"kubevirt_cdi_operator_up_total == 0",
"5m",
map[string]string{
"summary": "CDI operator is down",
"runbook_url": runbookURLBasePath + "CDIOperatorDown",
},
map[string]string{
severityAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDINotReady",
"kubevirt_cdi_cr_ready == 0",
"5m",
map[string]string{
"summary": "CDI is not available to use",
"runbook_url": runbookURLBasePath + "CDINotReady",
},
map[string]string{
severityAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateRecordRule(
"kubevirt_cdi_import_dv_unusual_restartcount_total",
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
generateRecordRule(
"kubevirt_cdi_upload_dv_unusual_restartcount_total",
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
generateRecordRule(
"kubevirt_cdi_clone_dv_unusual_restartcount_total",
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
generateAlertRule(
"CDIDataVolumeUnusualRestartCount",
"kubevirt_cdi_import_dv_unusual_restartcount_total > 0 or kubevirt_cdi_upload_dv_unusual_restartcount_total > 0 or kubevirt_cdi_clone_dv_unusual_restartcount_total > 0",
"5m",
map[string]string{
"summary": "Cluster has DVs with an unusual restart count, meaning they are probably failing and need to be investigated",
"runbook_url": runbookURLBasePath + "CDIDataVolumeUnusualRestartCount",
},
map[string]string{
severityAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIStorageProfilesIncomplete",
"kubevirt_cdi_incomplete_storageprofiles_total > 0",
"5m",
map[string]string{
"summary": "StorageProfiles are incomplete, accessMode/volumeMode cannot be inferred by CDI",
"runbook_url": runbookURLBasePath + "CDIStorageProfilesIncomplete",
},
map[string]string{
severityAlertLabelKey: "info",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateRecordRule(
"kubevirt_cdi_dataimportcron_outdated_total",
"sum(kubevirt_cdi_dataimportcron_outdated or vector(0))",
),
generateAlertRule(
"CDIDataImportCronOutdated",
"kubevirt_cdi_dataimportcron_outdated_total > 0",
"15m",
map[string]string{
"summary": "DataImportCron latest imports are outdated",
"runbook_url": runbookURLBasePath + "CDIDataImportCronOutdated",
},
map[string]string{
severityAlertLabelKey: "info",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
},
Name: "cdi.rules",
Rules: append(getRecordRules(namespace), getAlertRules()...),
},
},
},
Expand Down
97 changes: 97 additions & 0 deletions tools/metricsdocs/metricsdocs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package main

import (
"fmt"
"kubevirt.io/containerized-data-importer/pkg/operator/controller"
"sort"
"strings"
)

// Constant fragments of the generated Markdown document: writeToFile prints
// opening, then the metric entries, then footer.
const (
// title is the top-level Markdown heading of the document.
title = "# Containerized Data Importer metrics\n"
// background is the introductory paragraph that follows the title.
background = "This document aims to help users that are not familiar with metrics exposed by the Containerized Data Importer.\n" +
"All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflects exactly what is being exposed.\n\n"

// KVSpecificMetrics is the heading that introduces the list of metrics.
KVSpecificMetrics = "## Containerized Data Importer Metrics List\n"

// opening is everything printed before the first metric entry.
opening = title +
background +
KVSpecificMetrics

// footerHeading and footerContent make up the closing section, which tells
// developers how to regenerate the document.
footerHeading = "## Developing new metrics\n"
footerContent = "After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.\n\n" +
"If you feel that the new metric doesn't follow these rules, please change `tools/metricsdocs` with your needs.\n"

// footer is everything printed after the last metric entry.
footer = footerHeading + footerContent
)

func main() {
metricsList := recordRulesDescToMetricList(controller.GetRecordRulesDesc(""))
sort.Sort(metricsList)
writeToFile(metricsList)
}

// writeToFile prints the complete document: the opening section, one entry
// per metric in list order, and the footer. Despite its name it writes to
// standard output; the Makefile redirects that output into doc/metrics.md.
func writeToFile(metricsList metricList) {
	fmt.Print(opening)

	for _, entry := range metricsList {
		entry.writeOut()
	}

	fmt.Print(footer)
}

// metric is one documented metric, rendered as a Markdown entry by writeOut.
type metric struct {
// name is the metric name; it is printed as the "###" heading and is the
// key metricList sorts by.
name string
// description is the text printed on the line below the heading.
description string
}

// recordRulesDescToMetricList converts each recording-rule description into
// a metric, keeping the input order. The result always has len(mdl) entries.
func recordRulesDescToMetricList(mdl []controller.RecordRulesDesc) metricList {
	list := make(metricList, 0, len(mdl))
	for idx := range mdl {
		list = append(list, metricDescriptionToMetric(mdl[idx]))
	}

	return list
}

// metricDescriptionToMetric copies the name and description of a recording
// rule into a metric; the rule expression is not carried over.
func metricDescriptionToMetric(rrd controller.RecordRulesDesc) metric {
	var m metric
	m.name = rrd.Name
	m.description = rrd.Description
	return m
}

// writeOut prints the metric to standard output as a Markdown entry: a
// "### <name>" heading line followed by the description line.
func (m metric) writeOut() {
	fmt.Printf("### %s\n%s\n", m.name, m.description)
}

// metricList is a list of metrics that implements sort.Interface, ordering
// its elements by metric name.
type metricList []metric

// Len implements sort.Interface.Len: the number of metrics in the list.
func (m metricList) Len() int { return len(m) }

// Less implements sort.Interface.Less: the metric at i sorts before the one
// at j when its name is smaller in plain string comparison.
func (m metricList) Less(i, j int) bool {
	left, right := m[i].name, m[j].name
	return left < right
}

// Swap implements sort.Interface.Swap: exchanges the metrics at i and j.
func (m metricList) Swap(i, j int) {
	held := m[i]
	m[i] = m[j]
	m[j] = held
}

// add parses a space-separated line and appends the metric it describes.
//
// The third field (index 2) becomes the metric name. All following fields,
// re-joined with single spaces, become the description, and the first of
// them is title-cased. This layout presumably matches a Prometheus
// "# HELP <name> <description>" line — TODO confirm, no caller is visible
// in this file.
//
// A line with fewer than four fields has no description to extract; it is
// ignored instead of triggering an index-out-of-range panic.
func (m *metricList) add(line string) {
	fields := strings.Split(line, " ")
	if len(fields) < 4 {
		return
	}

	fields[3] = strings.Title(fields[3])
	*m = append(*m, metric{
		name:        fields[2],
		description: strings.Join(fields[3:], " "),
	})
}

// writeOut prints every metric of the list, in list order, by calling each
// element's writeOut.
func (m metricList) writeOut() {
	for idx := range m {
		m[idx].writeOut()
	}
}

0 comments on commit 2b7f786

Please sign in to comment.