Extend the VSHN Managed OpenShift queries to tolerate changes to unused labels

We extend the queries with `max by (relevant_label_set)` where necessary to ensure that changes to unused labels (e.g. `cluster_name`) don't break the billing queries. Note that the list of labels which we keep for `node_cpu_info` must contain `core` and `cpu` for the vCPU-Hour billed clusters. The same list must contain `core` but must not contain `cpu` for the Core-Hour billed clusters.
vshn · Oct 31, 2024 · 2a81b8b · 2a81b8b
1 parent 7017b62
commit 2a81b8b
Show file tree

Hide file tree

Showing 5 changed files with 818 additions and 349 deletions.
diff --git a/main.yml b/main.yml
@@ -507,21 +507,26 @@ parameters:
           max_over_time(
             # Sum the vCPUs by cluster
             sum by(cluster_id, role) (
-                # Get the node vCPUs
-                node_cpu_info
+                # Get the node vCPUs (do not remove `cpu` and `core` from the
+                # `by` clause, otherwise the vCPU amount is incorrect)
+                max by (cluster_id, instance, cpu, core) (node_cpu_info)
                 # Limit to worker nodes only
                 * on (cluster_id, instance) group_left(role) (
                     # node_cpu_info and kube_node_role use different labels to identify the node.
-                    label_join(kube_node_role{role="%(role)s"}, "instance", "", "node")
+                    max by (role, instance, cluster_id) (
+                        label_join(kube_node_role{role="%(role)s"}, "instance", "", "node")
+                    )
                 )
             )[59m:1m]
           )
           # Pull in the APPUiO managed info labels
-          * on(cluster_id) group_left(sales_order) appuio_managed_info{
+          * on(cluster_id) group_left(sales_order) max by (cluster_id, sales_order) (
+            appuio_managed_info{
               cloud_provider=~"%(cloud_provider)s",
               distribution=~"%(distribution)s",
               vshn_service_level="%(vshn_service_level)s",
-          }
+            }
+          )
           , "flavor_display", "%(flavor_display)s", "", "")
 
       appuio_managed_core:
@@ -547,21 +552,26 @@ parameters:
           max_over_time(
             # Sum the cores by cluster
             sum by(cluster_id, role) (
-                # Get the node cores (without hyperthreads)
-                max without (cpu) (node_cpu_info)
+                # Get the node cores (without hyperthreads, otherwise we'd
+                # include `cpu` in the `by`)
+                max by (cluster_id, instance, core) (node_cpu_info)
                 # Limit to worker nodes only
                 * on (cluster_id, instance) group_left(role) (
                     # node_cpu_info and kube_node_role use different labels to identify the node.
-                    label_join(kube_node_role{role="worker"}, "instance", "", "node")
+                    max by (cluster_id, instance, role) (
+                        label_join(kube_node_role{role="worker"}, "instance", "", "node")
+                    )
                 )
             )[59m:1m]
           )
           # Pull in the APPUiO managed info labels
-          * on(cluster_id) group_left(sales_order) appuio_managed_info{
+          * on(cluster_id) group_left(sales_order) max by (cluster_id, sales_order) (
+            appuio_managed_info{
               cloud_provider=~"%(cloud_provider)s",
               distribution=~"%(distribution)s",
               vshn_service_level="%(vshn_service_level)s",
-          }
+            }
+          )
 
       legacy_appuio_rke_cluster:
         enabled: true
@@ -1052,18 +1062,22 @@ parameters:
             # Sum the vCPUs by cluster
             sum by(cluster_id) (
                 # Get the node vCPUs
-                node_cpu_info
+                max by (cluster_id, instance, core, cpu) (node_cpu_info)
                 # Limit to worker nodes only
                 * on (cluster_id, instance) group_left(role) (
                     # node_cpu_info and kube_node_role use different labels to identify the node.
-                    label_join(kube_node_role{role=~"app|storage"}, "instance", "", "node")
+                    max by (cluster_id, instance, role) (
+                        label_join(kube_node_role{role=~"app|storage"}, "instance", "", "node")
+                    )
                 )
             )[59m:1m]
           )
           # Pull in the APPUiO managed info labels
-          * on(cluster_id) group_left(sales_order) appuio_managed_info{
+          * on(cluster_id) group_left(sales_order) max by (cluster_id, sales_order) (
+            appuio_managed_info{
               vshn_service_level=~"%(vshn_service_level)s",
               cilium_addons=~".*%(cilium_addon)s.*"
-          }
+            }
+          )
           , "cilium_addon", "%(cilium_addon)s", "", "")
           , "addon_display", "%(addon_display)s", "", "")
diff --git a/querytests/appuio_managed_core.jsonnet b/querytests/appuio_managed_core.jsonnet
@@ -1,12 +1,12 @@
-local c = import 'promtest.libsonnet'; // provided by promtest-jsonnet
+local c = import 'promtest.libsonnet';  // provided by promtest-jsonnet
 
-local config = std.extVar("main.yml");
-local queryPattern = config.parameters.appuio_reporting_aldebaran.rules.appuio_managed_core.query_pattern ;
+local config = std.extVar('main.yml');
+local queryPattern = config.parameters.appuio_reporting_aldebaran.rules.appuio_managed_core.query_pattern;
 
 local appParams = {
-  cloud_provider: "baremetal",
-  distribution: "oke",
-  vshn_service_level: "best_effort",
+  cloud_provider: 'baremetal',
+  distribution: 'oke',
+  vshn_service_level: 'best_effort',
 };
 
 local commonLabels = {
@@ -56,9 +56,59 @@ local baseSeries = {
   appuioInfoLabel: c.series('appuio_managed_info', infoLabels, '1x120'),
 };
 
+local displayNameChange = {
+  appNodeRoleLabel: c.series('kube_node_role', commonLabels {
+    node: 'app-test',
+    role: 'worker',
+    cluster_name: 'foo',
+  }, '1x60 _x60') + c.series('kube_node_role', commonLabels {
+    node: 'app-test',
+    role: 'worker',
+    cluster_name: 'Foo',
+  }, '_x60 1x60'),
+
+  appNodeCPUInfoLabel0: c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    cpu: '1',
+    core: '0',
+    cluster_name: 'foo',
+  }, '1x60 _x60') + c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    cpu: '1',
+    core: '0',
+    cluster_name: 'Foo',
+  }, '_x60 1x60'),
+  appNodeCPUInfoLabel2: c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    cpu: '2',
+    core: '0',
+    cluster_name: 'foo',
+  }, '1x60 _x60') + c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    cpu: '2',
+    core: '0',
+    cluster_name: 'Foo',
+  }, '_x60 1x60'),
+  appNodeCPUInfoLabel1: c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    cpu: '1',
+    core: '1',
+    cluster_name: 'foo',
+  }, '1x60 _x60') + c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    cpu: '1',
+    core: '1',
+    cluster_name: 'Foo',
+  }, '_x60 1x60'),
+
+  appuioInfoLabel:
+    c.series('appuio_managed_info', infoLabels { cluster_name: 'foo' }, '1x60 _x60') +
+    c.series('appuio_managed_info', infoLabels { cluster_name: 'Foo' }, '_x60 1x60'),
+};
+
 local baseCalculatedLabels = {
-  cluster_id: "c-managed-openshift",
-  sales_order: "SO123123",
+  cluster_id: 'c-managed-openshift',
+  sales_order: 'SO123123',
 };
 
 {
@@ -76,10 +126,23 @@ local baseCalculatedLabels = {
         },
       ]
     ),
+    c.test(
+      'two app CPUs with display name change',
+      baseSeries + displayNameChange,
+      queryPattern % appParams,
+      [
+        {
+          labels: c.formatLabels(baseCalculatedLabels {
+            role: 'worker',
+          }),
+          value: 2,
+        },
+      ]
+    ),
     c.test(
       'no openshift',
       baseSeries {
-        appuioInfoLabel: c.series('appuio_managed_info', infoLabels {distribution: 'openshift4'}, '1x120')
+        appuioInfoLabel: c.series('appuio_managed_info', infoLabels { distribution: 'openshift4' }, '1x120'),
       },
       queryPattern % appParams,
       [

diff --git a/querytests/appuio_managed_vcpu.jsonnet b/querytests/appuio_managed_vcpu.jsonnet
@@ -1,21 +1,21 @@
-local c = import 'promtest.libsonnet'; // provided by promtest-jsonnet
+local c = import 'promtest.libsonnet';  // provided by promtest-jsonnet
 
-local config = std.extVar("main.yml");
-local queryPattern = config.parameters.appuio_reporting_aldebaran.rules.appuio_managed_vcpu.query_pattern ;
+local config = std.extVar('main.yml');
+local queryPattern = config.parameters.appuio_reporting_aldebaran.rules.appuio_managed_vcpu.query_pattern;
 
 local appParams = {
-  cloud_provider: "cloudscale",
-  vshn_service_level: "best_effort",
-  distribution: "openshift4",
-  role: "app",
-  flavor_display: "OpenShift Container Platform"
+  cloud_provider: 'cloudscale',
+  vshn_service_level: 'best_effort',
+  distribution: 'openshift4',
+  role: 'app',
+  flavor_display: 'OpenShift Container Platform',
 };
 local storageParams = {
-  cloud_provider: "cloudscale",
-  vshn_service_level: "best_effort",
-  distribution: "openshift4",
-  role: "storage",
-  flavor_display: "OpenShift Container Platform"
+  cloud_provider: 'cloudscale',
+  vshn_service_level: 'best_effort',
+  distribution: 'openshift4',
+  role: 'storage',
+  flavor_display: 'OpenShift Container Platform',
 };
 
 local commonLabels = {
@@ -58,10 +58,44 @@ local baseSeries = {
   appuioInfoLabel: c.series('appuio_managed_info', infoLabels, '1x120'),
 };
 
+local displayNameChange = {
+  appNodeRoleLabel: c.series('kube_node_role', commonLabels {
+    node: 'app-test',
+    role: 'app',
+    cluster_name: 'foo',
+  }, '1x60 _x60') + c.series('kube_node_role', commonLabels {
+    node: 'app-test',
+    role: 'app',
+    cluster_name: 'Foo',
+  }, '_x60 1x60'),
+  appNodeCPUInfoLabel0: c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    core: '0',
+    cluster_name: 'foo',
+  }, '1x60 _x60') + c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    core: '0',
+    cluster_name: 'Foo',
+  }, '_x60 1x60'),
+  appNodeCPUInfoLabel1: c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    core: '1',
+    cluster_name: 'foo',
+  }, '1x60 _x60') + c.series('node_cpu_info', commonLabels {
+    instance: 'app-test',
+    core: '1',
+    cluster_name: 'Foo',
+  }, '_x60 1x60'),
+
+  appuioInfoLabel:
+    c.series('appuio_managed_info', infoLabels { cluster_name: 'foo' }, '1x60 _x60') +
+    c.series('appuio_managed_info', infoLabels { cluster_name: 'Foo' }, '_x60 1x60'),
+};
+
 local baseCalculatedLabels = {
-  cluster_id: "c-managed-openshift",
-  sales_order: "SO123123",
-  flavor_display: "OpenShift Container Platform",
+  cluster_id: 'c-managed-openshift',
+  sales_order: 'SO123123',
+  flavor_display: 'OpenShift Container Platform',
 };
 
 {
@@ -92,5 +126,18 @@ local baseCalculatedLabels = {
         },
       ]
     ),
+    c.test(
+      'and two app CPUs with a display name change',
+      baseSeries + displayNameChange,
+      queryPattern % appParams,
+      [
+        {
+          labels: c.formatLabels(baseCalculatedLabels {
+            role: 'app',
+          }),
+          value: 2,
+        },
+      ]
+    ),
   ],
 }