multus: allow node profiles in validation test
Add the ability to specify node profiles in the multus validation test.

This addresses a few points of early feedback on the validation tool.
Statements below critique the tool's behavior before this patch.
1. The tool assumes all daemons are on public and cluster network, which
   means users who have a significantly smaller cluster net (a
   design choice) cannot run a single test to determine if Rook is
   likely to install correctly.
2. The tool does not have placement options to select only a subset of
   Kubernetes nodes to run validation on.
3. Users of multus seem to have a dedicated pool of storage nodes more
   often than the average Rook install. This makes sense for security-
   and performance-minded users. The tool cannot run a single test to
   verify storage-only and general-workload nodes at one time.

These points are addressed by allowing users to specify configurations
for different "NodeTypes."

Each NodeType config has options for selecting the number of OSDs as
well as the number of other (non-OSD) Ceph daemons. This avoids the
unnecessary exhaustion of cluster network addresses raised in critique 1.

Each NodeType config has its own placement (critique 2).

Users can define as many NodeTypes as needed to test the network for
their planned CephCluster. Specifically, this allows the tool to test
storage-only nodes and generalized-workload nodes at the same time. An
arbitrary number of NodeTypes is allowed to support even more highly
specialized cluster setups, such as multiple tiers of storage nodes
where some storage-only nodes may run more OSDs than others.
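As an illustration, a two-NodeType validation config might look roughly like the sketch below. The key names (`nodeTypes`, `osdsPerNode`, `otherDaemonsPerNode`, `placement`) mirror the NodeConfig fields added in this patch but are assumptions, not the exact schema.

```yaml
# Hypothetical sketch of a multi-NodeType validation config.
# Field names and label keys are illustrative assumptions.
nodeTypes:
  generalWorkloadNodes:
    osdsPerNode: 0
    otherDaemonsPerNode: 16   # mons, mgrs, MDSes, CSI pods, etc.
    placement:
      nodeSelector:
        node-role/storage: "false"
  storageOnlyNodes:
    osdsPerNode: 9            # only OSDs attach to the cluster network
    otherDaemonsPerNode: 3
    placement:
      nodeSelector:
        node-role/storage: "true"
      tolerations:
        - key: node-role/storage
          operator: Exists
```

This is the shape that lets one test run cover storage-only and general-workload nodes at the same time.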

Signed-off-by: Blaine Gardner <[email protected]>
BlaineEXE committed Oct 4, 2023
1 parent 7d7d1ba commit 0c721e0
Showing 24 changed files with 1,024 additions and 152 deletions.
65 changes: 47 additions & 18 deletions .github/workflows/multus.yaml
@@ -45,16 +45,12 @@ jobs:
uses: helm/[email protected]
with:
config: tests/scripts/multus/kind-config.yaml
cluster_name: rook-multus-e2e
cluster_name: kind

- name: Start tmate
if: runner.debug || contains(github.event.pull_request.labels.*.name, 'debug-ci')
run: |
kubectl apply -f tests/scripts/tmate-pod.yaml
sleep 3 # sometimes the next command errors b/c k8s hasn't had time to schedule the pod yet
kubectl -n tmate wait --for=condition=ready -l app=tmate pod --timeout=300s
sleep 1 # just in case tmate hasn't output its web/ssh links yet
kubectl -n tmate logs deploy/tmate
- name: consider debugging
uses: ./.github/workflows/tmate_debug
with:
use-tmate: ${{ secrets.USE_TMATE }}

- name: Setup multus
run: ./tests/scripts/multus/setup-multus.sh
@@ -63,14 +59,47 @@ jobs:
run: kubectl create -f tests/scripts/multus/default-public-cluster-nads.yaml

- name: Quickly build Rook binary
id: build
run: go build -o rook cmd/rook/*.go

- name: Run multus validation test
run: |
export KUBECONFIG="/home/runner/.kube/config"
kubectl create namespace rook-ceph
./rook --log-level debug multus validation run \
--namespace rook-ceph \
--public-network default/public-net \
--cluster-network default/cluster-net \
--daemons-per-node 2
- name: Run CLI validation test
run: ./tests/scripts/multus/test-110-cli.sh

- name: Label kind nodes for stretch cluster tests
id: label
# tests that require labels are independent from previous tests
if: steps.build.outcome == 'success' && !cancelled()
run: ./tests/scripts/multus/test-200-stretch-label-nodes.sh
# nodes are purposefully not tainted yet for overlap test

- name: Run stretch cluster overlap test
id: overlap
# independent from other tests as long as nodes are labeled
if: steps.label.outcome == 'success' && !cancelled()
run: ./tests/scripts/multus/test-210-stretch-overlap.sh

- name: Run cleanup test
# cleanup relies on overlap test to be successful
if: steps.overlap.outcome == 'success' && !cancelled()
run: ./tests/scripts/multus/test-211-stretch-cleanup.sh

- name: Taint kind nodes for remaining stretch cluster tests
id: taint
# tests that require taints+labels are independent from previous tests
if: steps.label.outcome == 'success' && !cancelled()
run: ./tests/scripts/multus/test-200-stretch-taint-nodes.sh

- name: Run stretch cluster test with public and cluster networks
# independent from other tests as long as nodes are tainted and labeled
if: steps.taint.outcome == 'success' && !cancelled()
run: ./tests/scripts/multus/test-220-stretch-pub-and-cluster.sh

- name: Run stretch cluster test with public network only
# independent from other tests as long as nodes are tainted and labeled
if: steps.taint.outcome == 'success' && !cancelled()
run: ./tests/scripts/multus/test-230-stretch-pub-only.sh

- name: Run stretch cluster test with cluster network only
# independent from other tests as long as nodes are tainted and labeled
if: steps.taint.outcome == 'success' && !cancelled()
run: ./tests/scripts/multus/test-240-stretch-cluster-only.sh
40 changes: 35 additions & 5 deletions cmd/rook/userfacing/multus/validation/config.go
@@ -26,13 +26,13 @@ import (
var (
configCmd = &cobra.Command{
Use: "config",
Short: "Generate a validation test config file for different default scenarios.",
Short: "Generate a validation test config file for different default scenarios to stdout.",
Args: cobra.NoArgs,
}

simpleConfigCmd = &cobra.Command{
Use: "simple",
Short: "Generate a simple default config file.",
converged = &cobra.Command{
Use: "converged",
Short: "Example config for a cluster that runs storage and user workloads on all nodes.",
Args: cobra.NoArgs,
RunE: func(cmd *cobra.Command, args []string) error {
y, err := multus.NewDefaultValidationTestConfig().ToYAML()
@@ -43,8 +43,38 @@ var (
return nil
},
}

dedicatedStorageNodesConfigCmd = &cobra.Command{
Use: "dedicated-storage-nodes",
Short: "Example config file for a cluster that uses dedicated storage nodes.",
Args: cobra.NoArgs,
RunE: func(cmd *cobra.Command, args []string) error {
y, err := multus.NewDedicatedStorageNodesValidationTestConfig().ToYAML()
if err != nil {
return err
}
fmt.Print(y)
return nil
},
}

stretchClusterConfigCmd = &cobra.Command{
Use: "stretch-cluster",
Short: "Example config file for a stretch cluster with dedicated storage nodes.",
Args: cobra.NoArgs,
RunE: func(cmd *cobra.Command, args []string) error {
y, err := multus.NewArbiterValidationTestConfig().ToYAML()
if err != nil {
return err
}
fmt.Print(y)
return nil
},
}
)

func init() {
configCmd.AddCommand(simpleConfigCmd)
configCmd.AddCommand(converged)
configCmd.AddCommand(dedicatedStorageNodesConfigCmd)
configCmd.AddCommand(stretchClusterConfigCmd)
}
14 changes: 13 additions & 1 deletion cmd/rook/userfacing/multus/validation/validation.go
Expand Up @@ -35,6 +35,9 @@ var (
validationConfig = multus.ValidationTest{
Logger: capnslog.NewPackageLogger("github.com/rook/rook", "multus-validation"),
}

// keep a dedicated var for `--daemons-per-node` whose value needs to be put into the node config for the validation run
flagDaemonsPerNode = -1
)

// commands
@@ -122,7 +125,7 @@ func init() {
runCmd.Flags().StringVar(&validationConfig.ClusterNetwork, "cluster-network", defaultConfig.ClusterNetwork,
"The name of the Network Attachment Definition (NAD) that will be used for Ceph's cluster network. "+
"This should be a namespaced name in the form <namespace>/<name> if the NAD is defined in a different namespace from the cluster namespace.")
runCmd.Flags().IntVar(&validationConfig.DaemonsPerNode, "daemons-per-node", defaultConfig.DaemonsPerNode,
runCmd.Flags().IntVar(&flagDaemonsPerNode, "daemons-per-node", defaultConfig.TotalDaemonsPerNode(),
"The number of validation test daemons to run per node. "+
"It is recommended to set this to the maximum number of Ceph daemons that can run on any node in the worst case of node failure(s). "+
"The default value is set to the worst-case value for a Rook Ceph cluster with 3 portable OSDs, 3 portable monitors, "+
@@ -157,6 +160,15 @@ func runValidation(ctx context.Context) {
os.Exit(22 /* EINVAL */)
}
validationConfig.ValidationTestConfig = *c
} else {
// the default CLI test is simplified and assumes all Ceph daemons are OSDs, which get both
// public and cluster network attachments. This also preserves legacy CLI behavior.
validationConfig.NodeTypes = map[string]multus.NodeConfig{
multus.DefaultValidationNodeType: {
OSDsPerNode: flagDaemonsPerNode,
OtherDaemonsPerNode: 0,
},
}
}

if err := validationConfig.ValidationTestConfig.Validate(); err != nil {
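Given the fallback in the hunk above, a legacy flag-based invocation is treated as a single default NodeType whose daemons are all OSDs. The hypothetical YAML equivalent (key names are assumptions; the type name stands in for `multus.DefaultValidationNodeType`) would be roughly:

```yaml
# Hypothetical equivalent of `--daemons-per-node 19` under the new
# NodeTypes model. Field names are assumptions based on this patch.
nodeTypes:
  sharedNodes:              # stands in for multus.DefaultValidationNodeType
    osdsPerNode: 19         # all daemons treated as OSDs: both networks attached
    otherDaemonsPerNode: 0
```

Treating every daemon as an OSD attaches both the public and cluster networks, which preserves the pre-patch CLI behavior.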
2 changes: 1 addition & 1 deletion deploy/examples/multus-validation.yaml
@@ -27,7 +27,7 @@
# Flags:
# --cluster-network string The name of the Network Attachment Definition (NAD) that will be used for Ceph's cluster network. This should be a namespaced name in the form <namespace>/<name> if the NAD is defined in a different namespace from the cluster namespace.
# -c, --config string The validation test config file to use. This cannot be used with other flags.
# --daemons-per-node int The number of validation test daemons to run per node. It is recommended to set this to the maximum number of Ceph daemons that can run on any node in the worst case of node failure(s). The default value is set to the worst-case value for a Rook Ceph cluster with 3 portable OSDs, 3 portable monitors, and where all optional child resources have been created with 1 daemon such that they all might run on a single node in a failure scenario. If you aren't sure what to choose for this value, add 1 for each additional OSD beyond 3. (default 16)
# --daemons-per-node int The number of validation test daemons to run per node. It is recommended to set this to the maximum number of Ceph daemons that can run on any node in the worst case of node failure(s). The default value is set to the worst-case value for a Rook Ceph cluster with 3 portable OSDs, 3 portable monitors, and where all optional child resources have been created with 1 daemon such that they all might run on a single node in a failure scenario. If you aren't sure what to choose for this value, add 1 for each additional OSD beyond 3. (default 19)
# -h, --help help for run
# -n, --namespace string The namespace for validation test resources. It is recommended to set this to the namespace in which Rook's Ceph cluster will be installed. (default "rook-ceph")
# --nginx-image string The Nginx image used for the validation server and clients. (default "nginxinc/nginx-unprivileged:stable-alpine")
20 changes: 17 additions & 3 deletions pkg/daemon/multus/client-daemonset.yaml
@@ -1,28 +1,42 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: multus-validation-test-client-{{ .ClientID }}
name: multus-validation-test-client-{{ .NodeType }}-{{ .ClientType }}-{{ .ClientID }}
labels:
app: multus-validation-test-client
nodeType: "{{ .NodeType }}"
clientType: "{{ .ClientType }}"
clientID: "{{ .ClientID }}"
app.kubernetes.io/name: "client"
app.kubernetes.io/instance: "client-{{ .ClientID }}"
app.kubernetes.io/instance: "client-{{ .NodeType }}-{{ .ClientType }}-{{ .ClientID }}"
app.kubernetes.io/component: "client"
app.kubernetes.io/part-of: "multus-validation-test"
app.kubernetes.io/managed-by: "rook-cli"
spec:
selector:
matchLabels:
app: multus-validation-test-client
nodeType: "{{ .NodeType }}"
clientType: "{{ .ClientType }}"
clientID: "{{ .ClientID }}"
template:
metadata:
labels:
app: multus-validation-test-client
nodeType: "{{ .NodeType }}"
clientType: "{{ .ClientType }}"
clientID: "{{ .ClientID }}"
annotations:
k8s.v1.cni.cncf.io/networks: "{{ .NetworksAnnotationValue }}"
spec:
# TODO: selectors, affinities, tolerations
nodeSelector:
{{- range $k, $v := .Placement.NodeSelector }}
{{ $k }}: {{ $v }}
{{- end }}
tolerations:
{{- range $idx, $toleration := .Placement.Tolerations }}
- {{ $toleration.ToJSON }}
{{- end }}
securityContext:
runAsNonRoot: true
seccompProfile:
