Merge pull request rook#12968 from BlaineEXE/multus-validation-allow-node-types

multus: allow node profiles in validation test
travisn · Oct 4, 2023 · 04c9f98 · 04c9f98
2 parents 7d7d1ba + 0c721e0
commit 04c9f98
Show file tree

Hide file tree

Showing 24 changed files with 1,024 additions and 152 deletions.
diff --git a/.github/workflows/multus.yaml b/.github/workflows/multus.yaml
@@ -45,16 +45,12 @@ jobs:
         uses: helm/[email protected]
         with:
           config: tests/scripts/multus/kind-config.yaml
-          cluster_name: rook-multus-e2e
+          cluster_name: kind
 
-      - name: Start tmate
-        if: runner.debug || contains(github.event.pull_request.labels.*.name, 'debug-ci')
-        run: |
-          kubectl apply -f tests/scripts/tmate-pod.yaml
-          sleep 3 # sometimes the next command errors b/c k8s hasn't had time to schedule the pod yet
-          kubectl -n tmate wait --for=condition=ready -l app=tmate pod --timeout=300s
-          sleep 1 # just in case tmate hasn't output its web/ssh links yet
-          kubectl -n tmate logs deploy/tmate
+      - name: consider debugging
+        uses: ./.github/workflows/tmate_debug
+        with:
+          use-tmate: ${{ secrets.USE_TMATE }}
 
       - name: Setup multus
         run: ./tests/scripts/multus/setup-multus.sh
@@ -63,14 +59,47 @@ jobs:
         run: kubectl create -f tests/scripts/multus/default-public-cluster-nads.yaml
 
       - name: Quickly build Rook binary
+        id: build
         run: go build -o rook cmd/rook/*.go
 
-      - name: Run multus validation test
-        run: |
-          export KUBECONFIG="/home/runner/.kube/config"
-          kubectl create namespace rook-ceph
-          ./rook --log-level debug multus validation run \
-            --namespace rook-ceph \
-            --public-network default/public-net \
-            --cluster-network default/cluster-net \
-            --daemons-per-node 2
+      - name: Run CLI validation test
+        run: ./tests/scripts/multus/test-110-cli.sh
+
+      - name: Label kind nodes for stretch cluster tests
+        id: label
+        # tests that require labels are independent from previous tests
+        if: steps.build.outcome == 'success' && !cancelled()
+        run: ./tests/scripts/multus/test-200-stretch-label-nodes.sh
+        # nodes are purposefully not tainted yet for overlap test
+
+      - name: Run stretch cluster overlap test
+        id: overlap
+        # independent from other tests as long as nodes are labeled
+        if: steps.label.outcome == 'success' && !cancelled()
+        run: ./tests/scripts/multus/test-210-stretch-overlap.sh
+
+      - name: Run cleanup test
+        # cleanup relies on overlap test to be successful
+        if: steps.overlap.outcome == 'success' && !cancelled()
+        run: ./tests/scripts/multus/test-211-stretch-cleanup.sh
+
+      - name: Taint kind nodes for remaining stretch cluster tests
+        id: taint
+        # tests that require taints+labels are independent from previous tests
+        if: steps.label.outcome == 'success' && !cancelled()
+        run: ./tests/scripts/multus/test-200-stretch-taint-nodes.sh
+
+      - name: Run stretch cluster test with public and cluster networks
+        # independent from other tests as long as nodes are tainted and labeled
+        if: steps.taint.outcome == 'success' && !cancelled()
+        run: ./tests/scripts/multus/test-220-stretch-pub-and-cluster.sh
+
+      - name: Run stretch cluster test with public network only
+        # independent from other tests as long as nodes are tainted and labeled
+        if: steps.taint.outcome == 'success' && !cancelled()
+        run: ./tests/scripts/multus/test-230-stretch-pub-only.sh
+
+      - name: Run stretch cluster test with cluster network only
+        # independent from other tests as long as nodes are tainted and labeled
+        if: steps.taint.outcome == 'success' && !cancelled()
+        run: ./tests/scripts/multus/test-240-stretch-cluster-only.sh
diff --git a/cmd/rook/userfacing/multus/validation/config.go b/cmd/rook/userfacing/multus/validation/config.go
@@ -26,13 +26,13 @@ import (
 var (
 	configCmd = &cobra.Command{
 		Use:   "config",
-		Short: "Generate a validation test config file for different default scenarios.",
+		Short: "Generate a validation test config file for different default scenarios to stdout.",
 		Args:  cobra.NoArgs,
 	}
 
-	simpleConfigCmd = &cobra.Command{
-		Use:   "simple",
-		Short: "Generate a simple default config file.",
+	converged = &cobra.Command{
+		Use:   "converged",
+		Short: "Example config for a cluster that runs storage and user workloads on all nodes.",
 		Args:  cobra.NoArgs,
 		RunE: func(cmd *cobra.Command, args []string) error {
 			y, err := multus.NewDefaultValidationTestConfig().ToYAML()
@@ -43,8 +43,38 @@ var (
 			return nil
 		},
 	}
+
+	dedicatedStorageNodesConfigCmd = &cobra.Command{
+		Use:   "dedicated-storage-nodes",
+		Short: "Example config file for a cluster that uses dedicated storage nodes.",
+		Args:  cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			y, err := multus.NewDedicatedStorageNodesValidationTestConfig().ToYAML()
+			if err != nil {
+				return err
+			}
+			fmt.Print(y)
+			return nil
+		},
+	}
+
+	stretchClusterConfigCmd = &cobra.Command{
+		Use:   "stretch-cluster",
+		Short: "Example config file for a stretch cluster with dedicated storage nodes.",
+		Args:  cobra.NoArgs,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			y, err := multus.NewArbiterValidationTestConfig().ToYAML()
+			if err != nil {
+				return err
+			}
+			fmt.Print(y)
+			return nil
+		},
+	}
 )
 
 func init() {
-	configCmd.AddCommand(simpleConfigCmd)
+	configCmd.AddCommand(converged)
+	configCmd.AddCommand(dedicatedStorageNodesConfigCmd)
+	configCmd.AddCommand(stretchClusterConfigCmd)
 }
diff --git a/cmd/rook/userfacing/multus/validation/validation.go b/cmd/rook/userfacing/multus/validation/validation.go
@@ -35,6 +35,9 @@ var (
 	validationConfig     = multus.ValidationTest{
 		Logger: capnslog.NewPackageLogger("github.com/rook/rook", "multus-validation"),
 	}
+
+	// keep special var for `--daemons-per-node` that needs to be put into node config for validation run
+	flagDaemonsPerNode = -1
 )
 
 // commands
@@ -122,7 +125,7 @@ func init() {
 	runCmd.Flags().StringVar(&validationConfig.ClusterNetwork, "cluster-network", defaultConfig.ClusterNetwork,
 		"The name of the Network Attachment Definition (NAD) that will be used for Ceph's cluster network. "+
 			"This should be a namespaced name in the form <namespace>/<name> if the NAD is defined in a different namespace from the cluster namespace.")
-	runCmd.Flags().IntVar(&validationConfig.DaemonsPerNode, "daemons-per-node", defaultConfig.DaemonsPerNode,
+	runCmd.Flags().IntVar(&flagDaemonsPerNode, "daemons-per-node", defaultConfig.TotalDaemonsPerNode(),
 		"The number of validation test daemons to run per node. "+
 			"It is recommended to set this to the maximum number of Ceph daemons that can run on any node in the worst case of node failure(s). "+
 			"The default value is set to the worst-case value for a Rook Ceph cluster with 3 portable OSDs, 3 portable monitors, "+
@@ -157,6 +160,15 @@ func runValidation(ctx context.Context) {
 			os.Exit(22 /* EINVAL */)
 		}
 		validationConfig.ValidationTestConfig = *c
+	} else {
+		// the default CLI test is simplified and assumes all Ceph daemons are OSDs, which get both
+		// public and cluster network attachments. This also preserves legacy CLI behavior.
+		validationConfig.NodeTypes = map[string]multus.NodeConfig{
+			multus.DefaultValidationNodeType: {
+				OSDsPerNode:         flagDaemonsPerNode,
+				OtherDaemonsPerNode: 0,
+			},
+		}
 	}
 
 	if err := validationConfig.ValidationTestConfig.Validate(); err != nil {

diff --git a/deploy/examples/multus-validation.yaml b/deploy/examples/multus-validation.yaml
@@ -27,7 +27,7 @@
 # Flags:
 #       --cluster-network string           The name of the Network Attachment Definition (NAD) that will be used for Ceph's cluster network. This should be a namespaced name in the form <namespace>/<name> if the NAD is defined in a different namespace from the cluster namespace.
 #   -c, --config string                    The validation test config file to use. This cannot be used with other flags.
-#       --daemons-per-node int             The number of validation test daemons to run per node. It is recommended to set this to the maximum number of Ceph daemons that can run on any node in the worst case of node failure(s). The default value is set to the worst-case value for a Rook Ceph cluster with 3 portable OSDs, 3 portable monitors, and where all optional child resources have been created with 1 daemon such that they all might run on a single node in a failure scenario. If you aren't sure what to choose for this value, add 1 for each additional OSD beyond 3. (default 16)
+#       --daemons-per-node int             The number of validation test daemons to run per node. It is recommended to set this to the maximum number of Ceph daemons that can run on any node in the worst case of node failure(s). The default value is set to the worst-case value for a Rook Ceph cluster with 3 portable OSDs, 3 portable monitors, and where all optional child resources have been created with 1 daemon such that they all might run on a single node in a failure scenario. If you aren't sure what to choose for this value, add 1 for each additional OSD beyond 3. (default 19)
 #   -h, --help                             help for run
 #   -n, --namespace string                 The namespace for validation test resources. It is recommended to set this to the namespace in which Rook's Ceph cluster will be installed. (default "rook-ceph")
 #       --nginx-image string               The Nginx image used for the validation server and clients. (default "nginxinc/nginx-unprivileged:stable-alpine")

diff --git a/pkg/daemon/multus/client-daemonset.yaml b/pkg/daemon/multus/client-daemonset.yaml
@@ -1,28 +1,42 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: multus-validation-test-client-{{ .ClientID }}
+  name: multus-validation-test-client-{{ .NodeType }}-{{ .ClientType }}-{{ .ClientID }}
   labels:
     app: multus-validation-test-client
+    nodeType: "{{ .NodeType }}"
+    clientType: "{{ .ClientType }}"
+    clientID: "{{ .ClientID }}"
     app.kubernetes.io/name: "client"
-    app.kubernetes.io/instance: "client-{{ .ClientID }}"
+    app.kubernetes.io/instance: "client-{{ .NodeType }}-{{ .ClientType }}-{{ .ClientID }}"
     app.kubernetes.io/component: "client"
     app.kubernetes.io/part-of: "multus-validation-test"
     app.kubernetes.io/managed-by: "rook-cli"
 spec:
   selector:
     matchLabels:
       app: multus-validation-test-client
+      nodeType: "{{ .NodeType }}"
+      clientType: "{{ .ClientType }}"
       clientID: "{{ .ClientID }}"
   template:
     metadata:
       labels:
         app: multus-validation-test-client
+        nodeType: "{{ .NodeType }}"
+        clientType: "{{ .ClientType }}"
         clientID: "{{ .ClientID }}"
       annotations:
         k8s.v1.cni.cncf.io/networks: "{{ .NetworksAnnotationValue }}"
     spec:
-      # TODO: selectors, affinities, tolerations
+      nodeSelector:
+      {{- range $k, $v := .Placement.NodeSelector }}
+        {{ $k }}: {{ $v }}
+      {{- end }}
+      tolerations:
+      {{- range $idx, $toleration := .Placement.Tolerations }}
+        - {{ $toleration.ToJSON }}
+      {{- end }}
       securityContext:
         runAsNonRoot: true
         seccompProfile: