From 6202f38e863e64a434b5e121513e08d57e8faa30 Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Tue, 12 Nov 2024 20:42:11 -0800 Subject: [PATCH] feat: emit the cluster profile objects (#941) --- Makefile | 1 + ...multicluster.x-k8s.io_clusterprofiles.yaml | 1 + charts/hub-agent/templates/deployment.yaml | 2 + charts/hub-agent/values.yaml | 5 +- cmd/hubagent/main.go | 2 + cmd/hubagent/options/options.go | 4 + cmd/hubagent/options/validation_test.go | 2 +- cmd/hubagent/workload/setup.go | 98 ++++--- ...multicluster.x-k8s.io_clusterprofiles.yaml | 185 ++++++++++++ go.mod | 7 +- go.sum | 14 +- hack/Azure/setup/createHubCluster.sh | 4 +- hack/Azure/setup/joinMC.sh | 2 +- .../clusterprofile/controller.go | 267 ++++++++++++++++++ .../controller_integration_test.go | 128 +++++++++ .../clusterprofile/controller_test.go | 162 +++++++++++ .../clusterprofile/suite_test.go | 108 +++++++ pkg/utils/controller/controller.go | 5 + test/e2e/setup.sh | 1 + test/e2e/setup_test.go | 4 + 20 files changed, 944 insertions(+), 58 deletions(-) create mode 120000 charts/hub-agent/templates/crds/multicluster.x-k8s.io_clusterprofiles.yaml create mode 100644 config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml create mode 100644 pkg/controllers/clusterinventory/clusterprofile/controller.go create mode 100644 pkg/controllers/clusterinventory/clusterprofile/controller_integration_test.go create mode 100644 pkg/controllers/clusterinventory/clusterprofile/controller_test.go create mode 100644 pkg/controllers/clusterinventory/clusterprofile/suite_test.go diff --git a/Makefile b/Makefile index fe8586f97..ddebaddb9 100644 --- a/Makefile +++ b/Makefile @@ -165,6 +165,7 @@ install-hub-agent-helm: --set webhookClientConnectionType=service \ --set enableV1Alpha1APIs=true \ --set enableV1Beta1APIs=false \ + --set enableClusterInventoryAPI=true \ --set logFileMaxSize=1000000 .PHONY: e2e-v1alpha1-hub-kubeconfig-secret diff --git a/charts/hub-agent/templates/crds/multicluster.x-k8s.io_clusterprofiles.yaml b/charts/hub-agent/templates/crds/multicluster.x-k8s.io_clusterprofiles.yaml new file mode 120000 index 000000000..88e5230ab --- /dev/null +++ b/charts/hub-agent/templates/crds/multicluster.x-k8s.io_clusterprofiles.yaml @@ -0,0 +1 @@ +../../../../config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml \ No newline at end of file diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml index d243463cf..1a7a98037 100644 --- a/charts/hub-agent/templates/deployment.yaml +++ b/charts/hub-agent/templates/deployment.yaml @@ -30,6 +30,7 @@ spec: - -add_dir_header - --enable-v1alpha1-apis={{ .Values.enableV1Alpha1APIs }} - --enable-v1beta1-apis={{ .Values.enableV1Beta1APIs }} + - --enable-cluster-inventory-apis={{ .Values.enableClusterInventoryAPI }} - --max-concurrent-cluster-placement={{ .Values.MaxConcurrentClusterPlacement }} - --concurrent-resource-change-syncs={{ .Values.ConcurrentResourceChangeSyncs }} - --log_file_max_size={{ .Values.logFileMaxSize }} @@ -37,6 +38,7 @@ spec: - --hub-api-qps={{ .Values.hubAPIQPS }} - --hub-api-burst={{ .Values.hubAPIBurst }} - --force-delete-wait-time={{ .Values.forceDeleteWaitTime }} + - --cluster-unhealthy-threshold={{ .Values.clusterUnhealthyThreshold }} ports: - name: metrics containerPort: 8080 diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml index 392f45971..7a3394fe4 100644 --- a/charts/hub-agent/values.yaml +++ b/charts/hub-agent/values.yaml @@ -17,7 +17,7 @@ webhookServiceName: fleetwebhook enableGuardRail: true 
webhookClientConnectionType: service forceDeleteWaitTime: 15m0s - +clusterUnhealthyThreshold: 3m0s namespace: fleet-system @@ -35,10 +35,11 @@ affinity: {} enableV1Alpha1APIs: false enableV1Beta1APIs: true +enableClusterInventoryAPI: true hubAPIQPS: 250 hubAPIBurst: 1000 MaxConcurrentClusterPlacement: 100 ConcurrentResourceChangeSyncs: 20 -logFileMaxSize: 1000000 +logFileMaxSize: 10000000 MaxFleetSizeSupported: 100 diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index 7505ca265..b7c263fc4 100644 --- a/cmd/hubagent/main.go +++ b/cmd/hubagent/main.go @@ -17,6 +17,7 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/healthz" @@ -67,6 +68,7 @@ func init() { utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) utilruntime.Must(fleetnetworkingv1alpha1.AddToScheme(scheme)) utilruntime.Must(placementv1alpha1.AddToScheme(scheme)) + utilruntime.Must(clusterinventory.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme klog.InitFlags(nil) diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go index ceb2f2375..b5f728bfd 100644 --- a/cmd/hubagent/options/options.go +++ b/cmd/hubagent/options/options.go @@ -79,6 +79,8 @@ type Options struct { EnableV1Alpha1APIs bool // EnableV1Beta1APIs enables the agents to watch the v1beta1 CRs. EnableV1Beta1APIs bool + // EnableClusterInventoryAPIs enables the agents to watch the cluster inventory CRs. + EnableClusterInventoryAPIs bool // ForceDeleteWaitTime is the duration the hub agent waits before force deleting a member cluster. 
ForceDeleteWaitTime metav1.Duration } @@ -96,6 +98,7 @@ func NewOptions() *Options { ConcurrentResourceChangeSyncs: 1, MaxFleetSizeSupported: 100, EnableV1Alpha1APIs: false, + EnableClusterInventoryAPIs: false, } } @@ -135,6 +138,7 @@ func (o *Options) AddFlags(flags *flag.FlagSet) { flags.IntVar(&o.MaxFleetSizeSupported, "max-fleet-size", 100, "The max number of member clusters supported in this fleet") flags.BoolVar(&o.EnableV1Alpha1APIs, "enable-v1alpha1-apis", false, "If set, the agents will watch for the v1alpha1 APIs.") flags.BoolVar(&o.EnableV1Beta1APIs, "enable-v1beta1-apis", true, "If set, the agents will watch for the v1beta1 APIs.") + flags.BoolVar(&o.EnableClusterInventoryAPIs, "enable-cluster-inventory-apis", false, "If set, the agents will watch for the ClusterInventory APIs.") flags.DurationVar(&o.ForceDeleteWaitTime.Duration, "force-delete-wait-time", 15*time.Minute, "The duration the hub agent waits before force deleting a member cluster.") o.RateLimiterOpts.AddFlags(flags) diff --git a/cmd/hubagent/options/validation_test.go b/cmd/hubagent/options/validation_test.go index d94cf97bc..3965875c1 100644 --- a/cmd/hubagent/options/validation_test.go +++ b/cmd/hubagent/options/validation_test.go @@ -22,7 +22,7 @@ func newTestOptions(modifyOptions ModifyOptions) Options { option := Options{ SkippedPropagatingAPIs: "fleet.azure.com;multicluster.x-k8s.io", WorkPendingGracePeriod: metav1.Duration{Duration: 10 * time.Second}, - ClusterUnhealthyThreshold: metav1.Duration{Duration: 1 * time.Second}, + ClusterUnhealthyThreshold: metav1.Duration{Duration: 60 * time.Second}, WebhookClientConnectionType: "url", EnableV1Alpha1APIs: true, } diff --git a/cmd/hubagent/workload/setup.go b/cmd/hubagent/workload/setup.go index 908d6ef4c..8a582f561 100644 --- a/cmd/hubagent/workload/setup.go +++ b/cmd/hubagent/workload/setup.go @@ -16,6 +16,7 @@ import ( "k8s.io/client-go/dynamic" "k8s.io/client-go/rest" "k8s.io/klog/v2" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" ctrl "sigs.k8s.io/controller-runtime" workv1alpha1 "sigs.k8s.io/work-api/pkg/apis/v1alpha1" @@ -24,6 +25,7 @@ import ( placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" fleetv1alpha1 "go.goms.io/fleet/apis/v1alpha1" "go.goms.io/fleet/cmd/hubagent/options" + "go.goms.io/fleet/pkg/controllers/clusterinventory/clusterprofile" "go.goms.io/fleet/pkg/controllers/clusterresourcebindingwatcher" "go.goms.io/fleet/pkg/controllers/clusterresourceplacement" "go.goms.io/fleet/pkg/controllers/clusterresourceplacementwatcher" @@ -81,6 +83,10 @@ var ( placementv1alpha1.GroupVersion.WithKind(placementv1alpha1.ResourceOverrideKind), placementv1alpha1.GroupVersion.WithKind(placementv1alpha1.ResourceOverrideSnapshotKind), } + + clusterInventoryGVKs = []schema.GroupVersionKind{ + clusterinventory.GroupVersion.WithKind("ClusterProfile"), + } ) // SetupControllers set up the customized controllers we developed @@ -92,33 +98,14 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } discoverClient := discovery.NewDiscoveryClientForConfigOrDie(config) - // Verify CRD installation status. 
- if opts.EnableV1Alpha1APIs { - for _, gvk := range v1Alpha1RequiredGVKs { - if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { - klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) - return err - } - } - } - - if opts.EnableV1Beta1APIs { - for _, gvk := range v1Beta1RequiredGVKs { - if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { - klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) - return err - } - } - } - // AllowedPropagatingAPIs and SkippedPropagatingAPIs are mutually exclusive. // If none of them are set, the resourceConfig by default stores a list of skipped propagation APIs. resourceConfig := utils.NewResourceConfig(opts.AllowedPropagatingAPIs != "") - if err := resourceConfig.Parse(opts.AllowedPropagatingAPIs); err != nil { + if err = resourceConfig.Parse(opts.AllowedPropagatingAPIs); err != nil { // The program will never go here because the parameters have been checked. return err } - if err := resourceConfig.Parse(opts.SkippedPropagatingAPIs); err != nil { + if err = resourceConfig.Parse(opts.SkippedPropagatingAPIs); err != nil { // The program will never go here because the parameters have been checked return err } @@ -154,32 +141,16 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, rateLimiter := options.DefaultControllerRateLimiter(opts.RateLimiterOpts) var clusterResourcePlacementControllerV1Alpha1 controller.Controller var clusterResourcePlacementControllerV1Beta1 controller.Controller - + var memberClusterPlacementController controller.Controller if opts.EnableV1Alpha1APIs { + for _, gvk := range v1Alpha1RequiredGVKs { + if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { + klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) + return err + } + } klog.Info("Setting up clusterResourcePlacement v1alpha1 controller") clusterResourcePlacementControllerV1Alpha1 = controller.NewController(crpControllerV1Alpha1Name, controller.NamespaceKeyFunc, crpc.ReconcileV1Alpha1, rateLimiter) - } - - if opts.EnableV1Beta1APIs { - klog.Info("Setting up clusterResourcePlacement v1beta1 controller") - clusterResourcePlacementControllerV1Beta1 = controller.NewController(crpControllerV1Beta1Name, controller.NamespaceKeyFunc, crpc.Reconcile, rateLimiter) - } - - // Set up a new controller to reconcile any resources in the cluster - klog.Info("Setting up resource change controller") - rcr := &resourcechange.Reconciler{ - DynamicClient: dynamicClient, - Recorder: mgr.GetEventRecorderFor(resourceChangeControllerName), - RestMapper: mgr.GetRESTMapper(), - InformerManager: dynamicInformerManager, - PlacementControllerV1Alpha1: clusterResourcePlacementControllerV1Alpha1, - PlacementControllerV1Beta1: clusterResourcePlacementControllerV1Beta1, - } - - resourceChangeController := controller.NewController(resourceChangeControllerName, controller.ClusterWideKeyFunc, rcr.Reconcile, rateLimiter) - - var memberClusterPlacementController controller.Controller - if opts.EnableV1Alpha1APIs { klog.Info("Setting up member cluster change controller") mcp := &memberclusterplacement.Reconciler{ InformerManager: dynamicInformerManager, @@ -189,6 +160,14 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } if opts.EnableV1Beta1APIs { + for _, gvk := range v1Beta1RequiredGVKs { + if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { + klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) + return err + } + } + klog.Info("Setting up 
clusterResourcePlacement v1beta1 controller") + clusterResourcePlacementControllerV1Beta1 = controller.NewController(crpControllerV1Beta1Name, controller.NamespaceKeyFunc, crpc.Reconcile, rateLimiter) klog.Info("Setting up clusterResourcePlacement watcher") if err := (&clusterresourceplacementwatcher.Reconciler{ PlacementController: clusterResourcePlacementControllerV1Beta1, @@ -318,7 +297,38 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, klog.ErrorS(err, "Unable to set up resourceOverride controller") return err } + + // Verify cluster inventory CRD installation status. + if opts.EnableClusterInventoryAPIs { + for _, gvk := range clusterInventoryGVKs { + if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { + klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) + return err + } + } + klog.Info("Setting up cluster profile controller") + if err = (&clusterprofile.Reconciler{ + Client: mgr.GetClient(), + ClusterProfileNamespace: utils.FleetSystemNamespace, + ClusterUnhealthyThreshold: opts.ClusterUnhealthyThreshold.Duration, + }).SetupWithManager(mgr); err != nil { + klog.ErrorS(err, "unable to set up ClusterProfile controller") + return err + } + } + } + + // Set up a new controller to reconcile any resources in the cluster + klog.Info("Setting up resource change controller") + rcr := &resourcechange.Reconciler{ + DynamicClient: dynamicClient, + Recorder: mgr.GetEventRecorderFor(resourceChangeControllerName), + RestMapper: mgr.GetRESTMapper(), + InformerManager: dynamicInformerManager, + PlacementControllerV1Alpha1: clusterResourcePlacementControllerV1Alpha1, + PlacementControllerV1Beta1: clusterResourcePlacementControllerV1Beta1, } + resourceChangeController := controller.NewController(resourceChangeControllerName, controller.ClusterWideKeyFunc, rcr.Reconcile, rateLimiter) // Set up a runner that starts all the custom controllers we created above resourceChangeDetector := &resourcewatcher.ChangeDetector{ diff --git a/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml b/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml new file mode 100644 index 000000000..f8883b17d --- /dev/null +++ b/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml @@ -0,0 +1,185 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: clusterprofiles.multicluster.x-k8s.io +spec: + group: multicluster.x-k8s.io + names: + kind: ClusterProfile + listKind: ClusterProfileList + plural: clusterprofiles + singular: clusterprofile + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: ClusterProfile represents a single cluster in a multi-cluster + deployment. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ClusterProfileSpec defines the desired state of ClusterProfile. + properties: + clusterManager: + description: ClusterManager defines which cluster manager owns this + ClusterProfile resource + properties: + name: + description: Name defines the name of the cluster manager + type: string + required: + - name + type: object + x-kubernetes-validations: + - message: ClusterManager is immutable + rule: self == oldSelf + displayName: + description: DisplayName defines a human-readable name of the ClusterProfile + type: string + required: + - clusterManager + type: object + status: + description: ClusterProfileStatus defines the observed state of ClusterProfile. + properties: + conditions: + description: Conditions contains the different condition statuses + for this cluster. + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. 
+ The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + properties: + description: |- + Properties defines name/value pairs to represent properties of a cluster. + It could be a collection of ClusterProperty (KEP-2149) resources, + but could also be info based on other implementations. + The names of the properties can be predefined names from ClusterProperty resources + and is allowed to be customized by different cluster managers. + items: + description: |- + Property defines a name/value pair to represent a property of a cluster. + It could be a ClusterProperty (KEP-2149) resource, + but could also be info based on other implementations. + The name of the property can be predefined name from a ClusterProperty resource + and is allowed to be customized by different cluster managers. + This property can store various configurable details and metrics of a cluster, + which may include information such as the number of nodes, total and free CPU, + and total and free memory, among other potential attributes. + properties: + name: + description: |- + Name is the name of a property resource on cluster. It's a well-known + or customized name to identify the property. + maxLength: 253 + minLength: 1 + type: string + value: + description: Value is a property-dependent string + maxLength: 1024 + minLength: 1 + type: string + required: + - name + - value + type: object + type: array + version: + description: Version defines the version information of the cluster. + properties: + kubernetes: + description: Kubernetes is the kubernetes version of the cluster. 
+ type: string + type: object + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/go.mod b/go.mod index 114f2f183..0dfe70119 100644 --- a/go.mod +++ b/go.mod @@ -23,16 +23,17 @@ require ( golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 golang.org/x/sync v0.8.0 golang.org/x/time v0.7.0 - k8s.io/api v0.30.2 + k8s.io/api v0.30.3 k8s.io/apiextensions-apiserver v0.30.2 - k8s.io/apimachinery v0.30.2 - k8s.io/client-go v0.30.2 + k8s.io/apimachinery v0.30.3 + k8s.io/client-go v0.30.3 k8s.io/component-base v0.30.2 k8s.io/klog/v2 v2.130.1 k8s.io/metrics v0.25.2 k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 sigs.k8s.io/cloud-provider-azure v1.28.2 sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.50 + sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848 sigs.k8s.io/controller-runtime v0.18.5 sigs.k8s.io/work-api v0.0.0-20220407021756-586d707fdb2c ) diff --git a/go.sum b/go.sum index ebcc8bc87..6d73d17e5 100644 --- a/go.sum +++ b/go.sum @@ -345,14 +345,14 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.30.2 h1:+ZhRj+28QT4UOH+BKznu4CBgPWgkXO7XAvMcMl0qKvI= -k8s.io/api v0.30.2/go.mod h1:ULg5g9JvOev2dG0u2hig4Z7tQ2hHIuS+m8MNZ+X6EmI= +k8s.io/api v0.30.3 h1:ImHwK9DCsPA9uoU3rVh4QHAHHK5dTSv1nxJUapx8hoQ= +k8s.io/api v0.30.3/go.mod h1:GPc8jlzoe5JG3pb0KJCSLX5oAFIW3/qNJITlDj8BH04= k8s.io/apiextensions-apiserver v0.30.2 h1:l7Eue2t6QiLHErfn2vwK4KgF4NeDgjQkCXtEbOocKIE= k8s.io/apiextensions-apiserver v0.30.2/go.mod h1:lsJFLYyK40iguuinsb3nt+Sj6CmodSI4ACDLep1rgjw= -k8s.io/apimachinery v0.30.2 h1:fEMcnBj6qkzzPGSVsAZtQThU62SmQ4ZymlXRC5yFSCg= -k8s.io/apimachinery v0.30.2/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= -k8s.io/client-go v0.30.2 h1:sBIVJdojUNPDU/jObC+18tXWcTJVcwyqS9diGdWHk50= -k8s.io/client-go v0.30.2/go.mod h1:JglKSWULm9xlJLx4KCkfLLQ7XwtlbflV6uFFSHTMgVs= +k8s.io/apimachinery v0.30.3 h1:q1laaWCmrszyQuSQCfNB8cFgCuDAoPszKY4ucAjDwHc= +k8s.io/apimachinery v0.30.3/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= +k8s.io/client-go v0.30.3 h1:bHrJu3xQZNXIi8/MoxYtZBBWQQXwy16zqJwloXXfD3k= +k8s.io/client-go v0.30.3/go.mod h1:8d4pf8vYu665/kUbsxWAQ/JDBNWqfFeZnvFiVdmx89U= k8s.io/cloud-provider v0.28.3 h1:9u+JjA3zIn0nqLOOa8tWnprFkffguSAhfBvo8p7LhBQ= k8s.io/cloud-provider v0.28.3/go.mod h1:shAJxdrKu+SwwGUhkodxByPjaH8KBFZqXo6jU1F0ehI= k8s.io/component-base v0.30.2 h1:pqGBczYoW1sno8q9ObExUqrYSKhtE5rW3y6gX88GZII= @@ -375,6 +375,8 @@ sigs.k8s.io/cloud-provider-azure v1.28.2 h1:KKrWdC1+p2xXdT1VRmSkT57MhKNzPXk3yPcr sigs.k8s.io/cloud-provider-azure v1.28.2/go.mod h1:vDsaFOrvDDEUg0mLF2eoUeneCK+ROlRf4zACA91iwHs= sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.50 h1:l9igMANNptVwYmZrqGS51oW0zvfSxBGmlOaDPe407FI= sigs.k8s.io/cloud-provider-azure/pkg/azclient v0.0.50/go.mod h1:1M90A+akyTabHVnveSKlvIO/Kk9kEr1LjRx+08twKVU= +sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848 h1:WYPi2PdQyZwZkHG648v2jQl6deyCgyjJ0fkLYgUJ618= +sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848/go.mod h1:/aN4e7RWOMHgT4xAjCNkV4YFcpKfpZCeumMIL7S+KNM= sigs.k8s.io/controller-runtime v0.18.5 h1:nTHio/W+Q4aBlQMgbnC5hZb4IjIidyrizMai9P6n4Rk= sigs.k8s.io/controller-runtime v0.18.5/go.mod 
h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= diff --git a/hack/Azure/setup/createHubCluster.sh b/hack/Azure/setup/createHubCluster.sh index 005c4248b..4c55ae355 100755 --- a/hack/Azure/setup/createHubCluster.sh +++ b/hack/Azure/setup/createHubCluster.sh @@ -26,12 +26,14 @@ helm install hub-agent charts/hub-agent/ \ --set image.pullPolicy=Always \ --set image.repository=$REGISTRY/hub-agent \ --set image.tag=$TAG \ - --set logVerbosity=2 \ + --set logVerbosity=5 \ --set namespace=fleet-system \ --set enableWebhook=false \ --set webhookClientConnectionType=service \ --set enableV1Alpha1APIs=false \ --set enableV1Beta1APIs=true \ + --set clusterUnhealthyThreshold="3m0s" \ + --set forceDeleteWaitTime="1m0s" \ --set resources.limits.cpu=4 \ --set resources.limits.memory=4Gi \ --set concurrentClusterPlacementSyncs=10 \ diff --git a/hack/Azure/setup/joinMC.sh b/hack/Azure/setup/joinMC.sh index 331286405..3a5032da1 100755 --- a/hack/Azure/setup/joinMC.sh +++ b/hack/Azure/setup/joinMC.sh @@ -87,7 +87,7 @@ helm install member-agent charts/member-agent/ \ --set image.pullPolicy=Always \ --set refreshtoken.pullPolicy=Always \ --set config.memberClusterName=$MEMBER_CLUSTER \ - --set logVerbosity=8 \ + --set logVerbosity=5 \ --set namespace=fleet-system \ --set enableV1Alpha1APIs=false \ --set enableV1Beta1APIs=true diff --git a/pkg/controllers/clusterinventory/clusterprofile/controller.go b/pkg/controllers/clusterinventory/clusterprofile/controller.go new file mode 100644 index 000000000..5b66fce8b --- /dev/null +++ b/pkg/controllers/clusterinventory/clusterprofile/controller.go @@ -0,0 +1,267 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. +*/ + +// Package clusterprofile features a controller to generate clusterprofile objects from MemberCluster. +package clusterprofile + +import ( + "context" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" + "go.goms.io/fleet/pkg/utils/controller" +) + +const ( + // clusterProfileCleanupFinalizer is the finalizer added to a MemberCluster object if + // a corresponding ClusterProfile object has been created. + clusterProfileCleanupFinalizer = "kubernetes-fleet.io/cluster-profile-cleanup" + + // the list of reasons in the cluster profile status + clusterNoStatusReason = "MemberAgentReportedNoStatus" + clusterHeartbeatLostReason = "MemberAgentHeartbeatLost" + clusterHealthUnknownReason = "MemberAgentReportedNoHealthInfo" + clusterUnHealthyReason = "MemberClusterAPIServerUnhealthy" + clusterHealthyReason = "MemberClusterAPIServerHealthy" +) + +// Reconciler reconciles a MemberCluster object and creates the corresponding ClusterProfile +// object in the designated namespace. 
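+// ClusterProfileNamespace is the namespace in which the ClusterProfile objects are written, and
+// ClusterUnhealthyThreshold is how long the controller tolerates a missing member agent heartbeat
+// before the cluster's control plane health condition is reported as False.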
+type Reconciler struct { + client.Client + ClusterProfileNamespace string + ClusterUnhealthyThreshold time.Duration +} + +// Reconcile processes the MemberCluster object and creates the corresponding ClusterProfile object +// in the designated namespace. +func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + mcRef := klog.KRef(req.Namespace, req.Name) + startTime := time.Now() + klog.V(2).InfoS("Reconciliation starts (cluster profile controller)", "memberCluster", mcRef) + defer func() { + latency := time.Since(startTime).Milliseconds() + klog.V(2).InfoS("Reconciliation ends (cluster profile controller)", "memberCluster", mcRef, "latency", latency) + }() + + // Retrieve the MemberCluster object. + mc := &clusterv1beta1.MemberCluster{} + if err := r.Get(ctx, req.NamespacedName, mc); err != nil { + if errors.IsNotFound(err) { + klog.InfoS("Member cluster object is not found", "memberCluster", mcRef) + // To address the case where a member cluster is deleted before its cluster profile is cleaned up + // since we didn't put the logic in the member cluster controller + // or a cluster profile has been created without the acknowledgment of this controller. + if err = r.cleanupClusterProfile(ctx, req.Name); err != nil { + klog.ErrorS(err, "Failed to clean up the cluster profile when the member cluster is already gone", "memberCluster", mcRef) + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + klog.ErrorS(err, "Failed to get member cluster", "memberCluster", mcRef) + return ctrl.Result{}, err + } + + // Check if the member cluster object has been marked for deletion. + if mc.DeletionTimestamp != nil { + klog.V(2).InfoS("Member cluster object is being deleted; remove the corresponding cluster profile", "memberCluster", mcRef) + // Delete the corresponding ClusterProfile object. + if err := r.cleanupClusterProfile(ctx, mc.Name); err != nil { + klog.ErrorS(err, "Failed to clean up cluster profile when member cluster is marked for deletion", "memberCluster", mcRef) + return ctrl.Result{}, err + } + + // Remove the cleanup finalizer from the MemberCluster object. + controllerutil.RemoveFinalizer(mc, clusterProfileCleanupFinalizer) + if err := r.Update(ctx, mc); err != nil { + klog.ErrorS(err, "Failed to remove cleanup finalizer", "memberCluster", mcRef) + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + + // Check if the MemberCluster object has the cleanup finalizer; if not, add it. + if !controllerutil.ContainsFinalizer(mc, clusterProfileCleanupFinalizer) { + mc.Finalizers = append(mc.Finalizers, clusterProfileCleanupFinalizer) + if err := r.Update(ctx, mc); err != nil { + klog.ErrorS(err, "Failed to add cleanup finalizer", "memberCluster", mcRef) + return ctrl.Result{}, err + } + } + + // Retrieve the corresponding ClusterProfile object. If the object does not exist, create it. + cp := &clusterinventory.ClusterProfile{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: r.ClusterProfileNamespace, + Name: mc.Name, + }, + } + // Note that if the object already exists and its spec matches with the desired space, no + // update op will be performed. + createOrUpdateRes, err := controllerutil.CreateOrUpdate(ctx, r, cp, func() error { + if cp.CreationTimestamp.IsZero() { + // Only set the ClusterManager field if the object is being created; this field + // is immutable by definition. 
+ cp.Spec = clusterinventory.ClusterProfileSpec{ + ClusterManager: clusterinventory.ClusterManager{ + Name: controller.ClusterManagerName, + }, + } + } + // log an unexpected error if the cluster profile is under the management of a different platform. + if cp.Spec.ClusterManager.Name != controller.ClusterManagerName { + klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("found another clustrer Manager: `%s`", cp.Spec.ClusterManager.Name)), + "Cluster profile is under the management of a different platform", "memberCluster", mcRef, "clusterProfile", klog.KObj(cp)) + return nil + } + + // Set the labels. + if cp.Labels == nil { + cp.Labels = make(map[string]string) + } + cp.Labels[clusterinventory.LabelClusterManagerKey] = controller.ClusterManagerName + + // Set the display name. + cp.Spec.DisplayName = mc.Name + return nil + }) + if err != nil { + klog.ErrorS(err, "Failed to create or update cluster profile", "memberCluster", mcRef, "clusterProfile", klog.KObj(cp), "operation", createOrUpdateRes) + return ctrl.Result{}, err + } + klog.V(2).InfoS("Cluster profile object is created or updated", "memberCluster", mcRef, "clusterProfile", klog.KObj(cp), "operation", createOrUpdateRes) + // sync the cluster profile condition from the member cluster condition + r.syncClusterProfileCondition(mc, cp) + if err = r.Status().Update(ctx, cp); err != nil { + klog.ErrorS(err, "Failed to update cluster profile status", "memberCluster", mcRef, "clusterProfile", klog.KObj(cp)) + return ctrl.Result{}, err + } + return ctrl.Result{}, nil +} + +// syncClusterProfileCondition syncs the ClusterProfile object's condition based on the MemberCluster object's condition. +func (r *Reconciler) syncClusterProfileCondition(mc *clusterv1beta1.MemberCluster, cp *clusterinventory.ClusterProfile) { + // Update the cluster profile status. + // + // For simplicity reasons, for now only the health check condition is populated, using + // Fleet member agent's API server health check result. + var mcHealthCond *metav1.Condition + var memberAgentLastHeartbeat *metav1.Time + + memberAgentStatus := mc.GetAgentStatus(clusterv1beta1.MemberAgent) + if memberAgentStatus != nil { + mcHealthCond = meta.FindStatusCondition(memberAgentStatus.Conditions, string(clusterv1beta1.AgentHealthy)) + memberAgentLastHeartbeat = &memberAgentStatus.LastReceivedHeartbeat + } + switch { + case memberAgentStatus == nil: + // The member agent hasn't reported its status yet. + // Set the unknown health condition in the cluster profile status. + meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{ + Type: clusterinventory.ClusterConditionControlPlaneHealthy, + Status: metav1.ConditionUnknown, + Reason: clusterNoStatusReason, + ObservedGeneration: cp.Generation, + Message: "The Fleet member agent has not reported its status yet", + }) + case mcHealthCond == nil: + // The member agent has reported its status, but the health condition is missing. + // Set the unknown health condition in the cluster profile status. 
+ meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{ + Type: clusterinventory.ClusterConditionControlPlaneHealthy, + Status: metav1.ConditionUnknown, + Reason: clusterHealthUnknownReason, + ObservedGeneration: cp.Generation, + Message: "The Fleet member agent has reported its status, but the health condition is missing", + }) + case memberAgentLastHeartbeat == nil || time.Since(memberAgentLastHeartbeat.Time) > r.ClusterUnhealthyThreshold: + // The member agent has lost its heartbeat connection to the Fleet hub cluster. + // Set the unknown health condition in the cluster profile status. + meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{ + Type: clusterinventory.ClusterConditionControlPlaneHealthy, + Status: metav1.ConditionFalse, + Reason: clusterHeartbeatLostReason, + ObservedGeneration: cp.Generation, + Message: "The Fleet member agent has lost its heartbeat connection to the Fleet hub cluster", + }) + // TODO: Add the generation check after Fleet member agent handle the health condition appropriately. + case mcHealthCond.Status == metav1.ConditionUnknown: + // The health condition has not been updated. + // Set the unknown health condition in the cluster profile status. + meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{ + Type: clusterinventory.ClusterConditionControlPlaneHealthy, + Status: metav1.ConditionUnknown, + Reason: clusterHealthUnknownReason, + ObservedGeneration: cp.Generation, + Message: "The Fleet member agent health check result is out of date or unknown", + }) + case mcHealthCond.Status == metav1.ConditionFalse: + // The member agent reports that the API server is unhealthy. + // Set the false health condition in the cluster profile status. + meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{ + Type: clusterinventory.ClusterConditionControlPlaneHealthy, + Status: metav1.ConditionFalse, + Reason: clusterUnHealthyReason, + ObservedGeneration: cp.Generation, + Message: "The Fleet member agent reports that the API server is unhealthy", + }) + default: + // The member agent reports that the API server is healthy. + // Set the true health condition in the cluster profile status. + meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{ + Type: clusterinventory.ClusterConditionControlPlaneHealthy, + Status: metav1.ConditionTrue, + Reason: clusterHealthyReason, + ObservedGeneration: cp.Generation, + Message: "The Fleet member agent reports that the API server is healthy", + }) + } +} + +// cleanupClusterProfile deletes the ClusterProfile object associated with a given MemberCluster object. +func (r *Reconciler) cleanupClusterProfile(ctx context.Context, clusterName string) error { + cp := &clusterinventory.ClusterProfile{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: r.ClusterProfileNamespace, + Name: clusterName, + }, + } + klog.V(2).InfoS("delete the cluster profile", "memberCluster", clusterName, "clusterProfile", klog.KObj(cp)) + if err := r.Delete(ctx, cp); err != nil && !errors.IsNotFound(err) { + klog.ErrorS(err, "Failed to delete the cluster profile", "memberCluster", clusterName, "clusterProfile", klog.KObj(cp)) + return err + } + return nil +} + +// SetupWithManager sets up the controller with the controller manager. +func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&clusterv1beta1.MemberCluster{}). 
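+		// Also watch ClusterProfile delete events: a profile removed out of band is requeued
+		// under its member cluster's name and recreated on the next reconcile.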
+ Watches(&clusterinventory.ClusterProfile{}, handler.Funcs{ + DeleteFunc: func(ctx context.Context, e event.DeleteEvent, q workqueue.RateLimitingInterface) { + klog.V(2).InfoS("Handling a clusterProfile delete event", "clusterProfile", klog.KObj(e.Object)) + q.Add(reconcile.Request{ + NamespacedName: types.NamespacedName{Name: e.Object.GetName()}, + }) + }, + }). + Complete(r) +} diff --git a/pkg/controllers/clusterinventory/clusterprofile/controller_integration_test.go b/pkg/controllers/clusterinventory/clusterprofile/controller_integration_test.go new file mode 100644 index 000000000..1039793bd --- /dev/null +++ b/pkg/controllers/clusterinventory/clusterprofile/controller_integration_test.go @@ -0,0 +1,128 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. +*/ +package clusterprofile + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" + "go.goms.io/fleet/pkg/utils" + "go.goms.io/fleet/pkg/utils/condition" +) + +const ( + testTargetCluster = "test-cluster" + clusterProfileNS = "default" + + eventuallyTimeout = time.Second * 5 + consistentlyDuration = time.Second * 10 + interval = time.Millisecond * 250 +) + +// This container cannot be run in parallel with other ITs because it uses a shared fakePlacementController. +var _ = Describe("Test ClusterProfile Controller", func() { + var mc *clusterv1beta1.MemberCluster + var clusterProfile clusterinventory.ClusterProfile + var testMCName string + BeforeEach(func() { + testMCName = testTargetCluster + utils.RandStr() + By("Creating a new MemberCluster") + mc = memberClusterForTest(testMCName) + Expect(k8sClient.Create(ctx, mc)).Should(Succeed(), "failed to create MemberCluster") + }) + + AfterEach(func() { + By("Deleting the MemberCluster") + Expect(k8sClient.Delete(ctx, mc)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + By("Deleting the ClusterProfile") + Expect(k8sClient.Delete(ctx, &clusterProfile)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) + }) + + It("Should create a clusterProfile when a member cluster is created", func() { + By("Check the clusterProfile is created") + Eventually(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: clusterProfileNS, Name: testMCName}, &clusterProfile) + }, eventuallyTimeout, interval).Should(Succeed(), "clusterProfile is not created") + By("Check the MemberCluster has the finalizer") + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: testMCName}, mc)).Should(Succeed(), "failed to get MemberCluster") + Expect(controllerutil.ContainsFinalizer(mc, clusterProfileCleanupFinalizer)).Should(BeTrue(), "failed to add the finalizer to MemberCluster") + mc.Status.AgentStatus = []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentHealthy), + Status: metav1.ConditionTrue, + Reason: "Healthy", + Message: "Agent is healthy", + LastTransitionTime: metav1.Time{Time: time.Now()}, + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: time.Now()}, + }, + } + Expect(k8sClient.Status().Update(ctx, mc)).Should(Succeed(), "failed to update member cluster status") + Eventually(func() bool { + if err 
:= k8sClient.Get(ctx, types.NamespacedName{Namespace: clusterProfileNS, Name: testMCName}, &clusterProfile); err != nil { + return false + } + cond := meta.FindStatusCondition(clusterProfile.Status.Conditions, clusterinventory.ClusterConditionControlPlaneHealthy) + return condition.IsConditionStatusTrue(cond, clusterProfile.Generation) + }, eventuallyTimeout, interval).Should(BeTrue(), "clusterProfile is not created") + }) + + It("Should recreate a clusterProfile when it is deleted by the user", func() { + By("Check the clusterProfile is created") + Eventually(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: clusterProfileNS, Name: testMCName}, &clusterProfile) + }, eventuallyTimeout, interval).Should(Succeed(), "clusterProfile is not created") + By("Deleting the ClusterProfile") + Expect(k8sClient.Delete(ctx, &clusterProfile)).Should(Succeed(), "failed to delete clusterProfile") + By("Check the clusterProfile is created again") + Eventually(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: clusterProfileNS, Name: testMCName}, &clusterProfile) + }, eventuallyTimeout, interval).Should(Succeed(), "clusterProfile is not created") + }) + + It("Should delete the clusterProfile when the MemberCluster is deleted", func() { + By("Check the clusterProfile is created") + Eventually(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: clusterProfileNS, Name: testMCName}, &clusterProfile) + }, eventuallyTimeout, interval).Should(Succeed(), "clusterProfile is not created") + By("Deleting the MemberCluster") + Expect(k8sClient.Delete(ctx, mc)).Should(Succeed(), "failed to delete clusterProfile") + By("Check the clusterProfile is deleted too") + Eventually(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: clusterProfileNS, Name: testMCName}, &clusterProfile) + }, eventuallyTimeout, interval).Should(utils.NotFoundMatcher{}, "clusterProfile is not deleted") + Consistently(func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: clusterProfileNS, Name: testMCName}, &clusterProfile) + }, consistentlyDuration, interval).Should(utils.NotFoundMatcher{}, "clusterProfile is not deleted") + }) +}) + +func memberClusterForTest(mcName string) *clusterv1beta1.MemberCluster { + return &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: mcName, + }, + Spec: clusterv1beta1.MemberClusterSpec{ + Identity: rbacv1.Subject{ + Kind: "ServiceAccount", + Name: "test-service-account", + Namespace: "fleet-system", + }, + }, + } +} diff --git a/pkg/controllers/clusterinventory/clusterprofile/controller_test.go b/pkg/controllers/clusterinventory/clusterprofile/controller_test.go new file mode 100644 index 000000000..115c04046 --- /dev/null +++ b/pkg/controllers/clusterinventory/clusterprofile/controller_test.go @@ -0,0 +1,162 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. 
+*/ + +package clusterprofile + +import ( + "testing" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" + + clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" +) + +func TestSyncClusterProfileCondition(t *testing.T) { + clusterUnhealthyThreshold := 5 * time.Minute + reconciler := &Reconciler{ + ClusterUnhealthyThreshold: clusterUnhealthyThreshold, + } + + tests := []struct { + name string + memberCluster *clusterv1beta1.MemberCluster + clusterProfile *clusterinventory.ClusterProfile + expectedConditionStatus metav1.ConditionStatus + expectedConditionReason string + }{ + { + name: "Member agent has not reported its status yet", + memberCluster: &clusterv1beta1.MemberCluster{ + Status: clusterv1beta1.MemberClusterStatus{}, + }, + clusterProfile: &clusterinventory.ClusterProfile{}, + expectedConditionStatus: metav1.ConditionUnknown, + expectedConditionReason: clusterNoStatusReason, + }, + { + name: "Member agent has reported its status, but the health condition is missing", + memberCluster: &clusterv1beta1.MemberCluster{ + Status: clusterv1beta1.MemberClusterStatus{ + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{}, + }, + }, + }, + }, + clusterProfile: &clusterinventory.ClusterProfile{}, + expectedConditionStatus: metav1.ConditionUnknown, + expectedConditionReason: clusterHealthUnknownReason, + }, + { + name: "Member agent has lost its heartbeat connection to the Fleet hub cluster", + memberCluster: &clusterv1beta1.MemberCluster{ + Status: clusterv1beta1.MemberClusterStatus{ + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentHealthy), + Status: metav1.ConditionTrue, + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: time.Now().Add(-10 * clusterUnhealthyThreshold)}, + }, + }, + }, + }, + clusterProfile: &clusterinventory.ClusterProfile{}, + expectedConditionStatus: metav1.ConditionFalse, + expectedConditionReason: clusterHeartbeatLostReason, + }, + { + name: "Member agent health check result is out of date or unknown", + memberCluster: &clusterv1beta1.MemberCluster{ + Status: clusterv1beta1.MemberClusterStatus{ + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentHealthy), + Status: metav1.ConditionUnknown, + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: time.Now().Add(-1 * time.Second)}, + }, + }, + }, + }, + clusterProfile: &clusterinventory.ClusterProfile{}, + expectedConditionStatus: metav1.ConditionUnknown, + expectedConditionReason: clusterHealthUnknownReason, + }, + { + name: "Member agent reports that the API server is unhealthy", + memberCluster: &clusterv1beta1.MemberCluster{ + Status: clusterv1beta1.MemberClusterStatus{ + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentHealthy), + Status: metav1.ConditionFalse, + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: time.Now().Add(-1 * time.Second)}, + }, + }, + }, + }, + clusterProfile: &clusterinventory.ClusterProfile{}, + expectedConditionStatus: metav1.ConditionFalse, + expectedConditionReason: clusterUnHealthyReason, + }, + { + name: "Member agent reports that the API server is healthy", + memberCluster: 
&clusterv1beta1.MemberCluster{ + Status: clusterv1beta1.MemberClusterStatus{ + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + LastReceivedHeartbeat: metav1.Time{Time: time.Now().Add(-1 * time.Second)}, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentHealthy), + Status: metav1.ConditionTrue, + }, + }, + }, + }, + }, + }, + clusterProfile: &clusterinventory.ClusterProfile{}, + expectedConditionStatus: metav1.ConditionTrue, + expectedConditionReason: clusterHealthyReason, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + reconciler.syncClusterProfileCondition(tt.memberCluster, tt.clusterProfile) + condition := meta.FindStatusCondition(tt.clusterProfile.Status.Conditions, clusterinventory.ClusterConditionControlPlaneHealthy) + if condition == nil { + t.Fatalf("expected condition to be set, but it was not") + } + if condition.Status != tt.expectedConditionStatus { + t.Errorf("test case `%s` failed, expected condition status %v, got %v", tt.name, tt.expectedConditionStatus, condition.Status) + } + if condition.Reason != tt.expectedConditionReason { + t.Errorf("test case `%s` failed, expected condition reason %v, got %v", tt.name, tt.expectedConditionReason, condition.Reason) + } + }) + } +} diff --git a/pkg/controllers/clusterinventory/clusterprofile/suite_test.go b/pkg/controllers/clusterinventory/clusterprofile/suite_test.go new file mode 100644 index 000000000..c80eb8124 --- /dev/null +++ b/pkg/controllers/clusterinventory/clusterprofile/suite_test.go @@ -0,0 +1,108 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. +*/ + +package clusterprofile + +import ( + "context" + "flag" + "path/filepath" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + "k8s.io/klog/v2/textlogger" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" +) + +var ( + cfg *rest.Config + mgr manager.Manager + k8sClient client.Client + testEnv *envtest.Environment + ctx context.Context + cancel context.CancelFunc +) + +func TestAPIs(t *testing.T) { + RegisterFailHandler(Fail) + + RunSpecs(t, "ClusterProfile Controller Suite") +} + +var _ = BeforeSuite(func() { + ctx, cancel = context.WithCancel(context.TODO()) + + By("Setup klog") + fs := flag.NewFlagSet("klog", flag.ContinueOnError) + klog.InitFlags(fs) + Expect(fs.Parse([]string{"--v", "5", "-add_dir_header", "true"})).Should(Succeed()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("../../../../", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + } + + var err error + cfg, err = testEnv.Start() + Expect(err).Should(Succeed()) + Expect(cfg).NotTo(BeNil()) + + err = clusterinventory.AddToScheme(scheme.Scheme) + Expect(err).Should(Succeed()) + err = clusterv1beta1.AddToScheme(scheme.Scheme) + Expect(err).Should(Succeed()) + + By("construct the k8s client") + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).Should(Succeed()) + Expect(k8sClient).NotTo(BeNil()) + + By("starting the controller manager") + klog.InitFlags(flag.CommandLine) + flag.Parse() + + mgr, err = ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + Metrics: server.Options{ + BindAddress: "0", + }, + Logger: textlogger.NewLogger(textlogger.NewConfig(textlogger.Verbosity(4))), + }) + Expect(err).Should(Succeed()) + err = (&Reconciler{ + Client: mgr.GetClient(), + ClusterProfileNamespace: clusterProfileNS, + ClusterUnhealthyThreshold: 5 * time.Second, + }).SetupWithManager(mgr) + Expect(err).Should(Succeed()) + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).Should(Succeed(), "failed to run manager") + }() +}) + +var _ = AfterSuite(func() { + defer klog.Flush() + + cancel() + By("tearing down the test environment") + err := testEnv.Stop() + Expect(err).Should(Succeed()) +}) diff --git a/pkg/utils/controller/controller.go b/pkg/utils/controller/controller.go index 5ef85cfef..aa8e84f9c 100644 --- a/pkg/utils/controller/controller.go +++ b/pkg/utils/controller/controller.go @@ -27,6 +27,11 @@ import ( "go.goms.io/fleet/pkg/utils/labels" ) +const ( + // ClusterManagerName is the name of KubeFleet cluster manager. 
+ ClusterManagerName = "KubeFleet" +) + const ( labelError = "error" labelRequeueAfter = "requeue_after" diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh index dc61a89e0..f70d83055 100755 --- a/test/e2e/setup.sh +++ b/test/e2e/setup.sh @@ -122,6 +122,7 @@ helm install hub-agent ../../charts/hub-agent/ \ --set enableWebhook=true \ --set webhookClientConnectionType=service \ --set forceDeleteWaitTime="1m0s" \ + --set clusterUnhealthyThreshold="3m0s" \ --set logFileMaxSize=1000000 # Download CRDs from Fleet networking repo diff --git a/test/e2e/setup_test.go b/test/e2e/setup_test.go index 136409529..a7b3e178d 100644 --- a/test/e2e/setup_test.go +++ b/test/e2e/setup_test.go @@ -24,6 +24,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" k8sscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" + clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/client" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" @@ -238,6 +239,9 @@ func TestMain(m *testing.M) { if err := apiextensionsv1.AddToScheme(scheme); err != nil { log.Fatalf("failed to add API extensions to the runtime scheme: %v", err) } + if err := clusterinventory.AddToScheme(scheme); err != nil { + log.Fatalf("failed to add cluster inventory APIs to the runtime scheme: %v", err) + } os.Exit(m.Run()) }
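Illustrative usage (not part of the patch above): a minimal sketch, assuming the hub agent writes the ClusterProfile objects into the fleet-system namespace as configured in the charts and setup scripts here, of how a consumer pointed at the hub cluster could list the emitted profiles and read the control plane health condition that the new controller populates. The kubeconfig handling, program structure, and printed fields are assumptions for the example only.

package main

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	"k8s.io/client-go/kubernetes/scheme"
	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	// Register the cluster inventory types so the client can decode ClusterProfile objects.
	if err := clusterinventory.AddToScheme(scheme.Scheme); err != nil {
		panic(err)
	}

	// Build a client against the hub cluster from the ambient kubeconfig (assumption: the
	// caller has read access to ClusterProfile objects in the fleet-system namespace).
	hubClient, err := client.New(ctrl.GetConfigOrDie(), client.Options{Scheme: scheme.Scheme})
	if err != nil {
		panic(err)
	}

	// List the profiles emitted by the cluster profile controller.
	var profiles clusterinventory.ClusterProfileList
	if err := hubClient.List(context.Background(), &profiles, client.InNamespace("fleet-system")); err != nil {
		panic(err)
	}

	for i := range profiles.Items {
		cp := &profiles.Items[i]
		// Only the control plane health condition is populated by the controller for now.
		healthCond := meta.FindStatusCondition(cp.Status.Conditions, clusterinventory.ClusterConditionControlPlaneHealthy)
		status, reason := "Unknown", ""
		if healthCond != nil {
			status, reason = string(healthCond.Status), healthCond.Reason
		}
		fmt.Printf("cluster %s (manager %s): control plane healthy=%s reason=%s\n",
			cp.Spec.DisplayName, cp.Spec.ClusterManager.Name, status, reason)
	}
}

The controller-runtime client and the cluster-inventory API types are the same dependencies this patch adds to go.mod; a scheduler or inventory consumer would typically watch these objects through an informer rather than a one-shot list.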