diff --git a/Makefile b/Makefile
index fe8586f97..ddebaddb9 100644
--- a/Makefile
+++ b/Makefile
@@ -165,6 +165,7 @@ install-hub-agent-helm:
 		--set webhookClientConnectionType=service \
 		--set enableV1Alpha1APIs=true \
 		--set enableV1Beta1APIs=false \
+		--set enableClusterInventoryAPI=true \
 		--set logFileMaxSize=1000000
 
 .PHONY: e2e-v1alpha1-hub-kubeconfig-secret
diff --git a/charts/hub-agent/templates/crds/multicluster.x-k8s.io_clusterprofiles.yaml b/charts/hub-agent/templates/crds/multicluster.x-k8s.io_clusterprofiles.yaml
new file mode 120000
index 000000000..88e5230ab
--- /dev/null
+++ b/charts/hub-agent/templates/crds/multicluster.x-k8s.io_clusterprofiles.yaml
@@ -0,0 +1 @@
+../../../../config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml
\ No newline at end of file
diff --git a/charts/hub-agent/templates/deployment.yaml b/charts/hub-agent/templates/deployment.yaml
index d243463cf..1a7a98037 100644
--- a/charts/hub-agent/templates/deployment.yaml
+++ b/charts/hub-agent/templates/deployment.yaml
@@ -30,6 +30,7 @@ spec:
         - -add_dir_header
         - --enable-v1alpha1-apis={{ .Values.enableV1Alpha1APIs }}
         - --enable-v1beta1-apis={{ .Values.enableV1Beta1APIs }}
+        - --enable-cluster-inventory-apis={{ .Values.enableClusterInventoryAPI }}
        - --max-concurrent-cluster-placement={{ .Values.MaxConcurrentClusterPlacement }}
         - --concurrent-resource-change-syncs={{ .Values.ConcurrentResourceChangeSyncs }}
         - --log_file_max_size={{ .Values.logFileMaxSize }}
@@ -37,6 +38,7 @@ spec:
         - --hub-api-qps={{ .Values.hubAPIQPS }}
         - --hub-api-burst={{ .Values.hubAPIBurst }}
         - --force-delete-wait-time={{ .Values.forceDeleteWaitTime }}
+        - --cluster-unhealthy-threshold={{ .Values.clusterUnhealthyThreshold }}
         ports:
         - name: metrics
           containerPort: 8080
diff --git a/charts/hub-agent/values.yaml b/charts/hub-agent/values.yaml
index 392f45971..7a3394fe4 100644
--- a/charts/hub-agent/values.yaml
+++ b/charts/hub-agent/values.yaml
@@ -17,7 +17,7 @@ webhookServiceName: fleetwebhook
 enableGuardRail: true
 webhookClientConnectionType: service
 forceDeleteWaitTime: 15m0s
-
+clusterUnhealthyThreshold: 3m0s
 
 namespace: fleet-system
 
@@ -35,10 +35,11 @@ affinity: {}
 
 enableV1Alpha1APIs: false
 enableV1Beta1APIs: true
+enableClusterInventoryAPI: true
 
 hubAPIQPS: 250
 hubAPIBurst: 1000
 MaxConcurrentClusterPlacement: 100
 ConcurrentResourceChangeSyncs: 20
-logFileMaxSize: 1000000
+logFileMaxSize: 10000000
 MaxFleetSizeSupported: 100
diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go
index 7505ca265..b7c263fc4 100644
--- a/cmd/hubagent/main.go
+++ b/cmd/hubagent/main.go
@@ -17,6 +17,7 @@ import (
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/klog/v2"
+	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/healthz"
@@ -67,6 +68,7 @@ func init() {
 	utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
 	utilruntime.Must(fleetnetworkingv1alpha1.AddToScheme(scheme))
 	utilruntime.Must(placementv1alpha1.AddToScheme(scheme))
+	utilruntime.Must(clusterinventory.AddToScheme(scheme))
 	// +kubebuilder:scaffold:scheme
 
 	klog.InitFlags(nil)
diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go
index ceb2f2375..b5f728bfd 100644
--- a/cmd/hubagent/options/options.go
+++ b/cmd/hubagent/options/options.go
@@ -79,6 +79,8 @@ type Options struct {
 	EnableV1Alpha1APIs bool
 	// EnableV1Beta1APIs enables the agents to watch the v1beta1 CRs.
 	EnableV1Beta1APIs bool
+	// EnableClusterInventoryAPIs enables the agents to watch the cluster inventory CRs.
+	EnableClusterInventoryAPIs bool
 	// ForceDeleteWaitTime is the duration the hub agent waits before force deleting a member cluster.
 	ForceDeleteWaitTime metav1.Duration
 }
@@ -96,6 +98,7 @@ func NewOptions() *Options {
 		ConcurrentResourceChangeSyncs: 1,
 		MaxFleetSizeSupported:         100,
 		EnableV1Alpha1APIs:            false,
+		EnableClusterInventoryAPIs:    false,
 	}
 }
 
@@ -135,6 +138,7 @@ func (o *Options) AddFlags(flags *flag.FlagSet) {
 	flags.IntVar(&o.MaxFleetSizeSupported, "max-fleet-size", 100, "The max number of member clusters supported in this fleet")
 	flags.BoolVar(&o.EnableV1Alpha1APIs, "enable-v1alpha1-apis", false, "If set, the agents will watch for the v1alpha1 APIs.")
 	flags.BoolVar(&o.EnableV1Beta1APIs, "enable-v1beta1-apis", true, "If set, the agents will watch for the v1beta1 APIs.")
+	flags.BoolVar(&o.EnableClusterInventoryAPIs, "enable-cluster-inventory-apis", false, "If set, the agents will watch for the ClusterInventory APIs.")
 	flags.DurationVar(&o.ForceDeleteWaitTime.Duration, "force-delete-wait-time", 15*time.Minute, "The duration the hub agent waits before force deleting a member cluster.")
 
 	o.RateLimiterOpts.AddFlags(flags)
diff --git a/cmd/hubagent/options/validation_test.go b/cmd/hubagent/options/validation_test.go
index d94cf97bc..3965875c1 100644
--- a/cmd/hubagent/options/validation_test.go
+++ b/cmd/hubagent/options/validation_test.go
@@ -22,7 +22,7 @@ func newTestOptions(modifyOptions ModifyOptions) Options {
 	option := Options{
 		SkippedPropagatingAPIs:      "fleet.azure.com;multicluster.x-k8s.io",
 		WorkPendingGracePeriod:      metav1.Duration{Duration: 10 * time.Second},
-		ClusterUnhealthyThreshold:   metav1.Duration{Duration: 1 * time.Second},
+		ClusterUnhealthyThreshold:   metav1.Duration{Duration: 60 * time.Second},
 		WebhookClientConnectionType: "url",
 		EnableV1Alpha1APIs:          true,
 	}
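With the `AddToScheme` registration above in place, any client built from the hub agent's scheme can work with ClusterProfile objects as typed resources. A minimal sketch (not part of this PR) that lists the profiles the hub agent will generate; the `fleet-system` namespace matches the `ClusterProfileNamespace` wired up later in `setup.go`:

```go
package main

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	// Register the cluster inventory types, mirroring the AddToScheme call in main.go.
	scheme := runtime.NewScheme()
	utilruntime.Must(clusterinventory.AddToScheme(scheme))

	// Build a client against the current kubeconfig context.
	c, err := client.New(ctrl.GetConfigOrDie(), client.Options{Scheme: scheme})
	if err != nil {
		panic(err)
	}

	// List the ClusterProfile objects the hub agent maintains in fleet-system.
	var cps clusterinventory.ClusterProfileList
	if err := c.List(context.Background(), &cps, client.InNamespace("fleet-system")); err != nil {
		panic(err)
	}
	for i := range cps.Items {
		fmt.Println(cps.Items[i].Name)
	}
}
```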
diff --git a/cmd/hubagent/workload/setup.go b/cmd/hubagent/workload/setup.go
index 358ceabee..e49ca26f5 100644
--- a/cmd/hubagent/workload/setup.go
+++ b/cmd/hubagent/workload/setup.go
@@ -16,6 +16,7 @@ import (
 	"k8s.io/client-go/dynamic"
 	"k8s.io/client-go/rest"
 	"k8s.io/klog/v2"
+	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
 	ctrl "sigs.k8s.io/controller-runtime"
 	workv1alpha1 "sigs.k8s.io/work-api/pkg/apis/v1alpha1"
 
@@ -24,6 +25,7 @@ import (
 	placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
 	fleetv1alpha1 "go.goms.io/fleet/apis/v1alpha1"
 	"go.goms.io/fleet/cmd/hubagent/options"
+	"go.goms.io/fleet/pkg/controllers/clusterinventory/clusterprofile"
 	"go.goms.io/fleet/pkg/controllers/clusterresourcebindingwatcher"
 	"go.goms.io/fleet/pkg/controllers/clusterresourceplacement"
 	"go.goms.io/fleet/pkg/controllers/clusterresourceplacementwatcher"
@@ -80,6 +82,10 @@ var (
 		placementv1alpha1.GroupVersion.WithKind(placementv1alpha1.ResourceOverrideKind),
 		placementv1alpha1.GroupVersion.WithKind(placementv1alpha1.ResourceOverrideSnapshotKind),
 	}
+
+	clusterInventoryGVKs = []schema.GroupVersionKind{
+		clusterinventory.GroupVersion.WithKind("ClusterProfile"),
+	}
 )
 
 // SetupControllers set up the customized controllers we developed
@@ -92,32 +98,32 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager,
 	discoverClient := discovery.NewDiscoveryClientForConfigOrDie(config)
 	// Verify CRD installation status.
-	if opts.EnableV1Alpha1APIs {
-		for _, gvk := range v1Alpha1RequiredGVKs {
+	if opts.EnableClusterInventoryAPIs {
+		for _, gvk := range clusterInventoryGVKs {
 			if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil {
 				klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk)
 				return err
 			}
 		}
-	}
-
-	if opts.EnableV1Beta1APIs {
-		for _, gvk := range v1Beta1RequiredGVKs {
-			if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil {
-				klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk)
-				return err
-			}
+		klog.Info("Setting up cluster profile controller")
+		if err = (&clusterprofile.Reconciler{
+			HubClient:                 mgr.GetClient(),
+			ClusterProfileNamespace:   utils.FleetSystemNamespace,
+			ClusterUnhealthyThreshold: opts.ClusterUnhealthyThreshold.Duration,
+		}).SetupWithManager(mgr); err != nil {
+			klog.ErrorS(err, "unable to set up ClusterProfile controller")
+			return err
 		}
 	}
 
 	// AllowedPropagatingAPIs and SkippedPropagatingAPIs are mutually exclusive.
 	// If none of them are set, the resourceConfig by default stores a list of skipped propagation APIs.
 	resourceConfig := utils.NewResourceConfig(opts.AllowedPropagatingAPIs != "")
-	if err := resourceConfig.Parse(opts.AllowedPropagatingAPIs); err != nil {
+	if err = resourceConfig.Parse(opts.AllowedPropagatingAPIs); err != nil {
 		// The program will never go here because the parameters have been checked.
 		return err
 	}
-	if err := resourceConfig.Parse(opts.SkippedPropagatingAPIs); err != nil {
+	if err = resourceConfig.Parse(opts.SkippedPropagatingAPIs); err != nil {
 		// The program will never go here because the parameters have been checked
 		return err
 	}
@@ -153,32 +159,16 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager,
 	rateLimiter := options.DefaultControllerRateLimiter(opts.RateLimiterOpts)
 	var clusterResourcePlacementControllerV1Alpha1 controller.Controller
 	var clusterResourcePlacementControllerV1Beta1 controller.Controller
-
+	var memberClusterPlacementController controller.Controller
 	if opts.EnableV1Alpha1APIs {
+		for _, gvk := range v1Alpha1RequiredGVKs {
+			if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil {
+				klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk)
+				return err
+			}
+		}
 		klog.Info("Setting up clusterResourcePlacement v1alpha1 controller")
 		clusterResourcePlacementControllerV1Alpha1 = controller.NewController(crpControllerV1Alpha1Name, controller.NamespaceKeyFunc, crpc.ReconcileV1Alpha1, rateLimiter)
-	}
-
-	if opts.EnableV1Beta1APIs {
-		klog.Info("Setting up clusterResourcePlacement v1beta1 controller")
-		clusterResourcePlacementControllerV1Beta1 = controller.NewController(crpControllerV1Beta1Name, controller.NamespaceKeyFunc, crpc.Reconcile, rateLimiter)
-	}
-
-	// Set up a new controller to reconcile any resources in the cluster
-	klog.Info("Setting up resource change controller")
-	rcr := &resourcechange.Reconciler{
-		DynamicClient:               dynamicClient,
-		Recorder:                    mgr.GetEventRecorderFor(resourceChangeControllerName),
-		RestMapper:                  mgr.GetRESTMapper(),
-		InformerManager:             dynamicInformerManager,
-		PlacementControllerV1Alpha1: clusterResourcePlacementControllerV1Alpha1,
-		PlacementControllerV1Beta1:  clusterResourcePlacementControllerV1Beta1,
-	}
-
-	resourceChangeController := controller.NewController(resourceChangeControllerName, controller.ClusterWideKeyFunc, rcr.Reconcile, rateLimiter)
-
-	var memberClusterPlacementController controller.Controller
-	if opts.EnableV1Alpha1APIs {
 		klog.Info("Setting up member cluster change controller")
 		mcp := &memberclusterplacement.Reconciler{
 			InformerManager:     dynamicInformerManager,
@@ -188,6 +178,14 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager,
 	}
 
 	if opts.EnableV1Beta1APIs {
+		for _, gvk := range v1Beta1RequiredGVKs {
+			if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil {
+				klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk)
+				return err
+			}
+		}
+		klog.Info("Setting up clusterResourcePlacement v1beta1 controller")
+		clusterResourcePlacementControllerV1Beta1 = controller.NewController(crpControllerV1Beta1Name, controller.NamespaceKeyFunc, crpc.Reconcile, rateLimiter)
 		klog.Info("Setting up clusterResourcePlacement watcher")
 		if err := (&clusterresourceplacementwatcher.Reconciler{
 			PlacementController: clusterResourcePlacementControllerV1Beta1,
@@ -310,6 +308,18 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager,
 		}
 	}
 
+	// Set up a new controller to reconcile any resources in the cluster
+	klog.Info("Setting up resource change controller")
+	rcr := &resourcechange.Reconciler{
+		DynamicClient:               dynamicClient,
+		Recorder:                    mgr.GetEventRecorderFor(resourceChangeControllerName),
+		RestMapper:                  mgr.GetRESTMapper(),
+		InformerManager:             dynamicInformerManager,
+		PlacementControllerV1Alpha1: clusterResourcePlacementControllerV1Alpha1,
+		PlacementControllerV1Beta1:  clusterResourcePlacementControllerV1Beta1,
+	}
+	resourceChangeController := controller.NewController(resourceChangeControllerName, controller.ClusterWideKeyFunc, rcr.Reconcile, rateLimiter)
+
 	// Set up a runner that starts all the custom controllers we created above
 	resourceChangeDetector := &resourcewatcher.ChangeDetector{
 		DiscoveryClient: discoverClient,
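The reshuffled setup now gates each CRD group behind its own enable flag before starting the corresponding controllers. The diff does not show `utils.CheckCRDInstalled` itself; below is a minimal sketch of how such a check can be written against the discovery client, assuming a plain one-shot lookup (the real helper in `go.goms.io/fleet/pkg/utils` may differ, for example by retrying):

```go
package crdcheck

import (
	"fmt"

	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/discovery"
)

// checkCRDInstalled returns nil if the API server advertises a resource for the given GVK.
func checkCRDInstalled(dc discovery.DiscoveryInterface, gvk schema.GroupVersionKind) error {
	// Ask the API server which resources it serves under the group/version.
	resources, err := dc.ServerResourcesForGroupVersion(gvk.GroupVersion().String())
	if err != nil {
		return fmt.Errorf("failed to discover group version %s: %w", gvk.GroupVersion(), err)
	}
	for _, r := range resources.APIResources {
		if r.Kind == gvk.Kind {
			return nil
		}
	}
	return fmt.Errorf("kind %s is not served under %s", gvk.Kind, gvk.GroupVersion())
}
```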
diff --git a/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml b/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml
new file mode 100644
index 000000000..f8883b17d
--- /dev/null
+++ b/config/crd/bases/multicluster.x-k8s.io_clusterprofiles.yaml
@@ -0,0 +1,185 @@
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.14.0
+  name: clusterprofiles.multicluster.x-k8s.io
+spec:
+  group: multicluster.x-k8s.io
+  names:
+    kind: ClusterProfile
+    listKind: ClusterProfileList
+    plural: clusterprofiles
+    singular: clusterprofile
+  scope: Namespaced
+  versions:
+  - name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: ClusterProfile represents a single cluster in a multi-cluster
+          deployment.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: ClusterProfileSpec defines the desired state of ClusterProfile.
+            properties:
+              clusterManager:
+                description: ClusterManager defines which cluster manager owns this
+                  ClusterProfile resource
+                properties:
+                  name:
+                    description: Name defines the name of the cluster manager
+                    type: string
+                required:
+                - name
+                type: object
+                x-kubernetes-validations:
+                - message: ClusterManager is immutable
+                  rule: self == oldSelf
+              displayName:
+                description: DisplayName defines a human-readable name of the ClusterProfile
+                type: string
+            required:
+            - clusterManager
+            type: object
+          status:
+            description: ClusterProfileStatus defines the observed state of ClusterProfile.
+            properties:
+              conditions:
+                description: Conditions contains the different condition statuses
+                  for this cluster.
+                items:
+                  description: "Condition contains details for one aspect of the current
+                    state of this API Resource.\n---\nThis struct is intended for
+                    direct use as an array at the field path .status.conditions. For
+                    example,\n\n\n\ttype FooStatus struct{\n\t    // Represents the
+                    observations of a foo's current state.\n\t    // Known .status.conditions.type
+                    are: \"Available\", \"Progressing\", and \"Degraded\"\n\t    //
+                    +patchMergeKey=type\n\t    // +patchStrategy=merge\n\t    // +listType=map\n\t
+                    \   // +listMapKey=type\n\t    Conditions []metav1.Condition
+                    `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\"
+                    protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t    // other
+                    fields\n\t}"
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        lastTransitionTime is the last time the condition transitioned from one status to another.
+                        This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    message:
+                      description: |-
+                        message is a human readable message indicating details about the transition.
+                        This may be an empty string.
+                      maxLength: 32768
+                      type: string
+                    observedGeneration:
+                      description: |-
+                        observedGeneration represents the .metadata.generation that the condition was set based upon.
+                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                        with respect to the current state of the instance.
+                      format: int64
+                      minimum: 0
+                      type: integer
+                    reason:
+                      description: |-
+                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                        Producers of specific condition types may define expected values and meanings for this field,
+                        and whether the values are considered a guaranteed API.
+                        The value should be a CamelCase string.
+                        This field may not be empty.
+                      maxLength: 1024
+                      minLength: 1
+                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                      type: string
+                    status:
+                      description: status of the condition, one of True, False, Unknown.
+                      enum:
+                      - "True"
+                      - "False"
+                      - Unknown
+                      type: string
+                    type:
+                      description: |-
+                        type of condition in CamelCase or in foo.example.com/CamelCase.
+                        ---
+                        Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be
+                        useful (see .node.status.conditions), the ability to deconflict is important.
+                        The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt)
+                      maxLength: 316
+                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - message
+                  - reason
+                  - status
+                  - type
+                  type: object
+                type: array
+              properties:
+                description: |-
+                  Properties defines name/value pairs to represent properties of a cluster.
+                  It could be a collection of ClusterProperty (KEP-2149) resources,
+                  but could also be info based on other implementations.
+                  The names of the properties can be predefined names from ClusterProperty resources
+                  and is allowed to be customized by different cluster managers.
+                items:
+                  description: |-
+                    Property defines a name/value pair to represent a property of a cluster.
+                    It could be a ClusterProperty (KEP-2149) resource,
+                    but could also be info based on other implementations.
+                    The name of the property can be predefined name from a ClusterProperty resource
+                    and is allowed to be customized by different cluster managers.
+                    This property can store various configurable details and metrics of a cluster,
+                    which may include information such as the number of nodes, total and free CPU,
+                    and total and free memory, among other potential attributes.
+                  properties:
+                    name:
+                      description: |-
+                        Name is the name of a property resource on cluster. It's a well-known
+                        or customized name to identify the property.
+                      maxLength: 253
+                      minLength: 1
+                      type: string
+                    value:
+                      description: Value is a property-dependent string
+                      maxLength: 1024
+                      minLength: 1
+                      type: string
+                  required:
+                  - name
+                  - value
+                  type: object
+                type: array
+              version:
+                description: Version defines the version information of the cluster.
+                properties:
+                  kubernetes:
+                    description: Kubernetes is the kubernetes version of the cluster.
+                    type: string
+                type: object
+            type: object
+        required:
+        - spec
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
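For reference, here is an object that conforms to the schema above, built with the API types vendored in this PR. The name and namespace mirror what the new controller creates (the member cluster name, under `fleet-system`); `member-1` is a hypothetical member cluster name, and `yaml.Marshal` is used only to render the manifest:

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
	"sigs.k8s.io/yaml"
)

func main() {
	// A ClusterProfile shaped the way this PR's controller creates them:
	// one object per MemberCluster, in the hub's fleet-system namespace.
	cp := clusterinventory.ClusterProfile{
		TypeMeta: metav1.TypeMeta{
			APIVersion: clusterinventory.GroupVersion.String(),
			Kind:       "ClusterProfile",
		},
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "fleet-system",
			Name:      "member-1", // hypothetical member cluster name
		},
		Spec: clusterinventory.ClusterProfileSpec{
			// Immutable after creation per the x-kubernetes-validations rule above.
			ClusterManager: clusterinventory.ClusterManager{Name: "KubeFleet"},
			DisplayName:    "member-1",
		},
	}
	out, err := yaml.Marshal(cp)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```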
diff --git a/go.mod b/go.mod
index 918077e2b..d6bc2ced5 100644
--- a/go.mod
+++ b/go.mod
@@ -23,14 +23,15 @@ require (
 	golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8
 	golang.org/x/sync v0.7.0
 	golang.org/x/time v0.5.0
-	k8s.io/api v0.30.2
+	k8s.io/api v0.30.3
 	k8s.io/apiextensions-apiserver v0.30.2
-	k8s.io/apimachinery v0.30.2
-	k8s.io/client-go v0.30.2
+	k8s.io/apimachinery v0.30.3
+	k8s.io/client-go v0.30.3
 	k8s.io/component-base v0.30.2
-	k8s.io/klog/v2 v2.120.1
+	k8s.io/klog/v2 v2.130.1
 	k8s.io/metrics v0.25.2
 	k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0
+	sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848
 	sigs.k8s.io/controller-runtime v0.18.4
 	sigs.k8s.io/work-api v0.0.0-20220407021756-586d707fdb2c
 )
@@ -43,7 +44,7 @@ require (
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/blendle/zapdriver v1.3.1 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
 	github.com/evanphx/json-patch v5.9.0+incompatible // indirect
 	github.com/fsnotify/fsnotify v1.7.0 // indirect
diff --git a/go.sum b/go.sum
index b828380d0..a9943a9c0 100644
--- a/go.sum
+++ b/go.sum
@@ -64,8 +64,9 @@ github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t
 github.com/crossplane/crossplane-runtime v1.17.0 h1:y+GvxPT1M9s8BKt2AeZJdd2d6pg2xZeCO6LiR+VxEF8=
 github.com/crossplane/crossplane-runtime v1.17.0/go.mod h1:vtglCrnnbq2HurAk9yLHa4qS0bbnCxaKL7C21cQcB/0=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
 github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
 github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls=
@@ -270,14 +271,14 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
 gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-k8s.io/api v0.30.2 h1:+ZhRj+28QT4UOH+BKznu4CBgPWgkXO7XAvMcMl0qKvI=
-k8s.io/api v0.30.2/go.mod h1:ULg5g9JvOev2dG0u2hig4Z7tQ2hHIuS+m8MNZ+X6EmI=
+k8s.io/api v0.30.3 h1:ImHwK9DCsPA9uoU3rVh4QHAHHK5dTSv1nxJUapx8hoQ=
+k8s.io/api v0.30.3/go.mod h1:GPc8jlzoe5JG3pb0KJCSLX5oAFIW3/qNJITlDj8BH04=
 k8s.io/apiextensions-apiserver v0.30.2 h1:l7Eue2t6QiLHErfn2vwK4KgF4NeDgjQkCXtEbOocKIE=
 k8s.io/apiextensions-apiserver v0.30.2/go.mod h1:lsJFLYyK40iguuinsb3nt+Sj6CmodSI4ACDLep1rgjw=
-k8s.io/apimachinery v0.30.2 h1:fEMcnBj6qkzzPGSVsAZtQThU62SmQ4ZymlXRC5yFSCg=
-k8s.io/apimachinery v0.30.2/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc=
-k8s.io/client-go v0.30.2 h1:sBIVJdojUNPDU/jObC+18tXWcTJVcwyqS9diGdWHk50=
-k8s.io/client-go v0.30.2/go.mod h1:JglKSWULm9xlJLx4KCkfLLQ7XwtlbflV6uFFSHTMgVs=
+k8s.io/apimachinery v0.30.3 h1:q1laaWCmrszyQuSQCfNB8cFgCuDAoPszKY4ucAjDwHc=
+k8s.io/apimachinery v0.30.3/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc=
+k8s.io/client-go v0.30.3 h1:bHrJu3xQZNXIi8/MoxYtZBBWQQXwy16zqJwloXXfD3k=
+k8s.io/client-go v0.30.3/go.mod h1:8d4pf8vYu665/kUbsxWAQ/JDBNWqfFeZnvFiVdmx89U=
 k8s.io/cloud-provider v0.28.3 h1:9u+JjA3zIn0nqLOOa8tWnprFkffguSAhfBvo8p7LhBQ=
 k8s.io/cloud-provider v0.28.3/go.mod h1:shAJxdrKu+SwwGUhkodxByPjaH8KBFZqXo6jU1F0ehI=
 k8s.io/component-base v0.30.2 h1:pqGBczYoW1sno8q9ObExUqrYSKhtE5rW3y6gX88GZII=
@@ -286,8 +287,8 @@ k8s.io/component-helpers v0.28.3 h1:te9ieTGzcztVktUs92X53P6BamAoP73MK0qQP0WmDqc=
 k8s.io/component-helpers v0.28.3/go.mod h1:oJR7I9ist5UAQ3y/CTdbw6CXxdMZ1Lw2Ua/EZEwnVLs=
 k8s.io/csi-translation-lib v0.28.3 h1:7deV+HZjV418AGikSDPW8dyzTpm4K3tNbQUp3KmR7cs=
 k8s.io/csi-translation-lib v0.28.3/go.mod h1:zlrYwakCz2yji9/8EaJk+afIKPrYXPNXXLDO8DVuuTk=
-k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw=
-k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
+k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
+k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
 k8s.io/kube-openapi v0.0.0-20240521193020-835d969ad83a h1:zD1uj3Jf+mD4zmA7W+goE5TxDkI7OGJjBNBzq5fJtLA=
 k8s.io/kube-openapi v0.0.0-20240521193020-835d969ad83a/go.mod h1:UxDHUPsUwTOOxSU+oXURfFBcAS6JwiRXTYqYwfuGowc=
 k8s.io/metrics v0.25.2 h1:105TuPaIFfr4EHzN56WwZJO7r1UesuDytNTzeMqGySo=
@@ -298,6 +299,8 @@ knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd h1:KJXBX9dOmRTUWduHg1gnWtPGIE
 knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd/go.mod h1:36cYnaOVHkzmhgybmYX6zDaTl3PakFeJQJl7wi6/RLE=
 sigs.k8s.io/cloud-provider-azure v1.28.2 h1:KKrWdC1+p2xXdT1VRmSkT57MhKNzPXk3yPcrwUDIr5I=
 sigs.k8s.io/cloud-provider-azure v1.28.2/go.mod h1:vDsaFOrvDDEUg0mLF2eoUeneCK+ROlRf4zACA91iwHs=
+sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848 h1:WYPi2PdQyZwZkHG648v2jQl6deyCgyjJ0fkLYgUJ618=
+sigs.k8s.io/cluster-inventory-api v0.0.0-20240730014211-ef0154379848/go.mod h1:/aN4e7RWOMHgT4xAjCNkV4YFcpKfpZCeumMIL7S+KNM=
 sigs.k8s.io/controller-runtime v0.18.4 h1:87+guW1zhvuPLh1PHybKdYFLU0YJp4FhJRmiHvm5BZw=
 sigs.k8s.io/controller-runtime v0.18.4/go.mod h1:TVoGrfdpbA9VRFaRnKgk9P5/atA0pMwq+f+msb9M8Sg=
 sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
diff --git a/hack/Azure/setup/createHubCluster.sh b/hack/Azure/setup/createHubCluster.sh
index 005c4248b..4c55ae355 100755
--- a/hack/Azure/setup/createHubCluster.sh
+++ b/hack/Azure/setup/createHubCluster.sh
@@ -26,12 +26,14 @@ helm install hub-agent charts/hub-agent/ \
     --set image.pullPolicy=Always \
     --set image.repository=$REGISTRY/hub-agent \
     --set image.tag=$TAG \
-    --set logVerbosity=2 \
+    --set logVerbosity=5 \
     --set namespace=fleet-system \
     --set enableWebhook=false \
     --set webhookClientConnectionType=service \
     --set enableV1Alpha1APIs=false \
     --set enableV1Beta1APIs=true \
+    --set clusterUnhealthyThreshold="3m0s" \
+    --set forceDeleteWaitTime="1m0s" \
     --set resources.limits.cpu=4 \
     --set resources.limits.memory=4Gi \
     --set concurrentClusterPlacementSyncs=10 \
diff --git a/hack/Azure/setup/joinMC.sh b/hack/Azure/setup/joinMC.sh
index 331286405..3a5032da1 100755
--- a/hack/Azure/setup/joinMC.sh
+++ b/hack/Azure/setup/joinMC.sh
@@ -87,7 +87,7 @@ helm install member-agent charts/member-agent/ \
     --set image.pullPolicy=Always \
     --set refreshtoken.pullPolicy=Always \
     --set config.memberClusterName=$MEMBER_CLUSTER \
-    --set logVerbosity=8 \
+    --set logVerbosity=5 \
     --set namespace=fleet-system \
     --set enableV1Alpha1APIs=false \
     --set enableV1Beta1APIs=true
diff --git a/pkg/controllers/clusterinventory/clusterprofile/controller.go b/pkg/controllers/clusterinventory/clusterprofile/controller.go
new file mode 100644
index 000000000..f0a10d94d
--- /dev/null
+++ b/pkg/controllers/clusterinventory/clusterprofile/controller.go
@@ -0,0 +1,248 @@
+/*
+Copyright (c) Microsoft Corporation.
+Licensed under the MIT license.
+*/
+
+// Package clusterprofile features a controller that generates ClusterProfile objects from MemberCluster objects.
+package clusterprofile
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/klog/v2"
+	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+
+	clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1"
+	"go.goms.io/fleet/pkg/utils"
+)
+
+const (
+	// clusterProfileCleanupFinalizer is the finalizer added to a MemberCluster object if
+	// a corresponding ClusterProfile object has been created.
+	clusterProfileCleanupFinalizer = "kubernetes-fleet.io/cluster-profile-cleanup"
+)
+
+// Reconciler reconciles a MemberCluster object and creates the corresponding ClusterProfile
+// object in the designated namespace.
+type Reconciler struct {
+	HubClient                 client.Client
+	ClusterProfileNamespace   string
+	ClusterUnhealthyThreshold time.Duration
+}
+
+// Reconcile processes the MemberCluster object and creates the corresponding ClusterProfile object
+// in the designated namespace.
+func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	mcRef := klog.KRef(req.Namespace, req.Name)
+	startTime := time.Now()
+	klog.V(2).InfoS("Reconciliation starts (cluster profile controller)", "MemberCluster", mcRef)
+	defer func() {
+		latency := time.Since(startTime).Milliseconds()
+		klog.V(2).InfoS("Reconciliation ends (cluster profile controller)", "MemberCluster", mcRef, "latency", latency)
+	}()
+
+	// Retrieve the MemberCluster object.
+	mc := &clusterv1beta1.MemberCluster{}
+	if err := r.HubClient.Get(ctx, req.NamespacedName, mc); err != nil {
+		if errors.IsNotFound(err) {
+			klog.InfoS("Member cluster object is not found", "MemberCluster", mcRef)
+			// This addresses the case where a member cluster is deleted before its cluster
+			// profile is cleaned up (the cleanup logic does not live in the member cluster
+			// controller), as well as the case where a cluster profile has been created
+			// without the acknowledgment of this controller.
+			if err = r.cleanupClusterProfile(ctx, req.Name); err != nil {
+				klog.ErrorS(err, "Failed to clean up the cluster profile when the member cluster is already gone", "MemberCluster", mcRef)
+				return ctrl.Result{}, err
+			}
+			return ctrl.Result{}, nil
+		}
+		klog.ErrorS(err, "Failed to get member cluster", "MemberCluster", mcRef)
+		return ctrl.Result{}, err
+	}
+
+	// Check if the member cluster object has been marked for deletion.
+	if mc.DeletionTimestamp != nil {
+		klog.V(2).InfoS("Member cluster object is being deleted; remove the corresponding cluster profile", "MemberCluster", mcRef)
+		// Delete the corresponding ClusterProfile object.
+		if err := r.cleanupClusterProfile(ctx, mc.Name); err != nil {
+			klog.ErrorS(err, "Failed to clean up cluster profile when member cluster is marked for deletion", "MemberCluster", mcRef)
+			return ctrl.Result{}, err
+		}
+
+		// Remove the cleanup finalizer from the MemberCluster object.
+		controllerutil.RemoveFinalizer(mc, clusterProfileCleanupFinalizer)
+		if err := r.HubClient.Update(ctx, mc); err != nil {
+			klog.ErrorS(err, "Failed to remove cleanup finalizer", "MemberCluster", mcRef)
+			return ctrl.Result{}, err
+		}
+		return ctrl.Result{}, nil
+	}
+
+	// Check if the MemberCluster object has the cleanup finalizer; if not, add it.
+	if !controllerutil.ContainsFinalizer(mc, clusterProfileCleanupFinalizer) {
+		mc.Finalizers = append(mc.Finalizers, clusterProfileCleanupFinalizer)
+		if err := r.HubClient.Update(ctx, mc); err != nil {
+			klog.ErrorS(err, "Failed to add cleanup finalizer", "MemberCluster", mcRef)
+			return ctrl.Result{}, err
+		}
+	}
+
+	// Retrieve the corresponding ClusterProfile object. If the object does not exist, create it.
+	cp := &clusterinventory.ClusterProfile{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: r.ClusterProfileNamespace,
+			Name:      mc.Name,
+		},
+	}
+	// Note that if the object already exists and its spec matches with the desired spec, no
+	// update op will be performed.
+	createOrUpdateRes, err := controllerutil.CreateOrUpdate(ctx, r.HubClient, cp, func() error {
+		if cp.CreationTimestamp.IsZero() {
+			// Only set the ClusterManager field if the object is being created; this field
+			// is immutable by definition.
+			cp.Spec = clusterinventory.ClusterProfileSpec{
+				ClusterManager: clusterinventory.ClusterManager{
+					Name: utils.ClusterManagerName,
+				},
+			}
+		}
+
+		// Return an error if the cluster profile is under the management of a different platform.
+		if cp.Spec.ClusterManager.Name != utils.ClusterManagerName {
+			return fmt.Errorf("cluster profile is under the management of a different platform: %s", cp.Spec.ClusterManager.Name)
+		}
+
+		// Set the labels.
+		if cp.Labels == nil {
+			cp.Labels = make(map[string]string)
+		}
+		cp.Labels[clusterinventory.LabelClusterManagerKey] = utils.ClusterManagerName
+
+		// Set the display name.
+		cp.Spec.DisplayName = mc.Name
+		return nil
+	})
+	if err != nil {
+		klog.ErrorS(err, "Failed to create or update cluster profile", "MemberCluster", mcRef, "ClusterProfile", klog.KObj(cp), "operation", createOrUpdateRes)
+		return ctrl.Result{}, err
+	}
+	klog.V(2).InfoS("Cluster profile object is created or updated", "MemberCluster", mcRef, "ClusterProfile", klog.KObj(cp), "operation", createOrUpdateRes)
+	// Sync the cluster profile condition from the member cluster condition.
+	r.syncClusterProfileCondition(mc, cp)
+	if err = r.HubClient.Status().Update(ctx, cp); err != nil {
+		klog.ErrorS(err, "Failed to update cluster profile status", "MemberCluster", mcRef, "ClusterProfile", klog.KObj(cp))
+		return ctrl.Result{}, err
+	}
+	return ctrl.Result{}, nil
+}
+
+// syncClusterProfileCondition syncs the ClusterProfile object's condition based on the MemberCluster object's condition.
+func (r *Reconciler) syncClusterProfileCondition(mc *clusterv1beta1.MemberCluster, cp *clusterinventory.ClusterProfile) {
+	// Update the cluster profile status.
+	//
+	// For simplicity, for now only the health check condition is populated, using the
+	// Fleet member agent's API server health check result.
+	var mcHealthCond *metav1.Condition
+	var memberAgentLastHeartbeat *metav1.Time
+
+	memberAgentStatus := mc.GetAgentStatus(clusterv1beta1.MemberAgent)
+	if memberAgentStatus != nil {
+		mcHealthCond = meta.FindStatusCondition(memberAgentStatus.Conditions, string(clusterv1beta1.AgentHealthy))
+		memberAgentLastHeartbeat = &memberAgentStatus.LastReceivedHeartbeat
+	}
+	switch {
+	case memberAgentStatus == nil:
+		// The member agent hasn't reported its status yet.
+		// Set the unknown health condition in the cluster profile status.
+		meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{
+			Type:               clusterinventory.ClusterConditionControlPlaneHealthy,
+			Status:             metav1.ConditionUnknown,
+			Reason:             "MemberAgentReportedNoStatus",
+			ObservedGeneration: cp.Generation,
+			Message:            "The Fleet member agent has not reported its status yet",
+		})
+	case mcHealthCond == nil:
+		// The member agent has reported its status, but the health condition is missing.
+		// Set the unknown health condition in the cluster profile status.
+		meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{
+			Type:               clusterinventory.ClusterConditionControlPlaneHealthy,
+			Status:             metav1.ConditionUnknown,
+			Reason:             "MemberAgentReportedNoHealthInfo",
+			ObservedGeneration: cp.Generation,
+			Message:            "The Fleet member agent has reported its status, but the health condition is missing",
+		})
+	case memberAgentLastHeartbeat == nil || time.Since(memberAgentLastHeartbeat.Time) > r.ClusterUnhealthyThreshold:
+		// The member agent has lost its heartbeat connection to the Fleet hub cluster.
+		// Set the unknown health condition in the cluster profile status.
+		meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{
+			Type:               clusterinventory.ClusterConditionControlPlaneHealthy,
+			Status:             metav1.ConditionUnknown,
+			Reason:             "MemberAgentHeartbeatLost",
+			ObservedGeneration: cp.Generation,
+			Message:            "The Fleet member agent has lost its heartbeat connection to the Fleet hub cluster",
+		})
+	//case mcHealthCond.Status == metav1.ConditionUnknown || mcHealthCond.ObservedGeneration != mc.Generation:
+	// Note (chenyu1): Skip the generation check as Fleet member agent currently does not handle
+	// the health condition appropriately.
+	case mcHealthCond.Status == metav1.ConditionUnknown:
+		// The health condition has not been updated.
+		// Set the unknown health condition in the cluster profile status.
+		meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{
+			Type:               clusterinventory.ClusterConditionControlPlaneHealthy,
+			Status:             metav1.ConditionUnknown,
+			Reason:             "MemberAgentHealthCheckResultUnknown",
+			ObservedGeneration: cp.Generation,
+			Message:            "The Fleet member agent health check result is out of date or unknown",
+		})
+	case mcHealthCond.Status == metav1.ConditionFalse:
+		// The member agent reports that the API server is unhealthy.
+		// Set the false health condition in the cluster profile status.
+		meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{
+			Type:               clusterinventory.ClusterConditionControlPlaneHealthy,
+			Status:             metav1.ConditionFalse,
+			Reason:             "MemberClusterAPIServerUnhealthy",
+			ObservedGeneration: cp.Generation,
+			Message:            "The Fleet member agent reports that the API server is unhealthy",
+		})
+	default:
+		// The member agent reports that the API server is healthy.
+		// Set the true health condition in the cluster profile status.
+		meta.SetStatusCondition(&cp.Status.Conditions, metav1.Condition{
+			Type:               clusterinventory.ClusterConditionControlPlaneHealthy,
+			Status:             metav1.ConditionTrue,
+			Reason:             "MemberClusterAPIServerHealthy",
+			ObservedGeneration: cp.Generation,
+			Message:            "The Fleet member agent reports that the API server is healthy",
+		})
+	}
+}
+
+// cleanupClusterProfile deletes the ClusterProfile object associated with a given MemberCluster object.
+func (r *Reconciler) cleanupClusterProfile(ctx context.Context, clusterName string) error {
+	cp := &clusterinventory.ClusterProfile{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: r.ClusterProfileNamespace,
+			Name:      clusterName,
+		},
+	}
+	klog.V(2).InfoS("Delete the cluster profile", "MemberCluster", clusterName, "ClusterProfile", klog.KObj(cp))
+	if err := r.HubClient.Delete(ctx, cp); err != nil && !errors.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+// SetupWithManager sets up the controller with the controller manager.
+func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&clusterv1beta1.MemberCluster{}).
+		// TO-DO (chenyu1): watch also cluster profile objects.
+		Complete(r)
+}
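Consumers of the inventory API only need the condition contract the controller maintains, not the MemberCluster internals. A sketch of a health gate a consumer might build on top of this controller's output; `fleet-system` matches the `ClusterProfileNamespace` wired up in `setup.go`, the client is assumed to use a scheme with the cluster inventory types registered, and treating `Unknown` as unhealthy is a policy choice of this example, not of the controller:

```go
package example

import (
	"context"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// isClusterHealthy reports whether the ClusterProfile for the named cluster
// currently carries a True ControlPlaneHealthy condition.
func isClusterHealthy(ctx context.Context, c client.Client, clusterName string) (bool, error) {
	var cp clusterinventory.ClusterProfile
	key := client.ObjectKey{Namespace: "fleet-system", Name: clusterName}
	if err := c.Get(ctx, key, &cp); err != nil {
		return false, err
	}
	// Missing or Unknown conditions count as unhealthy in this sketch.
	cond := meta.FindStatusCondition(cp.Status.Conditions, clusterinventory.ClusterConditionControlPlaneHealthy)
	return cond != nil && cond.Status == metav1.ConditionTrue, nil
}
```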
diff --git a/pkg/utils/common.go b/pkg/utils/common.go
index 4fb1086f7..f1bde476b 100644
--- a/pkg/utils/common.go
+++ b/pkg/utils/common.go
@@ -42,6 +42,7 @@ import (
 )
 
 const (
+	ClusterManagerName   = "KubeFleet"
 	kubePrefix           = "kube-"
 	fleetPrefix          = "fleet-"
 	FleetSystemNamespace = fleetPrefix + "system"
diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh
index 6caccc1d3..5ab924af8 100755
--- a/test/e2e/setup.sh
+++ b/test/e2e/setup.sh
@@ -122,6 +122,7 @@ helm install hub-agent ../../charts/hub-agent/ \
     --set enableWebhook=true \
     --set webhookClientConnectionType=service \
     --set forceDeleteWaitTime="1m0s" \
+    --set clusterUnhealthyThreshold="3m0s" \
     --set logFileMaxSize=1000000
 
 # Download CRDs from Fleet networking repo
diff --git a/test/e2e/setup_test.go b/test/e2e/setup_test.go
index 136409529..a7b3e178d 100644
--- a/test/e2e/setup_test.go
+++ b/test/e2e/setup_test.go
@@ -24,6 +24,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	k8sscheme "k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/klog/v2"
+	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
@@ -238,6 +239,9 @@ func TestMain(m *testing.M) {
 	if err := apiextensionsv1.AddToScheme(scheme); err != nil {
 		log.Fatalf("failed to add API extensions to the runtime scheme: %v", err)
 	}
+	if err := clusterinventory.AddToScheme(scheme); err != nil {
+		log.Fatalf("failed to add cluster inventory APIs to the runtime scheme: %v", err)
+	}
 
 	os.Exit(m.Run())
 }
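A natural follow-up assertion for the e2e suite, not included in this PR: once a member cluster joins, a matching ClusterProfile should exist and carry the KubeFleet manager name. A sketch of such a helper, where the hub client is expected to use the scheme set up in `TestMain` above (which now includes the cluster inventory APIs):

```go
package e2e

import (
	"context"
	"testing"

	clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// verifyClusterProfile checks that a member cluster has a matching ClusterProfile
// in fleet-system owned by KubeFleet.
func verifyClusterProfile(ctx context.Context, t *testing.T, hubClient client.Client, memberClusterName string) {
	t.Helper()
	var cp clusterinventory.ClusterProfile
	key := client.ObjectKey{Namespace: "fleet-system", Name: memberClusterName}
	if err := hubClient.Get(ctx, key, &cp); err != nil {
		t.Fatalf("ClusterProfile for member cluster %s not found: %v", memberClusterName, err)
	}
	if got := cp.Spec.ClusterManager.Name; got != "KubeFleet" {
		t.Fatalf("ClusterProfile is owned by %q, want %q", got, "KubeFleet")
	}
}
```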