Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: drift detection + takeover (full draft) #1001

Closed
wants to merge 26 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apis/cluster/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion apis/placement/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion apis/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 23 additions & 14 deletions cmd/memberagent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ import (
fleetv1alpha1 "go.goms.io/fleet/apis/v1alpha1"
imcv1alpha1 "go.goms.io/fleet/pkg/controllers/internalmembercluster/v1alpha1"
imcv1beta1 "go.goms.io/fleet/pkg/controllers/internalmembercluster/v1beta1"
"go.goms.io/fleet/pkg/controllers/work"
"go.goms.io/fleet/pkg/controllers/workapplier"
workv1alpha1controller "go.goms.io/fleet/pkg/controllers/workv1alpha1"
fleetmetrics "go.goms.io/fleet/pkg/metrics"
"go.goms.io/fleet/pkg/propertyprovider"
Expand All @@ -68,12 +68,14 @@ var (
metricsAddr = flag.String("metrics-bind-address", ":8090", "The address the metric endpoint binds to.")
enableLeaderElection = flag.Bool("leader-elect", false,
"Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
leaderElectionNamespace = flag.String("leader-election-namespace", "kube-system", "The namespace in which the leader election resource will be created.")
enableV1Alpha1APIs = flag.Bool("enable-v1alpha1-apis", true, "If set, the agents will watch for the v1alpha1 APIs.")
enableV1Beta1APIs = flag.Bool("enable-v1beta1-apis", false, "If set, the agents will watch for the v1beta1 APIs.")
propertyProvider = flag.String("property-provider", "none", "The property provider to use for the agent.")
region = flag.String("region", "", "The region where the member cluster resides.")
cloudConfigFile = flag.String("cloud-config", "/etc/kubernetes/provider/config.json", "The path to the cloud cloudconfig file.")
leaderElectionNamespace = flag.String("leader-election-namespace", "kube-system", "The namespace in which the leader election resource will be created.")
enableV1Alpha1APIs = flag.Bool("enable-v1alpha1-apis", true, "If set, the agents will watch for the v1alpha1 APIs.")
enableV1Beta1APIs = flag.Bool("enable-v1beta1-apis", false, "If set, the agents will watch for the v1beta1 APIs.")
propertyProvider = flag.String("property-provider", "none", "The property provider to use for the agent.")
region = flag.String("region", "", "The region where the member cluster resides.")
cloudConfigFile = flag.String("cloud-config", "/etc/kubernetes/provider/config.json", "The path to the cloud cloudconfig file.")
availabilityCheckInterval = flag.Int("availability-check-interval", 5, "The interval in seconds between attempts to check for resource availability when resources are not yet available.")
driftDetectionInterval = flag.Int("drift-detection-interval", 15, "The interval in seconds between attempts to detect configuration drifts in the cluster.")
)

func init() {
Expand Down Expand Up @@ -348,15 +350,22 @@ func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memb
klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk)
return err
}
// create the work controller, so we can pass it to the internal member cluster reconciler
workController := work.NewApplyWorkReconciler(

// Prepare the work applier.
workApplier := workapplier.NewReconciler(
hubMgr.GetClient(),
targetNS,
spokeDynamicClient,
memberMgr.GetClient(),
restMapper, hubMgr.GetEventRecorderFor("work_controller"), 5, targetNS)

if err = workController.SetupWithManager(hubMgr); err != nil {
klog.ErrorS(err, "Failed to create v1beta1 controller", "controller", "work")
restMapper,
hubMgr.GetEventRecorderFor("work_applier"),
5,
4,
Comment on lines +362 to +363
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where do those magic number come from?

time.Second*time.Duration(*availabilityCheckInterval),
time.Second*time.Duration(*driftDetectionInterval),
)
if err = workApplier.SetupWithManager(hubMgr); err != nil {
klog.ErrorS(err, "Failed to create work applier controller", "controller", "workapplier")
return err
}

Expand All @@ -383,7 +392,7 @@ func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memb
ctx,
hubMgr.GetClient(),
memberMgr.GetConfig(), memberMgr.GetClient(),
workController,
workApplier,
pp)
if err != nil {
klog.ErrorS(err, "Failed to create InternalMemberCluster v1beta1 reconciler")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,10 @@ spec:
description: |-
Value defines the content to be applied on the target location.
Value should be empty when operator is `remove`.
We have reserved a few variables in this field that will be replaced by the actual values.
Those variables all start with `$` and are case sensitive.
Here is the list of currently supported variables:
`${MEMBER-CLUSTER-NAME}`: this will be replaced by the name of the memberCluster CR that represents this cluster.
x-kubernetes-preserve-unknown-fields: true
required:
- op
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,10 @@ spec:
description: |-
Value defines the content to be applied on the target location.
Value should be empty when operator is `remove`.
We have reserved a few variables in this field that will be replaced by the actual values.
Those variables all start with `$` and are case sensitive.
Here is the list of currently supported variables:
`${MEMBER-CLUSTER-NAME}`: this will be replaced by the name of the memberCluster CR that represents this cluster.
x-kubernetes-preserve-unknown-fields: true
required:
- op
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,10 @@ spec:
description: |-
Value defines the content to be applied on the target location.
Value should be empty when operator is `remove`.
We have reserved a few variables in this field that will be replaced by the actual values.
Those variables all start with `$` and are case sensitive.
Here is the list of currently supported variables:
`${MEMBER-CLUSTER-NAME}`: this will be replaced by the name of the memberCluster CR that represents this cluster.
x-kubernetes-preserve-unknown-fields: true
required:
- op
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,10 @@ spec:
description: |-
Value defines the content to be applied on the target location.
Value should be empty when operator is `remove`.
We have reserved a few variables in this field that will be replaced by the actual values.
Those variables all start with `$` and are case sensitive.
Here is the list of currently supported variables:
`${MEMBER-CLUSTER-NAME}`: this will be replaced by the name of the memberCluster CR that represents this cluster.
x-kubernetes-preserve-unknown-fields: true
required:
- op
Expand Down
8 changes: 7 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ require (
github.com/onsi/gomega v1.35.1
github.com/prometheus/client_golang v1.19.1
github.com/prometheus/client_model v0.6.1
github.com/qri-io/jsonpointer v0.1.1
github.com/spf13/cobra v1.8.1
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.9.0
go.goms.io/fleet-networking v0.2.7
github.com/wI2L/jsondiff v0.6.0
go.goms.io/fleet-networking v0.2.11
go.uber.org/atomic v1.11.0
go.uber.org/zap v1.27.0
golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6
Expand Down Expand Up @@ -102,6 +104,10 @@ require (
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/samber/lo v1.38.1 // indirect
github.com/tidwall/gjson v1.18.0 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/tidwall/sjson v1.2.5 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.52.0 // indirect
go.opentelemetry.io/otel v1.31.0 // indirect
go.opentelemetry.io/otel/metric v1.31.0 // indirect
Expand Down
18 changes: 16 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/qri-io/jsonpointer v0.1.1 h1:prVZBZLL6TW5vsSB9fFHFAMBLI4b0ri5vribQlTJiBA=
github.com/qri-io/jsonpointer v0.1.1/go.mod h1:DnJPaYgiKu56EuDp8TU5wFLdZIcAnb/uH9v37ZaMV64=
github.com/redis/go-redis/v9 v9.6.1 h1:HHDteefn6ZkTtY5fGUE8tj8uy85AHk6zP7CpzIAM0y4=
github.com/redis/go-redis/v9 v9.6.1/go.mod h1:0C0c6ycQsdpVNQpxb1njEQIqkx5UcsM8FJCQLgE9+RA=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
Expand All @@ -232,11 +234,23 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
github.com/wI2L/jsondiff v0.6.0 h1:zrsH3FbfVa3JO9llxrcDy/XLkYPLgoMX6Mz3T2PP2AI=
github.com/wI2L/jsondiff v0.6.0/go.mod h1:D6aQ5gKgPF9g17j+E9N7aasmU1O+XvfmWm1y8UMmNpw=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.goms.io/fleet-networking v0.2.7 h1:lVs2/GiCjo18BRgACib+VPnENUMh+2YbYXoeNtcAvw0=
go.goms.io/fleet-networking v0.2.7/go.mod h1:JoWG82La5nV29mooOnPpIhy6/Pi4oGXQk21CPF1UStg=
go.goms.io/fleet-networking v0.2.11 h1:N5/5TckwEipA8s4DC71lzwdz88P3mtXdvRKGL8Cg/JA=
go.goms.io/fleet-networking v0.2.11/go.mod h1:tenpiBceno4hiCakjCMz/KCAwRlBz0+/kER6lSXy+Mk=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.52.0 h1:9l89oX4ba9kHbBol3Xin3leYJ+252h0zszDtBwyKe2A=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.52.0/go.mod h1:XLZfZboOJWHNKUv7eH0inh0E9VV6eWDFB/9yJyTLPp0=
go.opentelemetry.io/otel v1.31.0 h1:NsJcKPIW0D0H3NgzPDHmo0WW6SptzPdqg/L1zsIm2hY=
Expand Down
10 changes: 10 additions & 0 deletions pkg/controllers/clusterresourcebindingwatcher/watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,16 @@ func isBindingStatusUpdated(oldBinding, newBinding *fleetv1beta1.ClusterResource
klog.V(2).InfoS("The binding failed placement has changed, need to update the corresponding CRP", "oldBinding", klog.KObj(oldBinding), "newBinding", klog.KObj(newBinding))
return true
}

if !utils.IsDriftedResourcePlacementEqual(oldBinding.Status.DriftedPlacements, newBinding.Status.DriftedPlacements) {
klog.V(2).InfoS("The binding drifted placement has changed, need to update the corresponding CRP", "oldBinding", klog.KObj(oldBinding), "newBinding", klog.KObj(newBinding))
return true
}

if !utils.IsDiffedResourcePlacementEqual(oldBinding.Status.DiffedPlacements, newBinding.Status.DiffedPlacements) {
klog.V(2).InfoS("The binding diffed placement has changed, need to update the corresponding CRP", "oldBinding", klog.KObj(oldBinding), "newBinding", klog.KObj(newBinding))
return true
}
Comment on lines +126 to +135
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there any e2e cover this case?

klog.V(5).InfoS("The binding status has not changed, no need to update the corresponding CRP", "oldBinding", klog.KObj(oldBinding), "newBinding", klog.KObj(newBinding))
return false
}
3 changes: 2 additions & 1 deletion pkg/controllers/clusterresourceplacement/placement_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,10 @@ func (r *Reconciler) setResourcePlacementStatusPerCluster(crp *fleetv1beta1.Clus
if bindingCond.Status == metav1.ConditionFalse {
status.FailedPlacements = binding.Status.FailedPlacements
status.DiffedPlacements = binding.Status.DiffedPlacements
status.DriftedPlacements = binding.Status.DriftedPlacements
}
}
// Drifts are reported regardless of the status of the Applied condition.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there any e2e cover this case?

status.DriftedPlacements = binding.Status.DriftedPlacements
cond := metav1.Condition{
Type: string(i.ResourcePlacementConditionType()),
Status: bindingCond.Status,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ func TestValidateEviction(t *testing.T) {
RevisionHistoryLimit: ptr.To(int32(defaulter.DefaultRevisionHistoryLimitValue)),
},
}
// Must set default values here as the test target method will set them.
defaulter.SetDefaultsClusterResourcePlacement(testCRP)

testBinding1 := placementv1beta1.ClusterResourceBinding{
ObjectMeta: metav1.ObjectMeta{
Name: "test-binding-1",
Expand Down
Loading
Loading