Skip to content

Commit

Permalink
enable leader election of klusterlet-agent on single node managed clu…
Browse files Browse the repository at this point in the history
…sters (#727)

Signed-off-by: Qing Hao <[email protected]>
  • Loading branch information
haoqing0110 authored Nov 29, 2024
1 parent 93db6de commit ed367fd
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ spec:
- "--spoke-external-server-urls={{ .ExternalServerURL }}"
{{end}}
{{if eq .Replica 1}}
- "--disable-leader-election"
- "--status-sync-interval=60s"
{{end}}
{{if gt .ClientCertExpirationSeconds 0}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,6 @@ spec:
- "--spoke-kubeconfig=/spoke/config/kubeconfig"
- "--terminate-on-files=/spoke/config/kubeconfig"
{{end}}
{{if eq .Replica 1}}
- "--disable-leader-election"
{{end}}
{{if gt .ClientCertExpirationSeconds 0}}
- "--client-cert-expiration-seconds={{ .ClientCertExpirationSeconds }}"
{{end}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ spec:
{{end}}
- "--terminate-on-files=/spoke/hub-kubeconfig/kubeconfig"
{{if eq .Replica 1}}
- "--disable-leader-election"
- "--status-sync-interval=60s"
{{end}}
{{if gt .WorkKubeAPIQPS 0.0}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -402,13 +402,8 @@ func assertKlusterletDeployment(t *testing.T, actions []clienttesting.Action, ve
expectedArgs = append(expectedArgs, fmt.Sprintf("--spoke-external-server-urls=%s", serverURL))
}

expectedArgs = append(expectedArgs, "--agent-id=", "--workload-source-driver=kube", "--workload-source-config=/spoke/hub-kubeconfig/kubeconfig")

if *deployment.Spec.Replicas == 1 {
expectedArgs = append(expectedArgs, "--disable-leader-election")
}

expectedArgs = append(expectedArgs, "--status-sync-interval=60s", "--kube-api-qps=20", "--kube-api-burst=60",
expectedArgs = append(expectedArgs, "--agent-id=", "--workload-source-driver=kube", "--workload-source-config=/spoke/hub-kubeconfig/kubeconfig",
"--status-sync-interval=60s", "--kube-api-qps=20", "--kube-api-burst=60",
"--registration-auth=awsirsa",
"--hub-cluster-arn=arn:aws:eks:us-west-2:123456789012:cluster/hub-cluster1",
"--managed-cluster-arn=arn:aws:eks:us-west-2:123456789012:cluster/managed-cluster1",
Expand Down Expand Up @@ -465,10 +460,6 @@ func assertRegistrationDeployment(t *testing.T, actions []clienttesting.Action,
expectedArgs = append(expectedArgs, fmt.Sprintf("--spoke-external-server-urls=%s", serverURL))
}

if *deployment.Spec.Replicas == 1 {
expectedArgs = append(expectedArgs, "--disable-leader-election")
}

expectedArgs = append(expectedArgs, "--kube-api-qps=10", "--kube-api-burst=60")
if awsAuth {
expectedArgs = append(expectedArgs, "--registration-auth=awsirsa",
Expand Down Expand Up @@ -515,7 +506,7 @@ func assertWorkDeployment(t *testing.T, actions []clienttesting.Action, verb, cl
expectArgs = append(expectArgs, "--terminate-on-files=/spoke/hub-kubeconfig/kubeconfig")

if *deployment.Spec.Replicas == 1 {
expectArgs = append(expectArgs, "--disable-leader-election", "--status-sync-interval=60s")
expectArgs = append(expectArgs, "--status-sync-interval=60s")
}

expectArgs = append(expectArgs, "--kube-api-qps=20", "--kube-api-burst=50")
Expand Down
15 changes: 13 additions & 2 deletions pkg/registration/spoke/spokeagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,12 +354,23 @@ func (o *SpokeAgentConfig) RunSpokeAgentWithSpokeInformers(ctx context.Context,
go bootstrapInformerFactory.Start(bootstrapCtx.Done())
go secretController.Run(bootstrapCtx, 1)

// wait for the hub client config is ready.
// Wait for the hub client config is ready.
// PollUntilContextCancel periodically executes the condition func `o.internalHubConfigValidFunc`
// until one of the following conditions is met:
// - condition returns `true`: Indicates the hub client configuration
// is ready, and the polling stops successfully.
// - condition returns an error: This happens when loading the kubeconfig
// file fails or the kubeconfig is invalid. In such cases, the error is returned, causing the
// agent to exit with an error and triggering a new leader election.
// - The context is canceled: In this case, no error is returned. This ensures that
// the current leader can release leadership, allowing a new pod to get leadership quickly.
logger.Info("Waiting for hub client config and managed cluster to be ready")
if err := wait.PollUntilContextCancel(bootstrapCtx, 1*time.Second, true, o.internalHubConfigValidFunc); err != nil {
// TODO need run the bootstrap CSR forever to re-establish the client-cert if it is ever lost.
stopBootstrap()
return fmt.Errorf("failed to wait for hub client config for managed cluster to be ready: %w", err)
if err != context.Canceled {
return fmt.Errorf("failed to wait for hub client config for managed cluster to be ready: %w", err)
}
}

// stop the clientCertForHubController for bootstrap once the hub client config is ready
Expand Down
13 changes: 12 additions & 1 deletion pkg/singleton/spoke/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,20 @@ func (a *AgentConfig) RunSpokeAgent(ctx context.Context, controllerContext *cont
}()

// wait for the hub client config ready.
// PollUntilContextCancel periodically executes the condition func `o.internalHubConfigValidFunc`
// until one of the following conditions is met:
// - condition returns `true`: Indicates the hub client configuration
// is ready, and the polling stops successfully.
// - condition returns an error: This happens when loading the kubeconfig
// file fails or the kubeconfig is invalid. In such cases, the error is returned, causing the
// agent to exit with an error and triggering a new leader election.
// - The context is canceled: In this case, no error is returned. This ensures that
// the current leader can release leadership, allowing a new pod to get leadership quickly.
klog.Info("Waiting for hub client config and managed cluster to be ready")
if err := wait.PollUntilContextCancel(ctx, 1*time.Second, true, a.registrationConfig.IsHubKubeConfigValid); err != nil {
return err
if err != context.Canceled {
return err
}
}

// start work agent
Expand Down
4 changes: 2 additions & 2 deletions test/integration/operator/klusterlet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,7 @@ var _ = ginkgo.Describe("Klusterlet", func() {
gomega.Expect(len(actual.Spec.Template.Spec.Containers)).Should(gomega.Equal(1))
// klusterlet has no condition, replica is 0
gomega.Expect(actual.Status.Replicas).Should(gomega.Equal(int32(0)))
gomega.Expect(len(actual.Spec.Template.Spec.Containers[0].Args)).Should(gomega.Equal(9))
gomega.Expect(len(actual.Spec.Template.Spec.Containers[0].Args)).Should(gomega.Equal(8))
return actual.Spec.Template.Spec.Containers[0].Args[2] != "--spoke-cluster-name=cluster2"
}, eventuallyTimeout, eventuallyInterval).Should(gomega.BeTrue())

Expand All @@ -615,7 +615,7 @@ var _ = ginkgo.Describe("Klusterlet", func() {
return false
}
gomega.Expect(len(actual.Spec.Template.Spec.Containers)).Should(gomega.Equal(1))
gomega.Expect(len(actual.Spec.Template.Spec.Containers[0].Args)).Should(gomega.Equal(6))
gomega.Expect(len(actual.Spec.Template.Spec.Containers[0].Args)).Should(gomega.Equal(5))
return actual.Spec.Template.Spec.Containers[0].Args[2] == "--spoke-cluster-name=cluster2"
}, eventuallyTimeout, eventuallyInterval).Should(gomega.BeTrue())

Expand Down

0 comments on commit ed367fd

Please sign in to comment.