From d235f0c25b751935c5e8afd89fe1fa7e9ce7a385 Mon Sep 17 00:00:00 2001 From: Manuel Buil Date: Fri, 9 Feb 2024 15:31:02 +0100 Subject: [PATCH] Improve the waits in calico-windows Signed-off-by: Manuel Buil --- pkg/windows/calico.go | 37 +++++++++++++++++++++++++------------ pkg/windows/flannel.go | 14 +++++++++----- pkg/windows/utils.go | 2 +- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/pkg/windows/calico.go b/pkg/windows/calico.go index b8f24dbd0e..4892320fef 100644 --- a/pkg/windows/calico.go +++ b/pkg/windows/calico.go @@ -116,7 +116,8 @@ users: ) type Calico struct { - CNICfg *CalicoConfig + CNICfg *CalicoConfig + KubeClient *kubernetes.Clientset } const ( @@ -161,7 +162,7 @@ func (c *Calico) initializeConfig(ctx context.Context, nodeConfig *daemonconfig. c.CNICfg = &CalicoConfig{ CNICommonConfig: CNICommonConfig{ - Name: "calico", + Name: "Calico", OverlayNetName: "Calico", OverlayEncap: "vxlan", Hostname: nodeConfig.AgentConfig.NodeName, @@ -185,7 +186,7 @@ func (c *Calico) initializeConfig(ctx context.Context, nodeConfig *daemonconfig. IPAutoDetectionMethod: "first-found", } - c.CNICfg.KubeConfig, err = c.createKubeConfig(ctx, restConfig) + c.CNICfg.KubeConfig, c.KubeClient, err = c.createKubeConfigAndClient(ctx, restConfig) if err != nil { return err } @@ -226,8 +227,8 @@ func (c *Calico) renderCalicoConfig(path string, toRender *template.Template) er return nil } -// createKubeConfig creates all needed for Calico to contact kube-api -func (c *Calico) createKubeConfig(ctx context.Context, restConfig *rest.Config) (*KubeConfig, error) { +// createKubeConfigAndClient creates all needed for Calico to contact kube-api +func (c *Calico) createKubeConfigAndClient(ctx context.Context, restConfig *rest.Config) (*KubeConfig, *kubernetes.Clientset, error) { // Fill all information except for the token calicoKubeConfig := KubeConfig{ @@ -247,30 +248,42 @@ func (c *Calico) createKubeConfig(ctx context.Context, restConfig *rest.Config) // Register the token in the Calico service account client, err := kubernetes.NewForConfig(restConfig) if err != nil { - return nil, err + return nil, nil, err } serviceAccounts := client.CoreV1().ServiceAccounts(CalicoSystemNamespace) token, err := serviceAccounts.CreateToken(ctx, calicoNode, &req, metav1.CreateOptions{}) if err != nil { - return nil, errors.Wrapf(err, "failed to create token for service account (%s/%s)", CalicoSystemNamespace, calicoNode) + return nil, nil, errors.Wrapf(err, "failed to create token for service account (%s/%s)", CalicoSystemNamespace, calicoNode) } calicoKubeConfig.Token = token.Status.Token - return &calicoKubeConfig, nil + return &calicoKubeConfig, client, nil } // Start starts the CNI services on the Windows node. func (c *Calico) Start(ctx context.Context) error { logPath := filepath.Join(c.CNICfg.ConfigPath, "logs") - for { + + // Wait for the node to be registered in the cluster + if err := wait.PollImmediateWithContext(ctx, 5*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) { + _, err := c.KubeClient.CoreV1().Nodes().Get(ctx, c.CNICfg.Hostname, metav1.GetOptions{}) + if err != nil { + logrus.WithError(err).Warningf("Calico can't start because it can't find node, retrying %s", c.CNICfg.Hostname) + return false, nil + } + + logrus.Infof("Node %s registered. Calico can start", c.CNICfg.Hostname) + if err := startCalico(ctx, c.CNICfg, logPath); err != nil { - time.Sleep(5 * time.Second) logrus.Errorf("Calico exited: %v. Retrying", err) - continue + return false, nil } - break + return true, nil + }); err != nil { + return err } + go startFelix(ctx, c.CNICfg, logPath) if c.CNICfg.OverlayEncap == "windows-bgp" { go startConfd(ctx, c.CNICfg, logPath) diff --git a/pkg/windows/flannel.go b/pkg/windows/flannel.go index 9bf1715d17..06b04357ea 100644 --- a/pkg/windows/flannel.go +++ b/pkg/windows/flannel.go @@ -216,7 +216,7 @@ func (f *Flannel) renderFlannelConfig(path string, toRender *template.Template) return nil } -// createKubeConfig creates all needed for Flannel to contact kube-api +// createKubeConfigAndClient creates all needed for Flannel to contact kube-api func (f *Flannel) createKubeConfigAndClient(ctx context.Context, restConfig *rest.Config) (*KubeConfig, *kubernetes.Clientset, error) { // Fill all information except for the token @@ -254,7 +254,7 @@ func (f *Flannel) Start(ctx context.Context) error { logPath := filepath.Join(f.CNICfg.ConfigPath, "logs", "flanneld.log") // Wait for the node to be registered in the cluster - wait.PollImmediateWithContext(ctx, 3*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) { + if err := wait.PollImmediateWithContext(ctx, 3*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) { _, err := f.KubeClient.CoreV1().Nodes().Get(ctx, f.CNICfg.Hostname, metav1.GetOptions{}) if err != nil { logrus.WithError(err).Warningf("Flanneld can't start because it can't find node, retrying %s", f.CNICfg.Hostname) @@ -263,7 +263,9 @@ func (f *Flannel) Start(ctx context.Context) error { logrus.Infof("Node %s registered. Flanneld can start", f.CNICfg.Hostname) return true, nil } - }) + }); err != nil { + return err + } go startFlannel(ctx, f.CNICfg, logPath) @@ -305,7 +307,7 @@ func (f *Flannel) ReserveSourceVip(ctx context.Context) (string, error) { var err error logrus.Info("Reserving an IP on flannel HNS network for kube-proxy source vip") - wait.PollImmediateWithContext(ctx, 10*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) { + if err := wait.PollImmediateWithContext(ctx, 10*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) { network, err = hcsshim.GetHNSNetworkByName(f.CNICfg.OverlayNetName) if err != nil || network == nil { logrus.Debugf("can't find flannel HNS network, retrying %s", f.CNICfg.OverlayNetName) @@ -322,7 +324,9 @@ func (f *Flannel) ReserveSourceVip(ctx context.Context) (string, error) { return true, nil } return false, nil - }) + }); err != nil { + return "", err + } subnet := network.Subnets[0].AddressPrefix diff --git a/pkg/windows/utils.go b/pkg/windows/utils.go index 90dd475146..772de9f3f1 100644 --- a/pkg/windows/utils.go +++ b/pkg/windows/utils.go @@ -126,7 +126,7 @@ func deleteAllNetworks() error { // HNS overlay networks restart the physical interface when they are deleted. Wait until it comes back before returning // TODO: Replace with non-deprecated PollUntilContextTimeout when our and Kubernetes code migrate to it - waitErr := wait.Poll(2*time.Second, 20*time.Second, func() (bool, error) { + waitErr := wait.Poll(2*time.Second, 30*time.Second, func() (bool, error) { for _, ip := range ips { logrus.Debugf("Calico is waiting for the interface with ip: %s to come back", ip) _, err := findInterface(ip)