From c906bf64ee78a390e76eb06834d5ffa7d7e3db5d Mon Sep 17 00:00:00 2001 From: Manuel Buil Date: Tue, 30 Jan 2024 15:36:48 +0100 Subject: [PATCH] Avoid race condition when deleting HNS networks Signed-off-by: Manuel Buil --- pkg/windows/calico.go | 2 +- pkg/windows/utils.go | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pkg/windows/calico.go b/pkg/windows/calico.go index 579b9aa4a0..4e7f3384d0 100644 --- a/pkg/windows/calico.go +++ b/pkg/windows/calico.go @@ -295,7 +295,7 @@ func (c *Calico) Start(ctx context.Context) error { // generateCalicoNetworks creates the overlay networks for internode networking func (c *Calico) generateCalicoNetworks() error { if err := deleteAllNetworks(); err != nil { - return err + return errors.Wrapf(err, "failed to delete all networks before bootstrapping calico") } // There are four ways to select the vxlan interface. In order of priority: diff --git a/pkg/windows/utils.go b/pkg/windows/utils.go index cf021709c4..6a6ed2e3ea 100644 --- a/pkg/windows/utils.go +++ b/pkg/windows/utils.go @@ -19,6 +19,7 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" opv1 "github.com/tigera/operator/api/v1" + "k8s.io/apimachinery/pkg/util/wait" ) // createHnsNetwork creates the network that will connect nodes and returns its managementIP @@ -101,8 +102,12 @@ func deleteAllNetworks() error { return err } + var ips []string + for _, network := range networks { if network.Name != "nat" { + logrus.Debugf("Deleting network: %s before starting calico", network.Name) + ips = append(ips, network.ManagementIP) _, err = network.Delete() if err != nil { return err @@ -110,6 +115,23 @@ func deleteAllNetworks() error { } } + // HNS overlay networks restart the physical interface when they are deleted. Wait until it comes back before returning + // TODO: Replace with non-deprecated PollUntilContextTimeout when our and Kubernetes code migrate to it + waitErr := wait.Poll(2*time.Second, 20*time.Second, func() (bool, error) { + for _, ip := range ips { + logrus.Debugf("Calico is waiting for the interface with ip: %s to come back", ip) + _, err := findInterface(ip) + if err != nil { + return false, nil + } + } + return true, nil + }) + + if waitErr == wait.ErrWaitTimeout { + return fmt.Errorf("timed out waiting for the network interfaces to come back") + } + return nil }