Skip to content

Commit

Permalink
Improve the waits in calico-windows
Browse files Browse the repository at this point in the history
Signed-off-by: Manuel Buil <[email protected]>
  • Loading branch information
manuelbuil committed Feb 12, 2024
1 parent 06310a5 commit 6390104
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 18 deletions.
37 changes: 25 additions & 12 deletions pkg/windows/calico.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ users:
)

type Calico struct {
CNICfg *CalicoConfig
CNICfg *CalicoConfig
KubeClient *kubernetes.Clientset
}

const (
Expand Down Expand Up @@ -161,7 +162,7 @@ func (c *Calico) initializeConfig(ctx context.Context, nodeConfig *daemonconfig.

c.CNICfg = &CalicoConfig{
CNICommonConfig: CNICommonConfig{
Name: "calico",
Name: "Calico",
OverlayNetName: "Calico",
OverlayEncap: "vxlan",
Hostname: nodeConfig.AgentConfig.NodeName,
Expand All @@ -185,7 +186,7 @@ func (c *Calico) initializeConfig(ctx context.Context, nodeConfig *daemonconfig.
IPAutoDetectionMethod: "first-found",
}

c.CNICfg.KubeConfig, err = c.createKubeConfig(ctx, restConfig)
c.CNICfg.KubeConfig, c.KubeClient, err = c.createKubeConfigAndClient(ctx, restConfig)
if err != nil {
return err
}
Expand Down Expand Up @@ -226,8 +227,8 @@ func (c *Calico) renderCalicoConfig(path string, toRender *template.Template) er
return nil
}

// createKubeConfig creates all needed for Calico to contact kube-api
func (c *Calico) createKubeConfig(ctx context.Context, restConfig *rest.Config) (*KubeConfig, error) {
// createKubeConfigAndClient creates everything Calico needs to contact kube-api and returns it together with the Kubernetes client used to request the service account token
func (c *Calico) createKubeConfigAndClient(ctx context.Context, restConfig *rest.Config) (*KubeConfig, *kubernetes.Clientset, error) {

// Fill all information except for the token
calicoKubeConfig := KubeConfig{
Expand All @@ -247,30 +248,42 @@ func (c *Calico) createKubeConfig(ctx context.Context, restConfig *rest.Config)
// Register the token in the Calico service account
client, err := kubernetes.NewForConfig(restConfig)
if err != nil {
return nil, err
return nil, nil, err
}
serviceAccounts := client.CoreV1().ServiceAccounts(CalicoSystemNamespace)
token, err := serviceAccounts.CreateToken(ctx, calicoNode, &req, metav1.CreateOptions{})
if err != nil {
return nil, errors.Wrapf(err, "failed to create token for service account (%s/%s)", CalicoSystemNamespace, calicoNode)
return nil, nil, errors.Wrapf(err, "failed to create token for service account (%s/%s)", CalicoSystemNamespace, calicoNode)
}

calicoKubeConfig.Token = token.Status.Token

return &calicoKubeConfig, nil
return &calicoKubeConfig, client, nil
}

// Start starts the CNI services on the Windows node.
func (c *Calico) Start(ctx context.Context) error {
logPath := filepath.Join(c.CNICfg.ConfigPath, "logs")
for {

// Wait for the node to be registered in the cluster
if err := wait.PollImmediateWithContext(ctx, 5*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
_, err := c.KubeClient.CoreV1().Nodes().Get(ctx, c.CNICfg.Hostname, metav1.GetOptions{})
if err != nil {
logrus.WithError(err).Warningf("Calico can't start because it can't find node, retrying %s", c.CNICfg.Hostname)
return false, nil
}

logrus.Infof("Node %s registered. Calico can start", c.CNICfg.Hostname)

if err := startCalico(ctx, c.CNICfg, logPath); err != nil {
time.Sleep(5 * time.Second)
logrus.Errorf("Calico exited: %v. Retrying", err)
continue
return false, nil
}
break
return true, nil
}); err != nil {
return err
}

go startFelix(ctx, c.CNICfg, logPath)
if c.CNICfg.OverlayEncap == "windows-bgp" {
go startConfd(ctx, c.CNICfg, logPath)
Expand Down
14 changes: 9 additions & 5 deletions pkg/windows/flannel.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ func (f *Flannel) renderFlannelConfig(path string, toRender *template.Template)
return nil
}

// createKubeConfig creates all needed for Flannel to contact kube-api
// createKubeConfigAndClient creates everything Flannel needs to contact kube-api
func (f *Flannel) createKubeConfigAndClient(ctx context.Context, restConfig *rest.Config) (*KubeConfig, *kubernetes.Clientset, error) {

// Fill all information except for the token
Expand Down Expand Up @@ -254,7 +254,7 @@ func (f *Flannel) Start(ctx context.Context) error {
logPath := filepath.Join(f.CNICfg.ConfigPath, "logs", "flanneld.log")

// Wait for the node to be registered in the cluster
wait.PollImmediateWithContext(ctx, 3*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
if err := wait.PollImmediateWithContext(ctx, 3*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
_, err := f.KubeClient.CoreV1().Nodes().Get(ctx, f.CNICfg.Hostname, metav1.GetOptions{})
if err != nil {
logrus.WithError(err).Warningf("Flanneld can't start because it can't find node, retrying %s", f.CNICfg.Hostname)
Expand All @@ -263,7 +263,9 @@ func (f *Flannel) Start(ctx context.Context) error {
logrus.Infof("Node %s registered. Flanneld can start", f.CNICfg.Hostname)
return true, nil
}
})
}); err != nil {
return err
}

go startFlannel(ctx, f.CNICfg, logPath)

Expand Down Expand Up @@ -305,7 +307,7 @@ func (f *Flannel) ReserveSourceVip(ctx context.Context) (string, error) {
var err error

logrus.Info("Reserving an IP on flannel HNS network for kube-proxy source vip")
wait.PollImmediateWithContext(ctx, 10*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
if err := wait.PollImmediateWithContext(ctx, 10*time.Second, 5*time.Minute, func(ctx context.Context) (bool, error) {
network, err = hcsshim.GetHNSNetworkByName(f.CNICfg.OverlayNetName)
if err != nil || network == nil {
logrus.Debugf("can't find flannel HNS network, retrying %s", f.CNICfg.OverlayNetName)
Expand All @@ -322,7 +324,9 @@ func (f *Flannel) ReserveSourceVip(ctx context.Context) (string, error) {
return true, nil
}
return false, nil
})
}); err != nil {
return "", err
}

subnet := network.Subnets[0].AddressPrefix

Expand Down
2 changes: 1 addition & 1 deletion pkg/windows/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func deleteAllNetworks() error {

// HNS overlay networks restart the physical interface when they are deleted. Wait until it comes back before returning
// TODO: Replace with the non-deprecated PollUntilContextTimeout once both our code and the Kubernetes code have migrated to it
waitErr := wait.Poll(2*time.Second, 20*time.Second, func() (bool, error) {
waitErr := wait.Poll(2*time.Second, 30*time.Second, func() (bool, error) {
for _, ip := range ips {
logrus.Debugf("Calico is waiting for the interface with ip: %s to come back", ip)
_, err := findInterface(ip)
Expand Down

0 comments on commit 6390104

Please sign in to comment.