Skip to content

Commit

Permalink
fix gateway node check for centralized ecmp subnets (#4847)
Browse files Browse the repository at this point in the history
Signed-off-by: zhangzujian <[email protected]>
  • Loading branch information
zhangzujian committed Dec 18, 2024
1 parent cdc639d commit 19683fa
Show file tree
Hide file tree
Showing 11 changed files with 126 additions and 34 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ dist/images/test-server
dist/images/kube-ovn
dist/images/kube-ovn-cmd
dist/images/kube-ovn-daemon
dist/images/kube-ovn-controller
dist/images/kube-ovn-pinger
dist/images/kube-ovn-webhook
dist/windows/kube-ovn.exe
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ build-go:
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -o $(CURDIR)/dist/images/kube-ovn -v ./cmd/cni
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-cmd -v ./cmd
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-daemon -v ./cmd/daemon
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-pinger -v ./cmd/pinger
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-controller -v ./cmd/controller
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build $(GO_BUILD_FLAGS) -o $(CURDIR)/dist/images/test-server -v ./test/server

.PHONY: build-go-windows
Expand All @@ -129,7 +129,7 @@ build-go-arm:
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -o $(CURDIR)/dist/images/kube-ovn -v ./cmd/cni
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-cmd -v ./cmd
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-daemon -v ./cmd/daemon
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-pinger -v ./cmd/pinger
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build $(GO_BUILD_FLAGS) -buildmode=pie -o $(CURDIR)/dist/images/kube-ovn-controller -v ./cmd/controller

.PHONY: build-kube-ovn
build-kube-ovn: build-debug build-go
Expand Down
1 change: 1 addition & 0 deletions charts/kube-ovn/templates/controller-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ spec:
capabilities:
add:
- NET_BIND_SERVICE
- NET_RAW
env:
- name: ENABLE_SSL
value: "{{ .Values.networking.ENABLE_SSL }}"
Expand Down
5 changes: 0 additions & 5 deletions cmd/cmdmain.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (

"k8s.io/klog/v2"

"github.com/kubeovn/kube-ovn/cmd/controller"
"github.com/kubeovn/kube-ovn/cmd/health_check"
"github.com/kubeovn/kube-ovn/cmd/ovn_ic_controller"
"github.com/kubeovn/kube-ovn/cmd/ovn_leader_checker"
Expand All @@ -22,7 +21,6 @@ import (
)

const (
CmdController = "kube-ovn-controller"
CmdMonitor = "kube-ovn-monitor"
CmdSpeaker = "kube-ovn-speaker"
CmdWebhook = "kube-ovn-webhook"
Expand Down Expand Up @@ -91,9 +89,6 @@ func dumpProfile() {
func main() {
cmd := filepath.Base(os.Args[0])
switch cmd {
case CmdController:
dumpProfile()
controller.CmdMain()
case CmdMonitor:
dumpProfile()
ovn_monitor.CmdMain()
Expand Down
92 changes: 92 additions & 0 deletions cmd/controller/cmdmain.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package main

import (
"fmt"
"os"
"os/signal"
"path/filepath"
"runtime/pprof"
"syscall"
"time"

"k8s.io/klog/v2"

"github.com/kubeovn/kube-ovn/cmd/pinger"
"github.com/kubeovn/kube-ovn/pkg/util"
)

// Names this binary may be invoked as; main compares them with the base name
// of os.Args[0] to choose which command to run.
const (
CmdController = "kube-ovn-controller"
CmdPinger = "kube-ovn-pinger"
)

// timeFormat is the time layout embedded in the names of the profile files
// written by dumpProfile.
const timeFormat = "2006-01-02_15:04:05"

func dumpProfile() {
ch1 := make(chan os.Signal, 1)
ch2 := make(chan os.Signal, 1)
signal.Notify(ch1, syscall.SIGUSR1)
signal.Notify(ch2, syscall.SIGUSR2)
go func() {
for {
<-ch1
name := fmt.Sprintf("cpu-profile-%s.pprof", time.Now().Format(timeFormat))
path := filepath.Join(os.TempDir(), name)
f, err := os.Create(path) // #nosec G303,G304
if err != nil {
klog.Errorf("failed to create cpu profile file: %v", err)
return
}
if err = pprof.StartCPUProfile(f); err != nil {
klog.Errorf("failed to start cpu profile: %v", err)
if err = f.Close(); err != nil {
klog.Errorf("failed to close file %q: %v", path, err)
}
return
}
time.Sleep(30 * time.Second)
pprof.StopCPUProfile()
if err = f.Close(); err != nil {
klog.Errorf("failed to close file %q: %v", path, err)
return
}
}
}()
go func() {
for {
<-ch2
name := fmt.Sprintf("mem-profile-%s.pprof", time.Now().Format(timeFormat))
path := filepath.Join(os.TempDir(), name)
f, err := os.Create(path) // #nosec G303,G304
if err != nil {
klog.Errorf("failed to create memory profile file: %v", err)
return
}
if err = pprof.WriteHeapProfile(f); err != nil {
klog.Errorf("failed to write memory profile file: %v", err)
if err = f.Close(); err != nil {
klog.Errorf("failed to close file %q: %v", path, err)
}
return
}
if err = f.Close(); err != nil {
klog.Errorf("failed to close file %q: %v", path, err)
return
}
}
}()
}

// main dispatches on the base name of the executable (os.Args[0]): the same
// binary serves as both kube-ovn-controller and kube-ovn-pinger. Profile dump
// handlers are installed before the selected command starts; an unrecognized
// name is reported through util.LogFatalAndExit.
func main() {
	invokedAs := filepath.Base(os.Args[0])

	// Resolve the entry point first so the shared setup is written only once.
	var entry func()
	switch invokedAs {
	case CmdController:
		entry = CmdMain
	case CmdPinger:
		entry = pinger.CmdMain
	}

	if entry == nil {
		util.LogFatalAndExit(nil, "%s is an unknown command", invokedAs)
		return
	}

	dumpProfile()
	entry()
}
2 changes: 1 addition & 1 deletion cmd/controller/controller.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package controller
package main

import (
"context"
Expand Down
4 changes: 2 additions & 2 deletions cmd/pinger/pinger.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package main
package pinger

import (
_ "net/http/pprof" // #nosec
Expand All @@ -14,7 +14,7 @@ import (
"github.com/kubeovn/kube-ovn/versions"
)

func main() {
func CmdMain() {
defer klog.Flush()

klog.Info(versions.String())
Expand Down
8 changes: 4 additions & 4 deletions dist/images/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ COPY 01-kube-ovn.conflist /kube-ovn/01-kube-ovn.conflist
COPY kube-ovn /kube-ovn/kube-ovn
COPY kube-ovn-cmd /kube-ovn/kube-ovn-cmd
COPY kube-ovn-daemon /kube-ovn/kube-ovn-daemon
COPY kube-ovn-pinger /kube-ovn/kube-ovn-pinger
RUN ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-controller && \
ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-monitor && \
COPY kube-ovn-controller /kube-ovn/kube-ovn-controller
RUN ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-monitor && \
ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-speaker && \
ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-webhook && \
ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-healthcheck && \
ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-leader-checker && \
ln -s /kube-ovn/kube-ovn-cmd /kube-ovn/kube-ovn-ic-controller && \
ln -s /kube-ovn/kube-ovn-controller /kube-ovn/kube-ovn-pinger && \
setcap CAP_NET_BIND_SERVICE+eip /kube-ovn/kube-ovn-cmd && \
setcap CAP_NET_RAW,CAP_NET_BIND_SERVICE+eip /kube-ovn/kube-ovn-pinger && \
setcap CAP_NET_RAW,CAP_NET_BIND_SERVICE+eip /kube-ovn/kube-ovn-controller && \
setcap CAP_NET_ADMIN,CAP_NET_RAW,CAP_NET_BIND_SERVICE,CAP_SYS_ADMIN+eip /kube-ovn/kube-ovn-daemon

FROM kubeovn/kube-ovn-base:$BASE_TAG
Expand Down
1 change: 1 addition & 0 deletions dist/images/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4335,6 +4335,7 @@ spec:
capabilities:
add:
- NET_BIND_SERVICE
- NET_RAW
env:
- name: ENABLE_SSL
value: "$ENABLE_SSL"
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1182,7 +1182,7 @@ func (c *Controller) startWorkers(ctx context.Context) {

go wait.Until(c.resyncProviderNetworkStatus, 30*time.Second, ctx.Done())
go wait.Until(c.exportSubnetMetrics, 30*time.Second, ctx.Done())
go wait.Until(c.CheckGatewayReady, 5*time.Second, ctx.Done())
go wait.Until(c.checkSubnetGateway, 5*time.Second, ctx.Done())

go wait.Until(runWorker("add ovn eip", c.addOvnEipQueue, c.handleAddOvnEip), time.Second, ctx.Done())
go wait.Until(runWorker("update ovn eip", c.updateOvnEipQueue, c.handleUpdateOvnEip), time.Second, ctx.Done())
Expand Down
40 changes: 21 additions & 19 deletions pkg/controller/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -540,27 +540,27 @@ func (c *Controller) handleUpdateNode(key string) error {
return nil
}

func (c *Controller) CheckGatewayReady() {
if err := c.checkGatewayReady(); err != nil {
klog.Errorf("failed to check gateway ready %v", err)
func (c *Controller) checkSubnetGateway() {
if err := c.checkSubnetGatewayNode(); err != nil {
klog.Errorf("failed to check subnet gateway node: %v", err)
}
}

func (c *Controller) checkGatewayReady() error {
klog.V(3).Infoln("start to check gateway status")
func (c *Controller) checkSubnetGatewayNode() error {
klog.V(3).Infoln("start to check subnet gateway node")
subnetList, err := c.subnetsLister.List(labels.Everything())
if err != nil {
klog.Errorf("failed to list subnets %v", err)
klog.Errorf("failed to list subnets: %v", err)
return err
}
nodes, err := c.nodesLister.List(labels.Everything())
if err != nil {
klog.Errorf("failed to list nodes, %v", err)
klog.Errorf("failed to list nodes: %v", err)
return err
}

for _, subnet := range subnetList {
if (subnet.Spec.Vlan != "" && !subnet.Spec.LogicalGateway) ||
if (subnet.Spec.Vlan != "" && (subnet.Spec.U2OInterconnection || !subnet.Spec.LogicalGateway)) ||
subnet.Spec.GatewayNode == "" ||
subnet.Spec.GatewayType != kubeovnv1.GWCentralizedType ||
!subnet.Spec.EnableEcmp {
Expand Down Expand Up @@ -598,24 +598,26 @@ func (c *Controller) checkGatewayReady() error {
pinger.Timeout = time.Duration(count) * time.Second
pinger.Interval = 1 * time.Second

success := false

var pingSucceeded bool
pinger.OnRecv = func(_ *goping.Packet) {
success = true
pingSucceeded = true
pinger.Stop()
}
if err = pinger.Run(); err != nil {
klog.Errorf("failed to run pinger for destination %s: %v", ip, err)
return err
}

if !nodeReady(node) {
success = false
}

if !success {
nodeIsReady := nodeReady(node)
if !pingSucceeded || !nodeIsReady {
if exist {
klog.Warningf("failed to ping ovn0 %s or node %s is not ready, delete ecmp policy route for node", ip, node.Name)
if !pingSucceeded {
klog.Warningf("failed to ping ovn0 ip %s on node %s", ip, node.Name)
}
if !nodeIsReady {
klog.Warningf("node %s is not ready", node.Name)
}
klog.Warningf("delete ecmp policy route for node %s ip %s", node.Name, ip)
nextHops.Remove(ip)
delete(nameIPMap, node.Name)
klog.Infof("update policy route for centralized subnet %s, nextHops %s", subnet.Name, nextHops)
Expand All @@ -625,7 +627,7 @@ func (c *Controller) checkGatewayReady() error {
}
}
} else {
klog.V(3).Infof("succeed to ping gw %s", ip)
klog.V(3).Infof("succeeded to ping ovn0 ip %s on node %s", ip, node.Name)
if !exist {
nextHops.Add(ip)
if nameIPMap == nil {
Expand All @@ -640,7 +642,7 @@ func (c *Controller) checkGatewayReady() error {
}
}
} else if exist {
klog.Infof("subnet %s gatewayNode does not contains node %v, delete policy route for node ip %s", subnet.Name, node.Name, ip)
klog.Infof("subnet %s gateway nodes does not contain node %s, delete policy route for node ip %s", subnet.Name, node.Name, ip)
nextHops.Remove(ip)
delete(nameIPMap, node.Name)
klog.Infof("update policy route for centralized subnet %s, nextHops %s", subnet.Name, nextHops)
Expand Down

0 comments on commit 19683fa

Please sign in to comment.