Skip to content

Commit

Permalink
fix: kube-ovn-controller cannot be ready when ENABLE_METRICS is false (#4886)
Browse files Browse the repository at this point in the history

* fix: kube-ovn-controller cannot be ready when ENABLE_METRICS is false
* ovn-monitor: refactor pprof to metrics

---------

Signed-off-by: zhaocongqi <[email protected]>
  • Loading branch information
zhaocongqi authored and zbb88888 committed Jan 21, 2025
1 parent 5abd85c commit cb39c63
Show file tree
Hide file tree
Showing 11 changed files with 128 additions and 45 deletions.
3 changes: 2 additions & 1 deletion charts/kube-ovn/templates/controller-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10660
- --tls={{- .Values.func.SECURE_SERVING }}
- --enable-metrics={{- .Values.networking.ENABLE_METRICS }}
periodSeconds: 3
timeoutSeconds: 5
livenessProbe:
Expand All @@ -181,6 +182,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10660
- --tls={{- .Values.func.SECURE_SERVING }}
- --enable-metrics={{- .Values.networking.ENABLE_METRICS }}
initialDelaySeconds: 300
periodSeconds: 7
failureThreshold: 5
Expand Down Expand Up @@ -208,4 +210,3 @@ spec:
secret:
optional: true
secretName: kube-ovn-tls

3 changes: 3 additions & 0 deletions charts/kube-ovn/templates/monitor-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ spec:
- --logtostderr=false
- --alsologtostderr=true
- --log_file_max_size=200
- --enable-metrics={{- .Values.networking.ENABLE_METRICS }}
securityContext:
runAsUser: 0
privileged: false
Expand Down Expand Up @@ -113,6 +114,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10661
- --tls={{- .Values.func.SECURE_SERVING }}
- --enable-metrics={{- .Values.networking.ENABLE_METRICS }}
timeoutSeconds: 5
readinessProbe:
failureThreshold: 3
Expand All @@ -124,6 +126,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10661
- --tls={{- .Values.func.SECURE_SERVING }}
- --enable-metrics={{- .Values.networking.ENABLE_METRICS }}
timeoutSeconds: 5
nodeSelector:
kubernetes.io/os: "linux"
Expand Down
2 changes: 2 additions & 0 deletions charts/kube-ovn/templates/ovncni-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10665
- --tls={{- .Values.func.SECURE_SERVING }}
- --enable-metrics={{- .Values.networking.ENABLE_METRICS }}
timeoutSeconds: 5
livenessProbe:
failureThreshold: 3
Expand All @@ -165,6 +166,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10665
- --tls={{- .Values.func.SECURE_SERVING }}
- --enable-metrics={{- .Values.networking.ENABLE_METRICS }}
timeoutSeconds: 5
resources:
requests:
Expand Down
37 changes: 29 additions & 8 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,36 @@ func CmdMain() {
}()
}

if !config.EnableMetrics {
return
}
metrics.InitKlogMetrics()
metrics.InitClientGoMetrics()
addr := util.JoinHostPort(metricsAddr, config.PprofPort)
if err := metrics.Run(ctx, config.KubeRestConfig, addr, config.SecureServing, servePprofInMetricsServer); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")
if config.EnableMetrics {
metrics.InitKlogMetrics()
metrics.InitClientGoMetrics()
addr := util.JoinHostPort(metricsAddr, config.PprofPort)
if err := metrics.Run(ctx, config.KubeRestConfig, addr, config.SecureServing, servePprofInMetricsServer); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")
}
} else {
klog.Info("metrics server is disabled")
listerner, err := net.ListenTCP("tcp", &net.TCPAddr{IP: net.ParseIP(metricsAddr), Port: int(config.PprofPort)})
if err != nil {
util.LogFatalAndExit(err, "failed to listen on %s", util.JoinHostPort(metricsAddr, config.PprofPort))
}
svr := manager.Server{
Name: "health-check",
Server: &http.Server{
Handler: http.NewServeMux(),
MaxHeaderBytes: 1 << 20,
IdleTimeout: 90 * time.Second,
ReadHeaderTimeout: 32 * time.Second,
},
Listener: listerner,
}
go func() {
if err = svr.Start(ctx); err != nil {
util.LogFatalAndExit(err, "failed to run health check server")
}
}()
}

<-ctx.Done()
}()

Expand Down
35 changes: 29 additions & 6 deletions cmd/daemon/cniserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ import (
func CmdMain() {
defer klog.Flush()

daemon.InitMetrics()
metrics.InitKlogMetrics()

config := daemon.ParseFlags()
klog.Info(versions.String())

Expand Down Expand Up @@ -143,10 +140,36 @@ func CmdMain() {
}()
}

listenAddr := util.JoinHostPort(addr, config.PprofPort)
if err = metrics.Run(ctx, nil, listenAddr, config.SecureServing, servePprofInMetricsServer); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")
if config.EnableMetrics {
daemon.InitMetrics()
metrics.InitKlogMetrics()
listenAddr := util.JoinHostPort(addr, config.PprofPort)
if err = metrics.Run(ctx, nil, listenAddr, config.SecureServing, servePprofInMetricsServer); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")
}
} else {
klog.Info("metrics server is disabled")
listerner, err := net.ListenTCP("tcp", &net.TCPAddr{IP: net.ParseIP(addr), Port: int(config.PprofPort)})
if err != nil {
util.LogFatalAndExit(err, "failed to listen on %s", util.JoinHostPort(addr, config.PprofPort))
}
svr := manager.Server{
Name: "health-check",
Server: &http.Server{
Handler: http.NewServeMux(),
MaxHeaderBytes: 1 << 20,
IdleTimeout: 90 * time.Second,
ReadHeaderTimeout: 32 * time.Second,
},
Listener: listerner,
}
go func() {
if err = svr.Start(ctx); err != nil {
util.LogFatalAndExit(err, "failed to run health check server")
}
}()
}

<-stopCh
}

Expand Down
3 changes: 2 additions & 1 deletion cmd/health_check/health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
func CmdMain() {
port := pflag.Int32("port", 0, "Target port")
tls := pflag.Bool("tls", false, "Dial the server with TLS")
enableMetrics := pflag.Bool("enable-metrics", true, "Whether to support metrics query")

klogFlags := flag.NewFlagSet("klog", flag.ExitOnError)
klog.InitFlags(klogFlags)
Expand Down Expand Up @@ -46,7 +47,7 @@ func CmdMain() {
}

addr := util.JoinHostPort(ip, *port)
if *tls {
if *enableMetrics && *tls {
addr = "tls://" + addr
} else {
addr = "tcp://" + addr
Expand Down
59 changes: 39 additions & 20 deletions cmd/ovn_monitor/ovn_monitor.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package ovn_monitor

import (
"os"
"strings"
"net"
"net/http"
"time"

"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/manager/signals"

"github.com/kubeovn/kube-ovn/pkg/metrics"
Expand All @@ -14,8 +16,6 @@ import (
"github.com/kubeovn/kube-ovn/versions"
)

const port = 10661

func CmdMain() {
defer klog.Flush()

Expand All @@ -26,24 +26,43 @@ func CmdMain() {
util.LogFatalAndExit(err, "failed to parse config")
}

addr := config.ListenAddress
if os.Getenv("ENABLE_BIND_LOCAL_IP") == "true" {
if ips := strings.Split(os.Getenv("POD_IPS"), ","); len(ips) == 1 {
addr = util.JoinHostPort(ips[0], port)
}
}

exporter := ovn.NewExporter(config)
if err = exporter.StartConnection(); err != nil {
klog.Errorf("%s failed to connect db socket properly: %s", ovn.GetExporterName(), err)
go exporter.TryClientConnection()
}
exporter.StartOvnMetrics()

ctrl.SetLogger(klog.NewKlogr())
ctx := signals.SetupSignalHandler()
if err = metrics.Run(ctx, nil, addr, config.SecureServing, false); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")

metricsAddr := util.GetDefaultListenAddr()
if config.EnableMetrics {
exporter := ovn.NewExporter(config)
if err = exporter.StartConnection(); err != nil {
klog.Errorf("%s failed to connect db socket properly: %s", ovn.GetExporterName(), err)
go exporter.TryClientConnection()
}
exporter.StartOvnMetrics()
addr := util.JoinHostPort(metricsAddr, config.MetricsPort)
if err = metrics.Run(ctx, nil, addr, config.SecureServing, false); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")
}
} else {
klog.Info("metrics server is disabled")
listerner, err := net.ListenTCP("tcp", &net.TCPAddr{IP: net.ParseIP(util.GetDefaultListenAddr()), Port: int(config.MetricsPort)})
if err != nil {
util.LogFatalAndExit(err, "failed to listen on %s", util.JoinHostPort(metricsAddr, config.MetricsPort))
}
svr := manager.Server{
Name: "health-check",
Server: &http.Server{
Handler: http.NewServeMux(),
MaxHeaderBytes: 1 << 20,
IdleTimeout: 90 * time.Second,
ReadHeaderTimeout: 32 * time.Second,
},
Listener: listerner,
}
go func() {
if err = svr.Start(ctx); err != nil {
util.LogFatalAndExit(err, "failed to run health check server")
}
}()
}

<-ctx.Done()
}
8 changes: 5 additions & 3 deletions cmd/speaker/speaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ func CmdMain() {
ctrl.SetLogger(klog.NewKlogr())
ctx := signals.SetupSignalHandler()
go func() {
metrics.InitKlogMetrics()
if err = metrics.Run(ctx, nil, util.JoinHostPort("0.0.0.0", config.PprofPort), false, false); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")
if config.EnableMetrics {
metrics.InitKlogMetrics()
if err = metrics.Run(ctx, nil, util.JoinHostPort("0.0.0.0", config.PprofPort), false, false); err != nil {
util.LogFatalAndExit(err, "failed to run metrics server")
}
}
<-ctx.Done()
}()
Expand Down
11 changes: 11 additions & 0 deletions dist/images/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ ENABLE_LB_SVC=${ENABLE_LB_SVC:-false}
ENABLE_NAT_GW=${ENABLE_NAT_GW:-true}
ENABLE_KEEP_VM_IP=${ENABLE_KEEP_VM_IP:-true}
ENABLE_ARP_DETECT_IP_CONFLICT=${ENABLE_ARP_DETECT_IP_CONFLICT:-true}
ENABLE_METRICS=${ENABLE_METRICS:-true}
# comma-separated string of nodelocal DNS ip addresses
NODE_LOCAL_DNS_IP=${NODE_LOCAL_DNS_IP:-}
ENABLE_IC=${ENABLE_IC:-$(kubectl get node --show-labels | grep -qw "ovn.kubernetes.io/ic-gw" && echo true || echo false)}
Expand Down Expand Up @@ -4182,6 +4183,7 @@ spec:
- --log_file_max_size=200
- --enable-lb-svc=$ENABLE_LB_SVC
- --keep-vm-ip=$ENABLE_KEEP_VM_IP
- --enable-metrics=$ENABLE_METRICS
- --node-local-dns-ip=$NODE_LOCAL_DNS_IP
- --secure-serving=${SECURE_SERVING}
- --ovsdb-con-timeout=$OVSDB_CON_TIMEOUT
Expand Down Expand Up @@ -4240,6 +4242,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10660
- --tls=${SECURE_SERVING}
- --enable-metrics=$ENABLE_METRICS
periodSeconds: 3
timeoutSeconds: 5
livenessProbe:
Expand All @@ -4248,6 +4251,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10660
- --tls=${SECURE_SERVING}
- --enable-metrics=$ENABLE_METRICS
initialDelaySeconds: 300
periodSeconds: 7
failureThreshold: 5
Expand Down Expand Up @@ -4342,6 +4346,7 @@ spec:
- --alsologtostderr=true
- --log_file=/var/log/kube-ovn/kube-ovn-cni.log
- --log_file_max_size=200
- --enable-metrics=$ENABLE_METRICS
- --kubelet-dir=$KUBELET_DIR
- --enable-tproxy=$ENABLE_TPROXY
- --ovs-vsctl-concurrency=$OVS_VSCTL_CONCURRENCY
Expand Down Expand Up @@ -4422,6 +4427,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10665
- --tls=${SECURE_SERVING}
- --enable-metrics=$ENABLE_METRICS
timeoutSeconds: 5
readinessProbe:
failureThreshold: 3
Expand All @@ -4432,6 +4438,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10665
- --tls=${SECURE_SERVING}
- --enable-metrics=$ENABLE_METRICS
timeoutSeconds: 5
resources:
requests:
Expand Down Expand Up @@ -4523,6 +4530,7 @@ spec:
- --alsologtostderr=true
- --log_file=/var/log/kube-ovn/kube-ovn-pinger.log
- --log_file_max_size=200
- --enable-metrics=$ENABLE_METRICS
imagePullPolicy: $IMAGE_PULL_POLICY
securityContext:
runAsUser: 0
Expand Down Expand Up @@ -4653,6 +4661,7 @@ spec:
- --logtostderr=false
- --alsologtostderr=true
- --log_file_max_size=200
- --enable-metrics=$ENABLE_METRICS
securityContext:
runAsUser: 0
privileged: false
Expand Down Expand Up @@ -4717,6 +4726,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10661
- --tls=${SECURE_SERVING}
- --enable-metrics=$ENABLE_METRICS
timeoutSeconds: 5
readinessProbe:
failureThreshold: 3
Expand All @@ -4728,6 +4738,7 @@ spec:
- /kube-ovn/kube-ovn-healthcheck
- --port=10661
- --tls=${SECURE_SERVING}
- --enable-metrics=$ENABLE_METRICS
timeoutSeconds: 5
nodeSelector:
kubernetes.io/os: "linux"
Expand Down
Loading

0 comments on commit cb39c63

Please sign in to comment.