From 25914ec06edfa3f5d8fbbc13c45ee870b575a760 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 23 Oct 2024 14:27:29 +0300 Subject: [PATCH 1/3] nfd-worker: drop the gRPC health port To be replaced with plain http. --- cmd/nfd-worker/main.go | 2 - .../worker-daemonset/worker-daemonset.yaml | 4 -- .../templates/worker.yaml | 7 --- .../helm/node-feature-discovery/values.yaml | 5 -- docs/deployment/helm.md | 1 - pkg/nfd-worker/nfd-worker.go | 58 +++---------------- 6 files changed, 8 insertions(+), 69 deletions(-) diff --git a/cmd/nfd-worker/main.go b/cmd/nfd-worker/main.go index c92bfab1fe..afc68c5b4e 100644 --- a/cmd/nfd-worker/main.go +++ b/cmd/nfd-worker/main.go @@ -110,8 +110,6 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs) "Do not publish feature labels") flagset.IntVar(&args.MetricsPort, "metrics", 8081, "Port on which to expose metrics.") - flagset.IntVar(&args.GrpcHealthPort, "grpc-health", 8082, - "Port on which to expose the grpc health endpoint.") flagset.StringVar(&args.Options, "options", "", "Specify config options from command line. Config options are specified "+ "in the same format as in the config file (i.e. json or yaml). These options") diff --git a/deployment/base/worker-daemonset/worker-daemonset.yaml b/deployment/base/worker-daemonset/worker-daemonset.yaml index f70a64388c..300953fb02 100644 --- a/deployment/base/worker-daemonset/worker-daemonset.yaml +++ b/deployment/base/worker-daemonset/worker-daemonset.yaml @@ -20,13 +20,9 @@ spec: image: gcr.io/k8s-staging-nfd/node-feature-discovery:master imagePullPolicy: Always livenessProbe: - grpc: - port: 8082 initialDelaySeconds: 10 periodSeconds: 10 readinessProbe: - grpc: - port: 8082 initialDelaySeconds: 5 periodSeconds: 10 failureThreshold: 10 diff --git a/deployment/helm/node-feature-discovery/templates/worker.yaml b/deployment/helm/node-feature-discovery/templates/worker.yaml index d2d1ce4b13..ec459ac116 100644 --- a/deployment/helm/node-feature-discovery/templates/worker.yaml +++ b/deployment/helm/node-feature-discovery/templates/worker.yaml @@ -47,8 +47,6 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} livenessProbe: - grpc: - port: {{ .Values.worker.healthPort | default "8082" }} {{- with .Values.worker.livenessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . }} {{- end }} @@ -62,8 +60,6 @@ spec: timeoutSeconds: {{ . }} {{- end }} readinessProbe: - grpc: - port: {{ .Values.worker.healthPort | default "8082" }} {{- with .Values.worker.readinessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . }} {{- end }} @@ -105,15 +101,12 @@ spec: - "-feature-gates={{ $key }}={{ $value }}" {{- end }} - "-metrics={{ .Values.worker.metricsPort | default "8081"}}" - - "-grpc-health={{ .Values.worker.healthPort | default "8082" }}" {{- with .Values.worker.extraArgs }} {{- toYaml . | nindent 8 }} {{- end }} ports: - containerPort: {{ .Values.worker.metricsPort | default "8081"}} name: metrics - - containerPort: {{ .Values.worker.healthPort | default "8082" }} - name: health volumeMounts: - name: host-boot mountPath: "/host-boot" diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index a0ba0d12d2..cb5f7e80bc 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -419,7 +419,6 @@ worker: ### metricsPort: 8081 - healthPort: 8082 daemonsetAnnotations: {} podSecurityContext: {} # fsGroup: 2000 @@ -433,15 +432,11 @@ worker: # runAsUser: 1000 livenessProbe: - grpc: - port: 8082 initialDelaySeconds: 10 # failureThreshold: 3 # periodSeconds: 10 # timeoutSeconds: 1 readinessProbe: - grpc: - port: 8082 initialDelaySeconds: 5 failureThreshold: 10 # periodSeconds: 10 diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index be083d4e82..4d47dcc5ef 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -234,7 +234,6 @@ API's you need to install the prometheus operator in your cluster. | `worker.enable` | bool | true | Specifies whether nfd-worker should be deployed | | `worker.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace | | `worker.metricsPort` | int | 8081 | Port on which to expose metrics from components to prometheus operator. **DEPRECATED**: will be replaced by `worker.port` in NFD v0.18. | -| `worker.healthPort` | int | 8082 | Port on which to expose the grpc health endpoint, will be also used for the probes. **DEPRECATED**: will be replaced by `worker.port` in NFD v0.18. | | `worker.config` | dict | | NFD worker [configuration](../reference/worker-configuration-reference) | | `worker.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settins | | `worker.securityContext` | dict | {} | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container) | diff --git a/pkg/nfd-worker/nfd-worker.go b/pkg/nfd-worker/nfd-worker.go index e709c6d6e4..119883fe26 100644 --- a/pkg/nfd-worker/nfd-worker.go +++ b/pkg/nfd-worker/nfd-worker.go @@ -19,7 +19,6 @@ package nfdworker import ( "encoding/json" "fmt" - "net" "os" "path/filepath" "regexp" @@ -29,9 +28,6 @@ import ( "golang.org/x/exp/maps" "golang.org/x/net/context" - "google.golang.org/grpc" - "google.golang.org/grpc/health" - "google.golang.org/grpc/health/grpc_health_v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/validation" @@ -93,14 +89,13 @@ type Labels map[string]string // Args are the command line arguments of NfdWorker. type Args struct { - ConfigFile string - Klog map[string]*utils.KlogFlagVal - Kubeconfig string - Oneshot bool - Options string - MetricsPort int - GrpcHealthPort int - NoOwnerRefs bool + ConfigFile string + Klog map[string]*utils.KlogFlagVal + Kubeconfig string + Oneshot bool + Options string + MetricsPort int + NoOwnerRefs bool Overrides ConfigOverrideArgs } @@ -118,7 +113,6 @@ type nfdWorker struct { configFilePath string config *NFDConfig kubernetesNamespace string - healthServer *grpc.Server k8sClient k8sclient.Interface nfdClient nfdclient.Interface stop chan struct{} // channel for signaling stop @@ -217,29 +211,6 @@ func (i *infiniteTicker) Reset(d time.Duration) { } } -func (w *nfdWorker) startGrpcHealthServer(errChan chan<- error) error { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", w.args.GrpcHealthPort)) - if err != nil { - return fmt.Errorf("failed to listen: %w", err) - } - - s := grpc.NewServer() - grpc_health_v1.RegisterHealthServer(s, health.NewServer()) - klog.InfoS("gRPC health server serving", "port", w.args.GrpcHealthPort) - - go func() { - defer func() { - lis.Close() - }() - if err := s.Serve(lis); err != nil { - errChan <- fmt.Errorf("gRPC health server exited with an error: %w", err) - } - klog.InfoS("gRPC health server stopped") - }() - w.healthServer = s - return nil -} - // Run feature discovery. func (w *nfdWorker) runFeatureDiscovery() error { discoveryStart := time.Now() @@ -344,20 +315,10 @@ func (w *nfdWorker) Run() error { return nil } - grpcErr := make(chan error) - - // Start gRPC server for liveness probe (at this point we're "live") - if w.args.GrpcHealthPort != 0 { - if err := w.startGrpcHealthServer(grpcErr); err != nil { - return fmt.Errorf("failed to start gRPC health server: %w", err) - } - } + // Start readiness probe (at this point we're "ready and live") for { select { - case err := <-grpcErr: - return fmt.Errorf("error in serving gRPC: %w", err) - case <-labelTrigger.C: err = w.runFeatureDiscovery() if err != nil { @@ -366,9 +327,6 @@ func (w *nfdWorker) Run() error { case <-w.stop: klog.InfoS("shutting down nfd-worker") - if w.healthServer != nil { - w.healthServer.GracefulStop() - } return nil } } From 4959a13a07be234f127fce2f5fa33a8050304225 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 23 Oct 2024 15:08:32 +0300 Subject: [PATCH 2/3] nfd-worker: replace --metrics with --port Use a single port for serving http. In addition to metrics we will have the healthz endpoint. --- cmd/nfd-worker/main.go | 4 +-- .../worker-daemonset/worker-daemonset.yaml | 4 +-- .../templates/worker.yaml | 6 ++--- .../helm/node-feature-discovery/values.yaml | 2 +- docs/deployment/helm.md | 2 +- pkg/nfd-worker/nfd-worker.go | 27 ++++++++++++------- 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/cmd/nfd-worker/main.go b/cmd/nfd-worker/main.go index afc68c5b4e..120db537ab 100644 --- a/cmd/nfd-worker/main.go +++ b/cmd/nfd-worker/main.go @@ -108,8 +108,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs) "Kubeconfig to use") flagset.BoolVar(&args.Oneshot, "oneshot", false, "Do not publish feature labels") - flagset.IntVar(&args.MetricsPort, "metrics", 8081, - "Port on which to expose metrics.") + flagset.IntVar(&args.Port, "port", 8080, + "Port on which to metrics and healthz endpoints are served") flagset.StringVar(&args.Options, "options", "", "Specify config options from command line. Config options are specified "+ "in the same format as in the config file (i.e. json or yaml). These options") diff --git a/deployment/base/worker-daemonset/worker-daemonset.yaml b/deployment/base/worker-daemonset/worker-daemonset.yaml index 300953fb02..ec340737b0 100644 --- a/deployment/base/worker-daemonset/worker-daemonset.yaml +++ b/deployment/base/worker-daemonset/worker-daemonset.yaml @@ -36,5 +36,5 @@ spec: cpu: 5m memory: 64Mi ports: - - name: metrics - containerPort: 8081 + - name: http + containerPort: 8080 diff --git a/deployment/helm/node-feature-discovery/templates/worker.yaml b/deployment/helm/node-feature-discovery/templates/worker.yaml index ec459ac116..11ccecf4d1 100644 --- a/deployment/helm/node-feature-discovery/templates/worker.yaml +++ b/deployment/helm/node-feature-discovery/templates/worker.yaml @@ -100,13 +100,13 @@ spec: {{- range $key, $value := .Values.featureGates }} - "-feature-gates={{ $key }}={{ $value }}" {{- end }} - - "-metrics={{ .Values.worker.metricsPort | default "8081"}}" + - "-port={{ .Values.worker.port | default "8080"}}" {{- with .Values.worker.extraArgs }} {{- toYaml . | nindent 8 }} {{- end }} ports: - - containerPort: {{ .Values.worker.metricsPort | default "8081"}} - name: metrics + - containerPort: {{ .Values.worker.port | default "8080"}} + name: http volumeMounts: - name: host-boot mountPath: "/host-boot" diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index cb5f7e80bc..0750b38bf6 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -418,7 +418,7 @@ worker: # matchName: {op: In, value: ["SWAP", "X86", "ARM"]} ### - metricsPort: 8081 + port: 8080 daemonsetAnnotations: {} podSecurityContext: {} # fsGroup: 2000 diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index 4d47dcc5ef..4d9fa9d2a0 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -233,7 +233,7 @@ API's you need to install the prometheus operator in your cluster. | `worker.*` | dict | | NFD worker daemonset configuration | | `worker.enable` | bool | true | Specifies whether nfd-worker should be deployed | | `worker.hostNetwork` | bool | false | Specifies whether to enable or disable running the container in the host's network namespace | -| `worker.metricsPort` | int | 8081 | Port on which to expose metrics from components to prometheus operator. **DEPRECATED**: will be replaced by `worker.port` in NFD v0.18. | +| `worker.port` | int | 8080 | Port on which to serve http for metrics and healthz endpoints. | | `worker.config` | dict | | NFD worker [configuration](../reference/worker-configuration-reference) | | `worker.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settins | | `worker.securityContext` | dict | {} | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container) | diff --git a/pkg/nfd-worker/nfd-worker.go b/pkg/nfd-worker/nfd-worker.go index 119883fe26..b275e2dd3e 100644 --- a/pkg/nfd-worker/nfd-worker.go +++ b/pkg/nfd-worker/nfd-worker.go @@ -19,6 +19,7 @@ package nfdworker import ( "encoding/json" "fmt" + "net/http" "os" "path/filepath" "regexp" @@ -26,6 +27,8 @@ import ( "strings" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "golang.org/x/exp/maps" "golang.org/x/net/context" "k8s.io/apimachinery/pkg/api/errors" @@ -94,7 +97,7 @@ type Args struct { Kubeconfig string Oneshot bool Options string - MetricsPort int + Port int NoOwnerRefs bool Overrides ConfigOverrideArgs @@ -295,15 +298,13 @@ func (w *nfdWorker) Run() error { labelTrigger.Reset(w.config.Core.SleepInterval.Duration) defer labelTrigger.Stop() + httpMux := http.NewServeMux() + // Register to metrics server - if w.args.MetricsPort > 0 { - m := utils.CreateMetricsServer(w.args.MetricsPort, - buildInfo, - featureDiscoveryDuration) - go m.Run() - registerVersion(version.Get()) - defer m.Stop() - } + promRegistry := prometheus.NewRegistry() + promRegistry.MustRegister(buildInfo, featureDiscoveryDuration) + httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{})) + registerVersion(version.Get()) err = w.runFeatureDiscovery() if err != nil { @@ -317,6 +318,14 @@ func (w *nfdWorker) Run() error { // Start readiness probe (at this point we're "ready and live") + // Start HTTP server + httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux} + go func() { + klog.InfoS("http server starting", "port", httpServer.Addr) + klog.InfoS("http server stopped", "exitCode", httpServer.ListenAndServe()) + }() + defer httpServer.Close() + for { select { case <-labelTrigger.C: From 850c544522b507fed4a51746ea63d96abc8144aa Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 23 Oct 2024 15:19:53 +0300 Subject: [PATCH 3/3] nfd-worker: add healthz endpoint --- deployment/base/worker-daemonset/worker-daemonset.yaml | 6 ++++++ .../helm/node-feature-discovery/templates/worker.yaml | 6 ++++++ pkg/nfd-worker/nfd-worker.go | 7 ++++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/deployment/base/worker-daemonset/worker-daemonset.yaml b/deployment/base/worker-daemonset/worker-daemonset.yaml index ec340737b0..dd429af513 100644 --- a/deployment/base/worker-daemonset/worker-daemonset.yaml +++ b/deployment/base/worker-daemonset/worker-daemonset.yaml @@ -20,9 +20,15 @@ spec: image: gcr.io/k8s-staging-nfd/node-feature-discovery:master imagePullPolicy: Always livenessProbe: + httpGet: + path: /healthz + port: http initialDelaySeconds: 10 periodSeconds: 10 readinessProbe: + httpGet: + path: /healthz + port: http initialDelaySeconds: 5 periodSeconds: 10 failureThreshold: 10 diff --git a/deployment/helm/node-feature-discovery/templates/worker.yaml b/deployment/helm/node-feature-discovery/templates/worker.yaml index 11ccecf4d1..b8399f0981 100644 --- a/deployment/helm/node-feature-discovery/templates/worker.yaml +++ b/deployment/helm/node-feature-discovery/templates/worker.yaml @@ -47,6 +47,9 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} livenessProbe: + httpGet: + path: /healthz + port: http {{- with .Values.worker.livenessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . }} {{- end }} @@ -60,6 +63,9 @@ spec: timeoutSeconds: {{ . }} {{- end }} readinessProbe: + httpGet: + path: /healthz + port: http {{- with .Values.worker.readinessProbe.initialDelaySeconds }} initialDelaySeconds: {{ . }} {{- end }} diff --git a/pkg/nfd-worker/nfd-worker.go b/pkg/nfd-worker/nfd-worker.go index b275e2dd3e..8021cc46e5 100644 --- a/pkg/nfd-worker/nfd-worker.go +++ b/pkg/nfd-worker/nfd-worker.go @@ -203,6 +203,10 @@ func newDefaultConfig() *NFDConfig { } } +func (w *nfdWorker) Healthz(writer http.ResponseWriter, _ *http.Request) { + writer.WriteHeader(http.StatusOK) +} + func (i *infiniteTicker) Reset(d time.Duration) { switch { case d > 0: @@ -316,7 +320,8 @@ func (w *nfdWorker) Run() error { return nil } - // Start readiness probe (at this point we're "ready and live") + // Register health endpoint (at this point we're "ready and live") + httpMux.HandleFunc("/healthz", w.Healthz) // Start HTTP server httpServer := http.Server{Addr: fmt.Sprintf(":%d", w.args.Port), Handler: httpMux}