diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index af0409ef1..334b298b2 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -103,7 +103,7 @@ func TemplateConfigInterface() ConfigInterface { } // NotifyFn is a function to call when the effective configuration changes. -type NotifyFn func(cfg interface{}) error +type NotifyFn func(cfg interface{}) (bool, error) var ( // Our logger instance for the agent. @@ -544,12 +544,17 @@ func (a *Agent) updateConfig(cfg metav1.Object) { } } - err := a.notifyFn(cfg) + fatal, err := a.notifyFn(cfg) + a.patchConfigStatus(a.currentCfg, cfg, err) + if err != nil { - log.Errorf("failed to apply configuration: %v", err) + if fatal { + log.Fatalf("failed to apply configuration: %v", err) + } else { + log.Errorf("failed to apply configuration: %v", err) + } } - a.patchConfigStatus(a.currentCfg, cfg, err) a.currentCfg = cfg } diff --git a/pkg/resmgr/resource-manager.go b/pkg/resmgr/resource-manager.go index cff9c7cfb..b5ba74f23 100644 --- a/pkg/resmgr/resource-manager.go +++ b/pkg/resmgr/resource-manager.go @@ -128,17 +128,17 @@ func (m *resmgr) Start() error { return nil } -func (m *resmgr) updateConfig(newCfg interface{}) error { +func (m *resmgr) updateConfig(newCfg interface{}) (bool, error) { if newCfg == nil { - return fmt.Errorf("can't run without effective configuration...") + return false, fmt.Errorf("can't run without effective configuration...") } cfg, ok := newCfg.(cfgapi.ResmgrConfig) if !ok { if !m.running { - log.Fatalf("got initial configuration of unexpected type %T", newCfg) + return true, fmt.Errorf("got initial configuration of unexpected type %T", newCfg) } else { - return fmt.Errorf("got configuration of unexpected type %T", newCfg) + return false, fmt.Errorf("got configuration of unexpected type %T", newCfg) } } @@ -151,17 +151,17 @@ func (m *resmgr) updateConfig(newCfg interface{}) error { log.InfoBlock(" ", "%s", dump) if err := m.start(cfg); err != nil { - log.Fatalf("failed to start with initial configuration: %v", err) + return true, fmt.Errorf("failed to start with initial configuration: %v", err) } m.running = true - return nil + return false, nil } log.Infof("configuration update %s (generation %d):", meta.GetName(), meta.GetGeneration()) log.InfoBlock(" ", "%s", dump) - return m.reconfigure(cfg) + return false, m.reconfigure(cfg) } // Start resource management once we acquired initial configuration. diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh index 644001707..c8472e046 100644 --- a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh +++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/code.var.sh @@ -1,4 +1,13 @@ -helm-terminate +CONFIG_GROUP="group.test" + +cleanup() { + vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/default" || : + vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/$CONFIG_GROUP" || : + vm-command "kubectl label nodes --all config.nri/group-" || : + helm-terminate || : +} + +cleanup helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware sleep 1 @@ -12,6 +21,7 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \ error "expected initial Success status" } +# verify propagation of errors back to source CR vm-put-file $(RESERVED_CPU=750x instantiate custom-config.yaml) broken-config.yaml vm-command "kubectl apply -f broken-config.yaml" @@ -26,3 +36,19 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \ } helm-terminate + +# verify propagation of initial configuration errors back to source CR +vm-put-file $(CONFIG_NAME="$CONFIG_GROUP" RESERVED_CPU=750x instantiate custom-config.yaml) \ + broken-group-config.yaml +vm-command "kubectl apply -f broken-group-config.yaml" || \ + error "failed to install broken group config" +vm-command "kubectl label nodes --all config.nri/group=test" || \ + error "failed to label nodes for group config" + +expect_error=1 launch_timeout=5s helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware +get-config-node-status-error topologyawarepolicies/$CONFIG_GROUP | \ + grep "failed to parse" | grep -q 750x || { + error "expected initial error not found in status" +} + +cleanup diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in index 720ef00d4..4a5f812de 100644 --- a/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in +++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test12-config-status/custom-config.yaml.in @@ -1,7 +1,7 @@ apiVersion: config.nri/v1alpha1 kind: TopologyAwarePolicy metadata: - name: default + name: ${CONFIG_NAME:-default} namespace: kube-system spec: pinCPU: true diff --git a/test/e2e/run.sh b/test/e2e/run.sh index cee7d8736..d3fc8eced 100755 --- a/test/e2e/run.sh +++ b/test/e2e/run.sh @@ -353,6 +353,7 @@ helm-launch() { # script API # default: 20s # cfgresource: config custom resource to wait for node status to change in # default: balloonspolicies/default or topologyawarepolicies/default + # expect_error: don't exit, expect availability error # # Example: # helm_config=$(instantiate helm-config.yaml) helm-launch balloons @@ -361,6 +362,7 @@ helm-launch() { # script API local helm_config="${helm_config:-$TEST_DIR/helm-config.yaml}" local ds_name="${daemonset_name:-}" ctr_name="${container_name:-nri-resource-policy-$1}" local helm_name="${helm_name:-test}" timeout="${launch_timeout:-20s}" + local expect_error="${expect_error:-0}" local plugin="$1" cfgresource=${cfgresource:-} cfgstatus local deadline shift @@ -403,8 +405,15 @@ helm-launch() { # script API deadline=$(deadline-for-timeout $timeout) vm-command "kubectl wait -n kube-system ds/${ds_name} --timeout=$timeout \ - --for=jsonpath='{.status.numberAvailable}'=1" || \ - error "Timeout while waiting daemonset/${ds_name} to be available" + --for=jsonpath='{.status.numberAvailable}'=1" + + if [ "$COMMAND_STATUS" != "0" ]; then + if [ "$expect_error" != "1" ]; then + error "Timeout while waiting daemonset/${ds_name} to be available" + else + return 1 + fi + fi timeout=$(timeout-for-deadline $deadline) timeout=$timeout wait-config-node-status $cfgresource @@ -413,7 +422,6 @@ helm-launch() { # script API if [ "$result" != "Success" ]; then reason=$(get-config-node-status-error $cfgresource) error "Plugin $plugin configuration failed: $reason" - return 1 fi vm-start-log-collection -n kube-system ds/$ds_name -c $ctr_name @@ -502,7 +510,7 @@ get-config-node-status-result() { get-config-node-status-error() { local resource="$1" node="$(get-hostname-for-vm)" vm-command-q "kubectl get -n kube-system $resource \ - -ojsonpath=\"{.status.nodes['$node'].error}\"" + -ojsonpath=\"{.status.nodes['$node'].errors}\"" } wait-config-node-status() {