Skip to content

Commit

Permalink
Add and remove exclude-from-external-load-balancers label (#582)
Browse files Browse the repository at this point in the history
* add and remove exclude-from-external-load-balancers label

* move dry run after checking the label's value

* update helm and helm README

* wording nit
  • Loading branch information
tjs-intel authored Feb 11, 2022
1 parent 35ced6c commit 416da0b
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 5 deletions.
1 change: 1 addition & 0 deletions config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
| `dryRun` | If `true`, only log if a node would be drained. | `false` |
| `cordonOnly` | If `true`, nodes will be cordoned but not drained when an interruption event occurs. | `false` |
| `taintNode` | If `true`, nodes will be tainted when an interruption event occurs. Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance`, `aws-node-termination-handler/spot-itn`, `aws-node-termination-handler/asg-lifecycle-termination` and `aws-node-termination-handler/rebalance-recommendation`. | `false` |
| `excludeFromLoadBalancers` | If `true`, nodes will be marked for exclusion from load balancers before they are cordoned. This applies the `node.kubernetes.io/exclude-from-external-load-balancers` label to enable the ServiceNodeExclusion feature gate. The label will not be modified or removed for nodes that already have it. | `false` |
| `deleteLocalData` | If `true`, continue even if there are pods using local data that will be deleted when the node is drained. | `true` |
| `ignoreDaemonSets` | If `true`, skip terminating daemon set managed pods. | `true` |
| `podTerminationGracePeriod` | The time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used, which defaults to 30 seconds if not specified for the pod. | `-1` |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
value: {{ .Values.deleteLocalData | quote }}
- name: IGNORE_DAEMON_SETS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
value: {{ .Values.deleteLocalData | quote }}
- name: IGNORE_DAEMON_SETS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ spec:
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: EXCLUDE_FROM_LOAD_BALANCERS
value: {{ .Values.excludeFromLoadBalancers | quote }}
- name: DELETE_LOCAL_DATA
value: {{ .Values.deleteLocalData | quote }}
- name: IGNORE_DAEMON_SETS
Expand Down
3 changes: 3 additions & 0 deletions config/helm/aws-node-termination-handler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ cordonOnly: false
# Taint node upon spot interruption termination notice.
taintNode: false

# Exclude the node from load balancers before cordoning, via the ServiceNodeExclusion feature gate.
excludeFromLoadBalancers: false

# deleteLocalData tells kubectl to continue even if there are pods using
# emptyDir (local data that will be deleted when the node is drained).
deleteLocalData: true
Expand Down
6 changes: 6 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ const (
taintNode = "TAINT_NODE"
taintEffectDefault = "NoSchedule"
taintEffect = "TAINT_EFFECT"
excludeFromLoadBalancers = "EXCLUDE_FROM_LOAD_BALANCERS"
jsonLoggingConfigKey = "JSON_LOGGING"
jsonLoggingDefault = false
logLevelConfigKey = "LOG_LEVEL"
Expand Down Expand Up @@ -126,6 +127,7 @@ type Config struct {
CordonOnly bool
TaintNode bool
TaintEffect string
ExcludeFromLoadBalancers bool
JsonLogging bool
LogLevel string
UptimeFromFile string
Expand Down Expand Up @@ -179,6 +181,7 @@ func ParseCliArgs() (config Config, err error) {
flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
flag.StringVar(&config.TaintEffect, "taint-effect", getEnv(taintEffect, taintEffectDefault), "Sets the effect when a node is tainted.")
flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).")
Expand Down Expand Up @@ -254,6 +257,7 @@ func (c Config) PrintJsonConfigArgs() {
Bool("cordon_only", c.CordonOnly).
Bool("taint_node", c.TaintNode).
Str("taint_effect", c.TaintEffect).
Bool("exclude_from_load_balancers", c.ExcludeFromLoadBalancers).
Bool("json_logging", c.JsonLogging).
Str("log_level", c.LogLevel).
Str("webhook_proxy", c.WebhookProxy).
Expand Down Expand Up @@ -298,6 +302,7 @@ func (c Config) PrintHumanConfigArgs() {
"\tcordon-only: %t,\n"+
"\ttaint-node: %t,\n"+
"\ttaint-effect: %s,\n"+
"\texclude-from-load-balancers: %t,\n"+
"\tjson-logging: %t,\n"+
"\tlog-level: %s,\n"+
"\twebhook-proxy: %s,\n"+
Expand Down Expand Up @@ -333,6 +338,7 @@ func (c Config) PrintHumanConfigArgs() {
c.CordonOnly,
c.TaintNode,
c.TaintEffect,
c.ExcludeFromLoadBalancers,
c.JsonLogging,
c.LogLevel,
c.WebhookProxy,
Expand Down
78 changes: 73 additions & 5 deletions pkg/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ const (
ActionLabelTimeKey = "aws-node-termination-handler/action-time"
// EventIDLabelKey is a k8s label key whose value is the drainable event id
EventIDLabelKey = "aws-node-termination-handler/event-id"
// ExcludeFromLoadBalancersLabelKey is the k8s label key applied to enable the ServiceNodeExclusion feature gate for excluding nodes from load balancers
ExcludeFromLoadBalancersLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers"
// ExcludeFromLoadBalancersLabelValue is the value NTH sets for the exclusion label; the specific value is irrelevant for enabling the feature gate
// Using a unique value makes it possible to check whether the label was applied by us before removing it
ExcludeFromLoadBalancersLabelValue = "aws-node-termination-handler"
)

const (
Expand Down Expand Up @@ -95,7 +100,11 @@ func (n Node) CordonAndDrain(nodeName string, reason string) error {
log.Info().Str("node_name", nodeName).Str("reason", reason).Msg("Node would have been cordoned and drained, but dry-run flag was set.")
return nil
}
err := n.Cordon(nodeName, reason)
err := n.MaybeMarkForExclusionFromLoadBalancers(nodeName)
if err != nil {
return err
}
err = n.Cordon(nodeName, reason)
if err != nil {
return err
}
Expand Down Expand Up @@ -161,13 +170,26 @@ func (n Node) IsUnschedulable(nodeName string) (bool, error) {

// MarkWithEventID will add the drain event ID to the node to be properly ignored after a system restart event
func (n Node) MarkWithEventID(nodeName string, eventID string) error {
	// skipExisting=false: always write the label so the most recent drain event ID wins,
	// preserving the overwrite behavior that existed before the skipExisting parameter was added.
	err := n.addLabel(nodeName, EventIDLabelKey, eventID, false)
	if err != nil {
		return fmt.Errorf("Unable to label node with event ID %s=%s: %w", EventIDLabelKey, eventID, err)
	}
	return nil
}

// MaybeMarkForExclusionFromLoadBalancers applies the load-balancer exclusion label to the
// node (activating the ServiceNodeExclusion feature gate for it) when the
// ExcludeFromLoadBalancers configuration flag is enabled; otherwise it is a no-op.
func (n Node) MaybeMarkForExclusionFromLoadBalancers(nodeName string) error {
	if !n.nthConfig.ExcludeFromLoadBalancers {
		log.Debug().Msg("Not marking for exclusion from load balancers because the configuration flag is not set")
		return nil
	}
	// skipExisting=true: a label that is already present (whoever set it) is left untouched.
	if err := n.addLabel(nodeName, ExcludeFromLoadBalancersLabelKey, ExcludeFromLoadBalancersLabelValue, true); err != nil {
		return fmt.Errorf("Unable to label node for exclusion from load balancers: %w", err)
	}
	return nil
}

// RemoveNTHLabels will remove all the custom NTH labels added to the node
func (n Node) RemoveNTHLabels(nodeName string) error {
for _, label := range []string{EventIDLabelKey, ActionLabelKey, ActionLabelTimeKey} {
Expand All @@ -176,6 +198,10 @@ func (n Node) RemoveNTHLabels(nodeName string) error {
return fmt.Errorf("Unable to remove %s from node: %w", label, err)
}
}
err := n.removeLabelIfValueMatches(nodeName, ExcludeFromLoadBalancersLabelKey, ExcludeFromLoadBalancersLabelValue)
if err != nil {
return fmt.Errorf("Unable to remove %s from node: %w", ExcludeFromLoadBalancersLabelKey, err)
}
return nil
}

Expand All @@ -199,12 +225,12 @@ func (n Node) GetEventID(nodeName string) (string, error) {
// MarkForUncordonAfterReboot adds labels to the kubernetes node which NTH will read upon reboot
func (n Node) MarkForUncordonAfterReboot(nodeName string) error {
// adds label to node so that the system will uncordon the node after the scheduled reboot has taken place
err := n.addLabel(nodeName, ActionLabelKey, UncordonAfterRebootLabelVal)
err := n.addLabel(nodeName, ActionLabelKey, UncordonAfterRebootLabelVal, false)
if err != nil {
return fmt.Errorf("Unable to label node with action to uncordon after system-reboot: %w", err)
}
// adds label with the current time which is checked against the uptime of the node when processing labels on startup
err = n.addLabel(nodeName, ActionLabelTimeKey, strconv.FormatInt(time.Now().Unix(), 10))
err = n.addLabel(nodeName, ActionLabelTimeKey, strconv.FormatInt(time.Now().Unix(), 10), false)
if err != nil {
// if time can't be recorded, rollback the action label
err := n.removeLabel(nodeName, ActionLabelKey)
Expand All @@ -218,7 +244,8 @@ func (n Node) MarkForUncordonAfterReboot(nodeName string) error {
}

// addLabel will add a label to the node given a label key and value
func (n Node) addLabel(nodeName string, key string, value string) error {
// Specifying true for the skipExisting parameter will skip adding the label if it already exists
func (n Node) addLabel(nodeName string, key string, value string, skipExisting bool) error {
type metadata struct {
Labels map[string]string `json:"labels"`
}
Expand All @@ -240,6 +267,12 @@ func (n Node) addLabel(nodeName string, key string, value string) error {
if err != nil {
return err
}
if skipExisting {
_, ok := node.ObjectMeta.Labels[key]
if ok {
return nil
}
}
if n.nthConfig.DryRun {
log.Info().Msgf("Would have added label (%s=%s) to node %s, but dry-run flag was set", key, value, nodeName)
return nil
Expand Down Expand Up @@ -282,6 +315,41 @@ func (n Node) removeLabel(nodeName string, key string) error {
return nil
}

// removeLabelIfValueMatches will remove a node label given a label key provided the label's value equals matchValue
// A label that is absent, or whose value differs from matchValue (i.e. was not applied by us), is left untouched.
func (n Node) removeLabelIfValueMatches(nodeName string, key string, matchValue string) error {
	node, err := n.fetchKubernetesNode(nodeName)
	if err != nil {
		return err
	}
	// BUG FIX: the guard was inverted (`val == matchValue`), which skipped removal exactly
	// when the label carried our value and removed labels owned by someone else. Only
	// remove the label when it exists AND holds the value we set.
	val, ok := node.ObjectMeta.Labels[key]
	if !ok || val != matchValue {
		return nil
	}
	if n.nthConfig.DryRun {
		log.Info().Msgf("Would have removed label with key %s from node %s, but dry-run flag was set", key, nodeName)
		return nil
	}
	type patchRequest struct {
		Op   string `json:"op"`
		Path string `json:"path"`
	}
	// Build the JSON Patch "remove" op only once we know we will use it; the label key is
	// escaped for use in a JSON Pointer path (RFC 6901: "~" -> "~0", "/" -> "~1").
	payload, err := json.Marshal([]interface{}{patchRequest{
		Op:   "remove",
		Path: fmt.Sprintf("/metadata/labels/%s", jsonPatchEscape(key)),
	}})
	if err != nil {
		return fmt.Errorf("An error occurred while marshalling the json to remove a label from the node: %w", err)
	}
	_, err = n.drainHelper.Client.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.JSONPatchType, payload, metav1.PatchOptions{})
	if err != nil {
		return fmt.Errorf("%v node Patch failed when removing a label from the node: %w", node.Name, err)
	}
	return nil
}

// GetNodeLabels will fetch node labels for a given nodeName
func (n Node) GetNodeLabels(nodeName string) (map[string]string, error) {
if n.nthConfig.DryRun {
Expand Down

0 comments on commit 416da0b

Please sign in to comment.