Skip to content

Commit

Permalink
add medik8s/self-node-remediation
Browse files: browse the repository at this point in the history
Signed-off-by: Sebastian Hoß <[email protected]>
  • Loading branch information
sebhoss committed Apr 5, 2024
1 parent f006958 commit 9bab501
Show file tree
Hide file tree
Showing 12 changed files with 528 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .reuse/dep5
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,10 @@ Files: crd-catalog/medik8s/node-healthcheck-operator/*
Copyright: The medik8s/node-healthcheck-operator Authors
License: Apache-2.0

Files: crd-catalog/medik8s/self-node-remediation/*
Copyright: The medik8s/self-node-remediation Authors
License: Apache-2.0

Files: crd-catalog/metacontroller/metacontroller/*
Copyright: The metacontroller/metacontroller Authors
License: Apache-2.0
Expand Down
9 changes: 9 additions & 0 deletions code-generator/src/catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2300,6 +2300,15 @@ pub const CRD_V1_SOURCES: &'static [UpstreamSource] = &[
"https://github.com/medik8s/node-healthcheck-operator/blob/main/config/crd/bases/remediation.medik8s.io_nodehealthchecks.yaml",
],
},
// Catalog entry for the medik8s/self-node-remediation project: lists the three
// upstream CRD manifests (configs, remediations, remediation templates) under
// config/crd/bases on the project's main branch, licensed under APACHE_V2.
UpstreamSource {
project_name: "medik8s/self-node-remediation",
license: APACHE_V2,
urls: &[
"https://github.com/medik8s/self-node-remediation/blob/main/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml",
"https://github.com/medik8s/self-node-remediation/blob/main/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediations.yaml",
"https://github.com/medik8s/self-node-remediation/blob/main/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationtemplates.yaml",
],
},
UpstreamSource {
project_name: "metacontroller/metacontroller",
license: APACHE_V2,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# CustomResourceDefinition for the namespaced SelfNodeRemediationConfig kind
# (group self-node-remediation.medik8s.io, version v1alpha1).
apiVersion: "apiextensions.k8s.io/v1"
kind: "CustomResourceDefinition"
metadata:
  annotations:
    controller-gen.kubebuilder.io/version: "v0.14.0"
  name: "selfnoderemediationconfigs.self-node-remediation.medik8s.io"
spec:
  group: "self-node-remediation.medik8s.io"
  names:
    kind: "SelfNodeRemediationConfig"
    listKind: "SelfNodeRemediationConfigList"
    plural: "selfnoderemediationconfigs"
    shortNames:
      - "snrc"
      - "snrconfig"
    singular: "selfnoderemediationconfig"
  scope: "Namespaced"
  versions:
    - name: "v1alpha1"
      schema:
        openAPIV3Schema:
          description: "SelfNodeRemediationConfig is the Schema for the selfnoderemediationconfigs API in which a user can configure the self node remediation agents"
          properties:
            apiVersion:
              description: "APIVersion defines the versioned schema of this representation of an object.\nServers should convert recognized schemas to the latest internal value, and\nmay reject unrecognized values.\nMore info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources"
              type: "string"
            kind:
              description: "Kind is a string value representing the REST resource this object represents.\nServers may infer this from the endpoint the client submits requests to.\nCannot be updated.\nIn CamelCase.\nMore info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"
              type: "string"
            metadata:
              type: "object"
            spec:
              description: "SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig"
              properties:
                apiCheckInterval:
                  default: "15s"
                  description: "the frequency for api-server connectivity check\nValid time units are \"ms\", \"s\", \"m\", \"h\".\nthe frequency for api-server connectivity check"
                  pattern: "^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$"
                  type: "string"
                apiServerTimeout:
                  default: "5s"
                  description: "Valid time units are \"ms\", \"s\", \"m\", \"h\".\ntimeout for each api-connectivity check"
                  pattern: "^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$"
                  type: "string"
                customDsTolerations:
                  description: "CustomDsTolerations allows to add custom tolerations snr agents that are running on the ds in order to support remediation for different types of nodes."
                  items:
                    description: "The pod this Toleration is attached to tolerates any taint that matches\nthe triple <key,value,effect> using the matching operator <operator>."
                    properties:
                      effect:
                        description: "Effect indicates the taint effect to match. Empty means match all taint effects.\nWhen specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute."
                        type: "string"
                      key:
                        description: "Key is the taint key that the toleration applies to. Empty means match all taint keys.\nIf the key is empty, operator must be Exists; this combination means to match all values and all keys."
                        type: "string"
                      operator:
                        description: "Operator represents a key's relationship to the value.\nValid operators are Exists and Equal. Defaults to Equal.\nExists is equivalent to wildcard for value, so that a pod can\ntolerate all taints of a particular category."
                        type: "string"
                      tolerationSeconds:
                        description: "TolerationSeconds represents the period of time the toleration (which must be\nof effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,\nit is not set, which means tolerate the taint forever (do not evict). Zero and\nnegative values will be treated as 0 (evict immediately) by the system."
                        format: "int64"
                        type: "integer"
                      value:
                        description: "Value is the taint value the toleration matches to.\nIf the operator is Exists, the value should be empty, otherwise just a regular string."
                        type: "string"
                    type: "object"
                  type: "array"
                endpointHealthCheckUrl:
                  description: "EndpointHealthCheckUrl is an url that self node remediation agents which run on control-plane node will try to access when they can't contact their peers.\nThis is a part of self diagnostics which will decide whether the node should be remediated or not.\nIt will be ignored when empty (which is the default)."
                  type: "string"
                hostPort:
                  default: 30001
                  description: "HostPort is used for internal communication between SNR agents."
                  minimum: 1.0
                  type: "integer"
                isSoftwareRebootEnabled:
                  default: true
                  description: "IsSoftwareRebootEnabled indicates whether self node remediation agent will do software reboot,\nif the watchdog device can not be used or will use watchdog only,\nwithout a fallback to software reboot"
                  type: "boolean"
                maxApiErrorThreshold:
                  default: 3
                  description: "after this threshold, the node will start contacting its peers"
                  minimum: 1.0
                  type: "integer"
                peerApiServerTimeout:
                  default: "5s"
                  description: "Valid time units are \"ms\", \"s\", \"m\", \"h\"."
                  pattern: "^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$"
                  type: "string"
                peerDialTimeout:
                  default: "5s"
                  description: "Valid time units are \"ms\", \"s\", \"m\", \"h\".\ntimeout for establishing connection to peer"
                  pattern: "^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$"
                  type: "string"
                peerRequestTimeout:
                  default: "5s"
                  description: "Valid time units are \"ms\", \"s\", \"m\", \"h\".\ntimeout for each peer request"
                  pattern: "^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$"
                  type: "string"
                peerUpdateInterval:
                  default: "15m"
                  description: "Valid time units are \"ms\", \"s\", \"m\", \"h\"."
                  pattern: "^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$"
                  type: "string"
                safeTimeToAssumeNodeRebootedSeconds:
                  default: 180
                  description: "SafeTimeToAssumeNodeRebootedSeconds is the time after which the healthy self node remediation\nagents will assume the unhealthy node has been rebooted, and it is safe to recover affected workloads.\nThis is extremely important as starting replacement Pods while they are still running on the failed\nnode will likely lead to data corruption and violation of run-once semantics.\nIn an effort to prevent this, the operator ignores values lower than a minimum calculated from the\nApiCheckInterval, ApiServerTimeout, MaxApiErrorThreshold, PeerDialTimeout, and PeerRequestTimeout fields."
                  minimum: 0.0
                  type: "integer"
                watchdogFilePath:
                  default: "/dev/watchdog"
                  description: "WatchdogFilePath is the watchdog file path that should be available on each node, e.g. /dev/watchdog"
                  type: "string"
              type: "object"
            status:
              description: "SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig"
              type: "object"
          type: "object"
      served: true
      storage: true
      # Enables the /status subresource for this version.
      subresources:
        status: {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# CustomResourceDefinition for the namespaced SelfNodeRemediation kind
# (group self-node-remediation.medik8s.io, version v1alpha1).
apiVersion: "apiextensions.k8s.io/v1"
kind: "CustomResourceDefinition"
metadata:
  annotations:
    controller-gen.kubebuilder.io/version: "v0.14.0"
  name: "selfnoderemediations.self-node-remediation.medik8s.io"
spec:
  group: "self-node-remediation.medik8s.io"
  names:
    kind: "SelfNodeRemediation"
    listKind: "SelfNodeRemediationList"
    plural: "selfnoderemediations"
    shortNames:
      - "snr"
      - "snremediation"
    singular: "selfnoderemediation"
  scope: "Namespaced"
  versions:
    - name: "v1alpha1"
      schema:
        openAPIV3Schema:
          description: "SelfNodeRemediation is the Schema for the selfnoderemediations API"
          properties:
            apiVersion:
              description: "APIVersion defines the versioned schema of this representation of an object.\nServers should convert recognized schemas to the latest internal value, and\nmay reject unrecognized values.\nMore info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources"
              type: "string"
            kind:
              description: "Kind is a string value representing the REST resource this object represents.\nServers may infer this from the endpoint the client submits requests to.\nCannot be updated.\nIn CamelCase.\nMore info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"
              type: "string"
            metadata:
              type: "object"
            spec:
              description: "SelfNodeRemediationSpec defines the desired state of SelfNodeRemediation"
              properties:
                remediationStrategy:
                  default: "Automatic"
                  description: "RemediationStrategy is the remediation method for unhealthy nodes.\nCurrently, it could be either \"Automatic\", \"OutOfServiceTaint\" or \"ResourceDeletion\".\nResourceDeletion will iterate over all pods and VolumeAttachment related to the unhealthy node and delete them.\nOutOfServiceTaint will add the out-of-service taint which is a new well-known taint \"node.kubernetes.io/out-of-service\"\nthat enables automatic deletion of pv-attached pods on failed nodes, \"out-of-service\" taint is only supported on clusters with k8s version 1.26+ or OCP/OKD version 4.13+.\nAutomatic will choose the most appropriate strategy during runtime."
                  enum:
                    - "Automatic"
                    - "ResourceDeletion"
                    - "OutOfServiceTaint"
                  type: "string"
              type: "object"
            status:
              description: "SelfNodeRemediationStatus defines the observed state of SelfNodeRemediation"
              properties:
                conditions:
                  description: "Represents the observations of a SelfNodeRemediation's current state.\nKnown .status.conditions.type are: \"Processing\""
                  items:
                    description: "Condition contains details for one aspect of the current state of this API Resource.\n---\nThis struct is intended for direct use as an array at the field path .status.conditions. For example,\n\n\n\ttype FooStatus struct{\n\t // Represents the observations of a foo's current state.\n\t // Known .status.conditions.type are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t // other fields\n\t}"
                    properties:
                      lastTransitionTime:
                        description: "lastTransitionTime is the last time the condition transitioned from one status to another.\nThis should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable."
                        format: "date-time"
                        type: "string"
                      message:
                        description: "message is a human readable message indicating details about the transition.\nThis may be an empty string."
                        maxLength: 32768
                        type: "string"
                      observedGeneration:
                        description: "observedGeneration represents the .metadata.generation that the condition was set based upon.\nFor instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date\nwith respect to the current state of the instance."
                        format: "int64"
                        minimum: 0.0
                        type: "integer"
                      reason:
                        description: "reason contains a programmatic identifier indicating the reason for the condition's last transition.\nProducers of specific condition types may define expected values and meanings for this field,\nand whether the values are considered a guaranteed API.\nThe value should be a CamelCase string.\nThis field may not be empty."
                        maxLength: 1024
                        minLength: 1
                        pattern: "^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$"
                        type: "string"
                      status:
                        description: "status of the condition, one of True, False, Unknown."
                        enum:
                          - "True"
                          - "False"
                          - "Unknown"
                        type: "string"
                      type:
                        description: "type of condition in CamelCase or in foo.example.com/CamelCase.\n---\nMany .condition.type values are consistent across resources like Available, but because arbitrary conditions can be\nuseful (see .node.status.conditions), the ability to deconflict is important.\nThe regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt)"
                        maxLength: 316
                        pattern: "^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$"
                        type: "string"
                    required:
                      - "lastTransitionTime"
                      - "message"
                      - "reason"
                      - "status"
                      - "type"
                    type: "object"
                  type: "array"
                  # The conditions list is a map-type list keyed by "type".
                  x-kubernetes-list-map-keys:
                    - "type"
                  x-kubernetes-list-type: "map"
                lastError:
                  description: "LastError captures the last error that occurred during remediation.\nIf no error occurred it would be empty"
                  type: "string"
                phase:
                  description: "Phase represents the current phase of remediation,\nOne of: TBD"
                  type: "string"
                timeAssumedRebooted:
                  description: "TimeAssumedRebooted is the time by then the unhealthy node assumed to be rebooted"
                  format: "date-time"
                  type: "string"
              type: "object"
          type: "object"
      served: true
      storage: true
      # Enables the /status subresource for this version.
      subresources:
        status: {}
Loading

0 comments on commit 9bab501

Please sign in to comment.