From 070ae40646f1ad8565302bdaa0193d20ab35a5a1 Mon Sep 17 00:00:00 2001 From: Aline Abler Date: Tue, 30 Apr 2024 18:01:34 +0200 Subject: [PATCH] Add documentation for scheduling a node drain when messing with nodes on a prod cluster --- .../pages/how-tos/cloudscale/remove_node.adoc | 79 +++++++------ .../cloudscale/replace-storage-node.adoc | 4 +- .../exoscale/change_storage_node_size.adoc | 4 +- .../pages/how-tos/exoscale/remove_node.adoc | 79 +++++++------ .../how-tos/exoscale/remove_storage_node.adoc | 4 +- .../exoscale/replace_storage_node.adoc | 4 +- .../{delete-node.adoc => delete-node-vm.adoc} | 31 ----- .../ROOT/partials/drain-node-immediately.adoc | 31 +++++ .../ROOT/partials/drain-node-scheduled.adoc | 106 ++++++++++++++++++ 9 files changed, 237 insertions(+), 105 deletions(-) rename docs/modules/ROOT/partials/{delete-node.adoc => delete-node-vm.adoc} (67%) create mode 100644 docs/modules/ROOT/partials/drain-node-immediately.adoc create mode 100644 docs/modules/ROOT/partials/drain-node-scheduled.adoc diff --git a/docs/modules/ROOT/pages/how-tos/cloudscale/remove_node.adoc b/docs/modules/ROOT/pages/how-tos/cloudscale/remove_node.adoc index cb3d4738..2d93769b 100644 --- a/docs/modules/ROOT/pages/how-tos/cloudscale/remove_node.adoc +++ b/docs/modules/ROOT/pages/how-tos/cloudscale/remove_node.adoc @@ -18,6 +18,12 @@ Steps to remove a worker node of an OpenShift 4 cluster on https://cloudscale.ch * You have admin-level access to the cluster * You want to remove an existing worker node in the cluster +== High-level overview + +* First, we identify the correct node to remove and drain it. +* Then, we remove it from Kubernetes. +* Finally, we remove the associated VMs. 
+ == Prerequisites include::partial$cloudscale/prerequisites.adoc[] @@ -26,6 +32,42 @@ include::partial$cloudscale/prerequisites.adoc[] include::partial$cloudscale/setup-local-env.adoc[] +== Prepare Terraform environment + +include::partial$cloudscale/configure-terraform-secrets.adoc[] + +include::partial$setup_terraform.adoc[] + +== Drain and Remove Node + +* Find the node you want to remove. +It has to be the one with the highest terraform index. ++ +[source,bash] +---- +# Grab JSON copy of current Terraform state +terraform state pull > .tfstate.json + +export NODE_TO_REMOVE=$(jq -r \ + '.resources[] | + select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") | + .instances[-1] | + .attributes.name | split(".") | first' \ + .tfstate.json) +echo $NODE_TO_REMOVE +---- + +* If you are working on a production cluster, you need to *schedule the node drain for the next maintenance.* +* If you are working on a non-production cluster, you may *drain and remove the node immediately.* + +=== Schedule node drain (production clusters) + +include::partial$drain-node-scheduled.adoc[] + +=== Drain and remove node immediately + +include::partial$drain-node-immediately.adoc[] + == Update Cluster Config . Update cluster config. @@ -58,39 +100,6 @@ popd commodore catalog compile ${CLUSTER_ID} --push -i ---- -== Prepare Terraform environment - -include::partial$cloudscale/configure-terraform-secrets.adoc[] - -include::partial$setup_terraform.adoc[] - -== Remove Node - -* Find the node you want to remove. -It has to be the one with the highest terraform index. -+ -[source,bash] ----- -# Grab JSON copy of current Terraform state -terraform state pull > .tfstate.json - -node_count=$(jq -r \ - '.resources[] | - select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") | - .instances | length' \ - .tfstate.json) -# Verify that the number of nodes is one more than we configured earlier. 
-echo $node_count - -export NODE_TO_REMOVE=$(jq --arg index "$node_count" -r \ - '.resources[] | - select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") | - .instances[$index|tonumber-1] | - .attributes.name | split(".") | first' \ - .tfstate.json) -echo $NODE_TO_REMOVE ----- - -=== Remove VM +== Remove VM -include::partial$delete-node.adoc[] +include::partial$delete-node-vm.adoc[] diff --git a/docs/modules/ROOT/pages/how-tos/cloudscale/replace-storage-node.adoc b/docs/modules/ROOT/pages/how-tos/cloudscale/replace-storage-node.adoc index 39e44bd2..b22ba2c7 100644 --- a/docs/modules/ROOT/pages/how-tos/cloudscale/replace-storage-node.adoc +++ b/docs/modules/ROOT/pages/how-tos/cloudscale/replace-storage-node.adoc @@ -70,7 +70,9 @@ include::partial$storage-ceph-remove-mon.adoc[] === Clean up the old node -include::partial$delete-node.adoc[] +include::partial$drain-node-immediately.adoc[] + +include::partial$delete-node-vm.adoc[] == Finish up diff --git a/docs/modules/ROOT/pages/how-tos/exoscale/change_storage_node_size.adoc b/docs/modules/ROOT/pages/how-tos/exoscale/change_storage_node_size.adoc index 5e9e16bc..20c3ddc9 100644 --- a/docs/modules/ROOT/pages/how-tos/exoscale/change_storage_node_size.adoc +++ b/docs/modules/ROOT/pages/how-tos/exoscale/change_storage_node_size.adoc @@ -197,7 +197,9 @@ include::partial$storage-ceph-remove-mon.adoc[] === Clean up the old nodes -include::partial$delete-node.adoc[] +include::partial$drain-node-immediately.adoc[] + +include::partial$delete-node-vm.adoc[] == Finish up diff --git a/docs/modules/ROOT/pages/how-tos/exoscale/remove_node.adoc b/docs/modules/ROOT/pages/how-tos/exoscale/remove_node.adoc index 7077781f..ecb4a7f0 100644 --- a/docs/modules/ROOT/pages/how-tos/exoscale/remove_node.adoc +++ b/docs/modules/ROOT/pages/how-tos/exoscale/remove_node.adoc @@ -19,6 +19,12 @@ Steps to remove a worker node of an OpenShift 4 cluster on https://www.exoscale. 
* You have admin-level access to the cluster * You want to remove an existing worker node in the cluster +== High-level overview + +* First, we identify the correct node to remove and drain it. +* Then, we remove it from Kubernetes. +* Finally, we remove the associated VMs. + == Prerequisites include::partial$exoscale/prerequisites.adoc[] @@ -27,6 +33,42 @@ include::partial$exoscale/prerequisites.adoc[] include::partial$exoscale/setup-local-env.adoc[] +== Prepare Terraform environment + +include::partial$exoscale/configure-terraform-secrets.adoc[] + +include::partial$setup_terraform.adoc[] + +== Drain and Remove Node + +* Find the node you want to remove. +It has to be the one with the highest terraform index. ++ +[source,bash] +---- +# Grab JSON copy of current Terraform state +terraform state pull > .tfstate.json + +export NODE_TO_REMOVE=$(jq -r \ + '.resources[] | + select(.module=="module.cluster.module.worker" and .type=="exoscale_compute") | + .instances[-1] | + .attributes.hostname' \ + .tfstate.json) +echo $NODE_TO_REMOVE +---- + +* If you are working on a production cluster, you need to *schedule the node drain for the next maintenance.* +* If you are working on a non-production cluster, you may *drain and remove the node immediately.* + +=== Schedule node drain (production clusters) + +include::partial$drain-node-scheduled.adoc[] + +=== Drain and remove node immediately + +include::partial$drain-node-immediately.adoc[] + == Update Cluster Config . Update cluster config. @@ -59,39 +101,6 @@ popd commodore catalog compile ${CLUSTER_ID} --push -i ---- -== Prepare Terraform environment - -include::partial$exoscale/configure-terraform-secrets.adoc[] - -include::partial$setup_terraform.adoc[] - -== Remove Node - -* Find the node you want to remove. -It has to be the one with the highest terraform index. 
-+ -[source,bash] ----- -# Grab JSON copy of current Terraform state -terraform state pull > .tfstate.json - -node_count=$(jq -r \ - '.resources[] | - select(.module=="module.cluster.module.worker" and .type=="exoscale_compute") | - .instances | length' \ - .tfstate.json) -# Verify that the number of nodes is one more than we configured earlier. -echo $node_count - -export NODE_TO_REMOVE=$(jq --arg index "$node_count" -r \ - '.resources[] | - select(.module=="module.cluster.module.worker" and .type=="exoscale_compute") | - .instances[$index|tonumber-1] | - .attributes.hostname' \ - .tfstate.json) -echo $NODE_TO_REMOVE ----- - -=== Remove VM +== Remove VM -include::partial$delete-node.adoc[] +include::partial$delete-node-vm.adoc[] diff --git a/docs/modules/ROOT/pages/how-tos/exoscale/remove_storage_node.adoc b/docs/modules/ROOT/pages/how-tos/exoscale/remove_storage_node.adoc index 796e0299..66a51422 100644 --- a/docs/modules/ROOT/pages/how-tos/exoscale/remove_storage_node.adoc +++ b/docs/modules/ROOT/pages/how-tos/exoscale/remove_storage_node.adoc @@ -151,7 +151,9 @@ include::partial$storage-ceph-remove-mon.adoc[] === Remove VM -include::partial$delete-node.adoc[] +include::partial$drain-node-immediately.adoc[] + +include::partial$delete-node-vm.adoc[] == Finish up diff --git a/docs/modules/ROOT/pages/how-tos/exoscale/replace_storage_node.adoc b/docs/modules/ROOT/pages/how-tos/exoscale/replace_storage_node.adoc index ed044f61..16a49d69 100644 --- a/docs/modules/ROOT/pages/how-tos/exoscale/replace_storage_node.adoc +++ b/docs/modules/ROOT/pages/how-tos/exoscale/replace_storage_node.adoc @@ -102,7 +102,9 @@ include::partial$storage-ceph-remove-mon.adoc[] === Clean up the old node -include::partial$delete-node.adoc[] +include::partial$drain-node-immediately.adoc[] + +include::partial$delete-node-vm.adoc[] == Finish up diff --git a/docs/modules/ROOT/partials/delete-node.adoc b/docs/modules/ROOT/partials/delete-node-vm.adoc similarity index 67% rename from 
docs/modules/ROOT/partials/delete-node.adoc rename to docs/modules/ROOT/partials/delete-node-vm.adoc index 53d7d0de..6e032901 100644 --- a/docs/modules/ROOT/partials/delete-node.adoc +++ b/docs/modules/ROOT/partials/delete-node-vm.adoc @@ -1,34 +1,3 @@ -. Drain the node(s) -+ -[source,bash,subs="attributes+"] ----- -for node in $(echo -n {node-delete-list}); do - kubectl --as=cluster-admin drain "${node}" \ - --delete-emptydir-data --ignore-daemonsets -done ----- -+ -ifeval::["{cloud_provider}" == "cloudscale"] -ifeval::["{delete-node-type}" == "storage"] -[TIP] -==== -On cloudscale.ch, we configure Rook Ceph to setup the OSDs in "portable" mode. -This configuration enables OSDs to be scheduled on any storage node. - -With this configuration, we don't have to migrate OSDs hosted on the old node(s) manually. -Instead, draining a node will cause any OSDs hosted on that node to be rescheduled on other storage nodes. -==== -endif::[] -endif::[] - -. Delete the node(s) from the cluster -+ -[source,bash,subs="attributes+"] ----- -for node in $(echo -n {node-delete-list}); do - kubectl --as=cluster-admin delete node "${node}" -done ----- ifeval::["{delete-node-type}" == "storage"] ifeval::["{delete-nodes-manually}" == "yes"] diff --git a/docs/modules/ROOT/partials/drain-node-immediately.adoc b/docs/modules/ROOT/partials/drain-node-immediately.adoc new file mode 100644 index 00000000..4f9a5063 --- /dev/null +++ b/docs/modules/ROOT/partials/drain-node-immediately.adoc @@ -0,0 +1,31 @@ +. Drain the node(s) ++ +[source,bash,subs="attributes+"] +---- +for node in $(echo -n {node-delete-list}); do + kubectl --as=cluster-admin drain "${node}" \ + --delete-emptydir-data --ignore-daemonsets +done +---- ++ +ifeval::["{cloud_provider}" == "cloudscale"] +ifeval::["{delete-node-type}" == "storage"] +[TIP] +==== +On cloudscale.ch, we configure Rook Ceph to setup the OSDs in "portable" mode. +This configuration enables OSDs to be scheduled on any storage node. 
+ +With this configuration, we don't have to migrate OSDs hosted on the old node(s) manually. +Instead, draining a node will cause any OSDs hosted on that node to be rescheduled on other storage nodes. +==== +endif::[] +endif::[] + +. Delete the node(s) from the cluster ++ +[source,bash,subs="attributes+"] +---- +for node in $(echo -n {node-delete-list}); do + kubectl --as=cluster-admin delete node "${node}" +done +---- diff --git a/docs/modules/ROOT/partials/drain-node-scheduled.adoc b/docs/modules/ROOT/partials/drain-node-scheduled.adoc new file mode 100644 index 00000000..2ed1732d --- /dev/null +++ b/docs/modules/ROOT/partials/drain-node-scheduled.adoc @@ -0,0 +1,106 @@ +. Create an adhoc-config for the UpgradeJobHook that will drain the node. ++ +[source,bash,subs="attributes+"] +---- +pushd "../../../inventory/classes/$TENANT_ID" +cat > manifests/$CLUSTER_ID/drain_node_hook <