Add documentation for scheduling a node drain when messing with nodes on a prod cluster

HappyTetrahedron committed May 7, 2024
1 parent d31ee77 commit 070ae40
Showing 9 changed files with 237 additions and 105 deletions.
79 changes: 44 additions & 35 deletions docs/modules/ROOT/pages/how-tos/cloudscale/remove_node.adoc
@@ -18,6 +18,12 @@ Steps to remove a worker node of an OpenShift 4 cluster on https://cloudscale.ch
* You have admin-level access to the cluster
* You want to remove an existing worker node in the cluster

== High-level overview

* First, we identify the correct node to remove and drain it.
* Then, we remove it from Kubernetes.
* Finally, we remove the associated VMs.

== Prerequisites

include::partial$cloudscale/prerequisites.adoc[]
@@ -26,6 +32,42 @@ include::partial$cloudscale/prerequisites.adoc[]

include::partial$cloudscale/setup-local-env.adoc[]

== Prepare Terraform environment

include::partial$cloudscale/configure-terraform-secrets.adoc[]

include::partial$setup_terraform.adoc[]

== Drain and Remove Node

* Find the node you want to remove.
It must be the one with the highest Terraform index (see the sanity check after this list).
+
[source,bash]
----
# Grab JSON copy of current Terraform state
terraform state pull > .tfstate.json
# Determine how many worker nodes are currently in the Terraform state
node_count=$(jq -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") |
.instances | length' \
.tfstate.json)
echo $node_count
export NODE_TO_REMOVE=$(jq --arg index "$node_count" -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") |
.instances[$index|tonumber-1] |
.attributes.name | split(".") | first' \
.tfstate.json)
echo $NODE_TO_REMOVE
----

* If you are working on a production cluster, you must *schedule the node drain for the next maintenance window.*
* If you are working on a non-production cluster, you may *drain and remove the node immediately.*
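
Before draining anything, double-check the selection.
The following is a minimal sanity check (a sketch which assumes that the Kubernetes node name matches the short hostname extracted above):

[source,bash]
----
# List all worker names in the Terraform state; NODE_TO_REMOVE should
# correspond to the last (highest-index) entry.
jq -r '.resources[] |
select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") |
.instances[].attributes.name' \
.tfstate.json
# Confirm that the selected node actually exists in the cluster.
kubectl --as=cluster-admin get node "${NODE_TO_REMOVE}"
----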

=== Schedule node drain (production clusters)

include::partial$drain-node-scheduled.adoc[]

=== Drain and remove node immediately

include::partial$drain-node-immediately.adoc[]

== Update Cluster Config

. Update cluster config.
@@ -58,39 +100,6 @@ popd
commodore catalog compile ${CLUSTER_ID} --push -i
----

== Prepare Terraform environment

include::partial$cloudscale/configure-terraform-secrets.adoc[]

include::partial$setup_terraform.adoc[]

== Remove Node

* Find the node you want to remove.
It has to be the one with the highest terraform index.
+
[source,bash]
----
# Grab JSON copy of current Terraform state
terraform state pull > .tfstate.json
node_count=$(jq -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") |
.instances | length' \
.tfstate.json)
# Verify that the number of nodes is one more than we configured earlier.
echo $node_count
export NODE_TO_REMOVE=$(jq --arg index "$node_count" -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="cloudscale_server") |
.instances[$index|tonumber-1] |
.attributes.name | split(".") | first' \
.tfstate.json)
echo $NODE_TO_REMOVE
----

=== Remove VM
== Remove VM

include::partial$delete-node.adoc[]
include::partial$delete-node-vm.adoc[]
@@ -70,7 +70,9 @@ include::partial$storage-ceph-remove-mon.adoc[]

=== Clean up the old node

include::partial$delete-node.adoc[]
include::partial$drain-node-immediately.adoc[]

include::partial$delete-node-vm.adoc[]

== Finish up

@@ -197,7 +197,9 @@ include::partial$storage-ceph-remove-mon.adoc[]

=== Clean up the old nodes

include::partial$delete-node.adoc[]
include::partial$drain-node-immediately.adoc[]

include::partial$delete-node-vm.adoc[]

== Finish up

79 changes: 44 additions & 35 deletions docs/modules/ROOT/pages/how-tos/exoscale/remove_node.adoc
@@ -19,6 +19,12 @@ Steps to remove a worker node of an OpenShift 4 cluster on https://www.exoscale.
* You have admin-level access to the cluster
* You want to remove an existing worker node in the cluster

== High-level overview

* First, we identify the correct node to remove and drain it.
* Then, we remove it from Kubernetes.
* Finally, we remove the associated VMs.

== Prerequisites

include::partial$exoscale/prerequisites.adoc[]
@@ -27,6 +33,42 @@ include::partial$exoscale/prerequisites.adoc[]

include::partial$exoscale/setup-local-env.adoc[]

== Prepare Terraform environment

include::partial$exoscale/configure-terraform-secrets.adoc[]

include::partial$setup_terraform.adoc[]

== Drain and Remove Node

* Find the node you want to remove.
It must be the one with the highest Terraform index (see the sanity check after this list).
+
[source,bash]
----
# Grab JSON copy of current Terraform state
terraform state pull > .tfstate.json
# Determine how many worker nodes are currently in the Terraform state
node_count=$(jq -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="exoscale_compute") |
.instances | length' \
.tfstate.json)
echo $node_count
export NODE_TO_REMOVE=$(jq --arg index "$node_count" -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="exoscale_compute") |
.instances[$index|tonumber-1] |
.attributes.hostname' \
.tfstate.json)
echo $NODE_TO_REMOVE
----

* If you are working on a production cluster, you must *schedule the node drain for the next maintenance window.*
* If you are working on a non-production cluster, you may *drain and remove the node immediately.*
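
As a sanity check (a sketch which assumes that the Kubernetes node name matches the hostname stored in the Terraform state), confirm that the selected node exists in the cluster before draining it:

[source,bash]
----
# NODE_TO_REMOVE should be the worker with the highest Terraform index
# and should be reported as a node by the API server.
kubectl --as=cluster-admin get node "${NODE_TO_REMOVE}"
----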

=== Schedule node drain (production clusters)

include::partial$drain-node-scheduled.adoc[]

=== Drain and remove node immediately

include::partial$drain-node-immediately.adoc[]

== Update Cluster Config

. Update cluster config.
@@ -59,39 +101,6 @@ popd
commodore catalog compile ${CLUSTER_ID} --push -i
----

== Prepare Terraform environment

include::partial$exoscale/configure-terraform-secrets.adoc[]

include::partial$setup_terraform.adoc[]

== Remove Node

* Find the node you want to remove.
It has to be the one with the highest terraform index.
+
[source,bash]
----
# Grab JSON copy of current Terraform state
terraform state pull > .tfstate.json
node_count=$(jq -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="exoscale_compute") |
.instances | length' \
.tfstate.json)
# Verify that the number of nodes is one more than we configured earlier.
echo $node_count
export NODE_TO_REMOVE=$(jq --arg index "$node_count" -r \
'.resources[] |
select(.module=="module.cluster.module.worker" and .type=="exoscale_compute") |
.instances[$index|tonumber-1] |
.attributes.hostname' \
.tfstate.json)
echo $NODE_TO_REMOVE
----

=== Remove VM
== Remove VM

include::partial$delete-node.adoc[]
include::partial$delete-node-vm.adoc[]
@@ -151,7 +151,9 @@ include::partial$storage-ceph-remove-mon.adoc[]

=== Remove VM

include::partial$delete-node.adoc[]
include::partial$drain-node-immediately.adoc[]

include::partial$delete-node-vm.adoc[]

== Finish up

@@ -102,7 +102,9 @@ include::partial$storage-ceph-remove-mon.adoc[]

=== Clean up the old node

include::partial$delete-node.adoc[]
include::partial$drain-node-immediately.adoc[]

include::partial$delete-node-vm.adoc[]

== Finish up

@@ -1,34 +1,3 @@
. Drain the node(s)
+
[source,bash,subs="attributes+"]
----
for node in $(echo -n {node-delete-list}); do
kubectl --as=cluster-admin drain "${node}" \
--delete-emptydir-data --ignore-daemonsets
done
----
+
ifeval::["{cloud_provider}" == "cloudscale"]
ifeval::["{delete-node-type}" == "storage"]
[TIP]
====
On cloudscale.ch, we configure Rook Ceph to set up the OSDs in "portable" mode.
This configuration enables OSDs to be scheduled on any storage node.

With this configuration, we don't have to migrate OSDs hosted on the old node(s) manually.
Instead, draining a node will cause any OSDs hosted on that node to be rescheduled on other storage nodes.
====
endif::[]
endif::[]

. Delete the node(s) from the cluster
+
[source,bash,subs="attributes+"]
----
for node in $(echo -n {node-delete-list}); do
kubectl --as=cluster-admin delete node "${node}"
done
----

ifeval::["{delete-node-type}" == "storage"]
ifeval::["{delete-nodes-manually}" == "yes"]
31 changes: 31 additions & 0 deletions docs/modules/ROOT/partials/drain-node-immediately.adoc
@@ -0,0 +1,31 @@
. Drain the node(s)
+
[source,bash,subs="attributes+"]
----
for node in $(echo -n {node-delete-list}); do
kubectl --as=cluster-admin drain "${node}" \
--delete-emptydir-data --ignore-daemonsets
done
----
+
ifeval::["{cloud_provider}" == "cloudscale"]
ifeval::["{delete-node-type}" == "storage"]
[TIP]
====
On cloudscale.ch, we configure Rook Ceph to set up the OSDs in "portable" mode.
This configuration enables OSDs to be scheduled on any storage node.

With this configuration, we don't have to migrate OSDs hosted on the old node(s) manually.
Instead, draining a node will cause any OSDs hosted on that node to be rescheduled on other storage nodes.
====
endif::[]
endif::[]

. Delete the node(s) from the cluster
+
[source,bash,subs="attributes+"]
----
for node in $(echo -n {node-delete-list}); do
kubectl --as=cluster-admin delete node "${node}"
done
----
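
. Optionally, verify that the node(s) are gone.
This is a minimal check (a sketch reusing the node list and the cluster-admin impersonation from the previous steps).
+
[source,bash,subs="attributes+"]
----
for node in $(echo -n {node-delete-list}); do
# A deleted node should no longer be known to the API server.
kubectl --as=cluster-admin get node "${node}" && echo "WARNING: ${node} still present"
done
----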