From fb7898cde0dadf36277ce44c3d336c01f05a0fbc Mon Sep 17 00:00:00 2001 From: Simon Gerber Date: Mon, 16 Oct 2023 17:09:09 +0200 Subject: [PATCH] Add how-to for migrating to Cilium Some steps inspired by https://hackmd.io/UKV0Yl-RQvGl7A5ITF1Tug#Prepare-to-change-the-default-CNI-to-Cilium --- .../how-tos/network/migrate-to-cilium.adoc | 218 ++++++++++++++++++ ...e-alertmanager-silence-all-projectsyn.adoc | 21 ++ docs/modules/ROOT/partials/nav.adoc | 3 +- ...e-alertmanager-silence-all-projectsyn.adoc | 18 ++ 4 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 docs/modules/ROOT/pages/how-tos/network/migrate-to-cilium.adoc create mode 100644 docs/modules/ROOT/partials/create-alertmanager-silence-all-projectsyn.adoc create mode 100644 docs/modules/ROOT/partials/remove-alertmanager-silence-all-projectsyn.adoc diff --git a/docs/modules/ROOT/pages/how-tos/network/migrate-to-cilium.adoc b/docs/modules/ROOT/pages/how-tos/network/migrate-to-cilium.adoc new file mode 100644 index 00000000..dfd8851c --- /dev/null +++ b/docs/modules/ROOT/pages/how-tos/network/migrate-to-cilium.adoc @@ -0,0 +1,218 @@ += Migrate to Cilium CNI + +== Prerequisites + +* `cluster-admin` privileges +* `kubectl` +* `jq` and `yq` (mikefarah/yq v4) +* Working `commodore` + +// TODO: kube-proxy replacement? + +== Prepare for migration + +include::partial$create-alertmanager-silence-all-projectsyn.adoc[] + +. Select cluster ++ +[source,bash] +---- +export CLUSTER_ID=c-cluster-id-1234 <1> +export COMMODORE_API_URL=https://api.syn.vshn.net <2> +export TENANT_ID=$(curl -sH "Authorization: Bearer $(commodore fetch-token)" \ + "${COMMODORE_API_URL}/clusters/${CLUSTER_ID}" | jq -r '.tenant') +export KUBECONFIG=/path/to/cluster/kubeconfig <3> +---- +<1> Replace with the Project Syn cluster ID of the cluster to migrate +<2> Replace with the Lieutenant API on which the cluster is registered +<3> Ensure that `kubectl` commands are executed against the cluster you're migrating. + +. 
Disable ArgoCD auto sync for component `openshift4-nodes` ++ +:argo_app: openshift4-nodes ++ +include::partial$disable-argocd-autosync.adoc[] + +. Disable the cluster-network-operator. +This is necessary to ensure that we can migrate to Cilium without the cluster-network-operator trying to interfere. ++ +TODO: Figure out if we need to scale down the upgrade operator ++ +[source,bash] +---- +kubectl --as=cluster-admin patch clusterversion version \ + --type=merge \ + -p ' + {"spec":{"overrides":[ + { + "kind": "Deployment", + "group": "apps", + "name": "network-operator", + "namespace": "openshift-network-operator", + "unmanaged": true + } + ]}}' +---- ++ +[source,bash] +---- +kubectl --as=cluster-admin -n openshift-network-operator \ + scale deploy network-operator --replicas=0 +---- + +. Remove network operator state ++ +[source,bash] +---- +kubectl --as=cluster-admin -n openshift-network-operator \ + delete configmap applied-cluster +---- + +. Pause all machine config pools ++ +[source,bash] +---- +for mcp in $(kubectl get mcp -o name); do +kubectl --as=cluster-admin patch $mcp --type=merge -p '{"spec": {"paused": true}}' +done +---- + +== Migrate to Cilium + +. Get local cluster working directory ++ +[source,bash] +---- +commodore catalog compile "$CLUSTER_ID" <1> +---- +<1> We recommend switching to an empty directory to run this command. +Alternatively, switch to your existing directory for the cluster. + +. Enable component `cilium` ++ +[source,bash] +---- +pushd inventory/classes/"${TENANT_ID}" +yq -i '.applications += ["cilium"]' "${CLUSTER_ID}.yml" +---- + +. Update `upstreamRules` for monitoring ++ +[source,bash] +---- +yq -i ".parameters.openshift4_monitoring.upstreamRules.networkPlugin = \"Cilium\"" \ + "${CLUSTER_ID}.yml" +---- + +. 
Update component `networkpolicy` config ++ +[source,bash] +---- +yq eval -i '.parameters.networkpolicy.networkPlugin = "cilium"' \ + "${CLUSTER_ID}.yml" +yq eval -i '.parameters.networkpolicy.ignoredNamespaces = ["openshift-oauth-apiserver"]' \ + "${CLUSTER_ID}.yml" +---- + +. Configure component `cilium` ++ +.Configure required parameters for strict kube-proxy replacement +[source,bash] +---- +yq -i '.parameters.cilium.cilium_helm_values.kubeProxyReplacement = "strict"' \ + "${CLUSTER_ID}.yml" +yq -i '.parameters.cilium.cilium_helm_values.k8sServiceHost = "api-int.${openshift:baseDomain}"' \ + "${CLUSTER_ID}.yml" +yq -i '.parameters.cilium.cilium_helm_values.k8sServicePort = "6443"' \ + "${CLUSTER_ID}.yml" +---- ++ +[source,bash] +---- +POD_CIDR=$(kubectl get network.config cluster \ + -o jsonpath='{.spec.clusterNetwork[0].cidr}') +HOST_PREFIX=$(kubectl get network.config cluster \ + -o jsonpath='{.spec.clusterNetwork[0].hostPrefix}') + +if [ $HOST_PREFIX != "23" ]; then +yq -i '.parameters.cilium.cilium_helm_values.ipam.operator.clusterPoolIPv4MaskSize = "'"${HOST_PREFIX}"'"' \ + "${CLUSTER_ID}.yml" +fi +if [ $POD_CIDR != "10.128.0.0/14" ]; then +yq -i '.parameters.cilium.cilium_helm_values.ipam.operator.clusterPoolIPv4PodCIDR = "'"${POD_CIDR}"'"' \ + "${CLUSTER_ID}.yml" +fi +---- + +. Commit changes ++ +[source,bash] +---- +git commit -am "Migrate ${CLUSTER_ID} to Cilium" +git push origin master +popd +---- + +. Compile catalog ++ +[source,bash] +---- +commodore catalog compile "${CLUSTER_ID}" +---- + +. Patch cluster network config ++ +TODO: Should we manage this through component openshift4-networking somehow? If so, just as patches? or try to manage the objs? How to ensure we don't explode existing clusters if we manage the objs? IMPORTANT: If we manage it, this needs to be moved above `Compile catalog`. 
++ +[source,bash] +---- +kubectl --as=cluster-admin patch network.config cluster \ + --type=merge -p '{"spec":{"networkType":"Cilium"},"status":null}' +kubectl --as=cluster-admin patch network.operator cluster \ + --type=merge -p '{"spec":{"defaultNetwork":{"type":"Cilium"},"deployKubeProxy":false},"status":null}' +---- + +. TODO: Scale down/delete existing CNI? + +. Apply Cilium manifests ++ +[source,bash] +---- +kubectl apply -Rf catalog/manifests/cilium/ +---- + +. Wait until Cilium CNI is up and running ++ +[source,bash] +---- +kubectl -n cilium get pods -w +---- + +== Finalize migration + +. Re-enable cluster network operator ++ +[source,bash] +---- +kubectl --as=cluster-admin -n openshift-network-operator \ + scale deployment network-operator --replicas=1 +kubectl --as=cluster-admin patch clusterversion version \ + --type=merge -p '{"spec":{"overrides":null}}' +---- + +. Unpause MCPs ++ +[source,bash] +---- +for mcp in $(kubectl get mcp -o name); do +kubectl --as=cluster-admin patch $mcp --type=merge -p '{"spec":{"paused":false}}' +done +---- + +include::partial$enable-argocd-autosync.adoc[] + +== Cleanup alert silence + +:argo_app: + +include::partial$remove-alertmanager-silence-all-projectsyn.adoc[] diff --git a/docs/modules/ROOT/partials/create-alertmanager-silence-all-projectsyn.adoc b/docs/modules/ROOT/partials/create-alertmanager-silence-all-projectsyn.adoc new file mode 100644 index 00000000..10c1008b --- /dev/null +++ b/docs/modules/ROOT/partials/create-alertmanager-silence-all-projectsyn.adoc @@ -0,0 +1,21 @@ +// NOTE: this snippet only works correctly at the beginning of a numbered +// list. I was unable to figure out how to define the page attributes in a way +// that works for the alertmanager-silence-job.adoc partial without breaking +// the list flow. +:silence-target: all +:duration: +60 minutes +:http-method: POST +:alertmanager-endpoint: /api/v2/silences + +. 
Silence all Project Syn alerts ++ +include::partial$alertmanager-silence-job.adoc[] + +. Extract Alertmanager silence ID from job logs ++ +[source,bash] +---- +silence_id=$(kubectl --as=cluster-admin -n openshift-monitoring logs jobs/${job_name} | \ + jq -r '.silenceID') +---- + diff --git a/docs/modules/ROOT/partials/nav.adoc b/docs/modules/ROOT/partials/nav.adoc index e6f8749a..51a5fc20 100644 --- a/docs/modules/ROOT/partials/nav.adoc +++ b/docs/modules/ROOT/partials/nav.adoc @@ -100,7 +100,8 @@ ** xref:oc4:ROOT:how-tos/authentication/disable-self-provisioning.adoc[Disable project self-provisioning] ** xref:oc4:ROOT:explanations/sudo.adoc[] -// Networking +* Networking +** xref:oc4:ROOT:how-tos/network/migrate-to-cilium.adoc[] * Ingress ** xref:oc4:ROOT:how-tos/ingress/self-signed-ingress-cert.adoc[] diff --git a/docs/modules/ROOT/partials/remove-alertmanager-silence-all-projectsyn.adoc b/docs/modules/ROOT/partials/remove-alertmanager-silence-all-projectsyn.adoc new file mode 100644 index 00000000..910e9095 --- /dev/null +++ b/docs/modules/ROOT/partials/remove-alertmanager-silence-all-projectsyn.adoc @@ -0,0 +1,18 @@ +// NOTE: this snippet only works correctly at the beginning of a numbered +// list. I was unable to figure out how to define the page attributes in a way +// that works for the alertmanager-silence-job.adoc partial without breaking +// the list flow. +:alertmanager-endpoint: /api/v2/silence/${silence_id} +:silence-target: all +:http-method: DELETE + +. Remove silence in Alertmanager ++ +include::partial$alertmanager-silence-job.adoc[] + +. Clean up Alertmanager silence jobs ++ +[source,bash,subs="attributes+"] +---- +kubectl --as=cluster-admin -n openshift-monitoring delete jobs -l app=silence-{silence-target}-alerts +----