diff --git a/.taskfiles/Talos/Taskfile.yaml b/.taskfiles/Talos/Taskfile.yaml
index 58925642d1e05..552096747189d 100644
--- a/.taskfiles/Talos/Taskfile.yaml
+++ b/.taskfiles/Talos/Taskfile.yaml
@@ -2,34 +2,32 @@
 # yaml-language-server: $schema=https://taskfile.dev/schema.json
 version: "3"
 
+x-env-vars: &env-vars
+  TALOS_VERSION:
+    sh: yq 'select(document_index == 1).spec.postBuild.substitute.TALOS_VERSION' {{.KUBERNETES_DIR}}/{{.cluster}}/apps/system-upgrade/system-upgrade-controller/ks.yaml
+  TALOS_SCHEMATIC_ID:
+    sh: yq 'select(document_index == 1).spec.postBuild.substitute.TALOS_SCHEMATIC_ID' {{.KUBERNETES_DIR}}/{{.cluster}}/apps/system-upgrade/system-upgrade-controller/ks.yaml
+  KUBERNETES_VERSION:
+    sh: yq 'select(document_index == 1).spec.postBuild.substitute.KUBERNETES_VERSION' {{.KUBERNETES_DIR}}/{{.cluster}}/apps/system-upgrade/system-upgrade-controller/ks.yaml
+
 vars:
   # Ref: https://github.com/onedr0p/home-service
   HOME_SERVICE_ADDR: voyager.internal
   HOME_SERVICE_USER: devin
   HOME_SERVICE_MATCHBOX_DIR: /var/opt/home-service/apps/matchbox/data/config
-  # renovate: datasource=docker depName=ghcr.io/siderolabs/installer
-  TALOS_VERSION: v1.7.3
-  TALOS_SCHEMATIC_ID: d715f723f882b1e1e8063f1b89f237dcc0e3bd000f9f970243af59c8baae0100
-  # renovate: datasource=docker depName=ghcr.io/siderolabs/kubelet
-  KUBERNETES_VERSION: v1.30.1
-  TALOS_SCRIPTS_DIR: "{{.ROOT_DIR}}/.taskfiles/Talos/scripts"
 
 tasks:
   bootstrap:
     desc: Bootstrap Talos
-    summary: |
-      Args:
-        cluster: Cluster to run command against (required)
     prompt: Bootstrap Talos on the '{{.cluster}}' cluster ... continue?
     cmds:
       - task: bootstrap-etcd
-        vars: &vars
-          cluster: "{{.cluster}}"
+        vars: { cluster: "{{.cluster}}" }
       - task: fetch-kubeconfig
-        vars: *vars
+        vars: { cluster: "{{.cluster}}" }
       - task: bootstrap-apps
-        vars: *vars
+        vars: { cluster: "{{.cluster}}" }
     requires:
       vars: ["cluster"]
 
@@ -74,14 +72,11 @@ tasks:
 
   apply-config:
     desc: Apply Talos configuration to a node
-    env:
-      TALOS_VERSION: "{{.TALOS_VERSION}}"
-      TALOS_SCHEMATIC_ID: "{{.TALOS_SCHEMATIC_ID}}"
-      KUBERNETES_VERSION: "{{.KUBERNETES_VERSION}}"
     cmd: |
       sops -d {{.KUBERNETES_DIR}}/{{.cluster}}/bootstrap/talos/assets/{{.hostname}}.secret.sops.yaml | \
         envsubst | \
           talosctl --context {{.cluster}} apply-config --mode={{.mode}} --nodes {{.node}} --file /dev/stdin
+    env: *env-vars
     vars:
       mode: '{{.mode | default "no-reboot"}}'
       hostname:
@@ -95,9 +90,12 @@
 
   upgrade:
     desc: Upgrade Talos on a node
-    cmd: bash {{.TALOS_SCRIPTS_DIR}}/upgrade.sh "{{.cluster}}" "{{.node}}" "{{.TALOS_SCHEMATIC_ID}}:{{.TALOS_VERSION}}" "{{.rollout}}"
-    vars:
-      rollout: '{{.rollout | default "false"}}'
+    cmds:
+      - until kubectl --context {{.cluster}} wait --timeout=5m --for=condition=Complete jobs --all --all-namespaces; do sleep 10; done
+      - talosctl --context {{.cluster}} --nodes {{.node}} upgrade --image="factory.talos.dev/installer/{{.TALOS_SCHEMATIC_ID}}:{{.TALOS_VERSION}}" --wait=true --timeout=10m --preserve=true
+      - talosctl --context {{.cluster}} --nodes {{.node}} health --wait-timeout=10m --server=false
+      - until kubectl --context {{.cluster}} wait --timeout=5m --for=jsonpath=.status.ceph.health=HEALTH_OK cephcluster --all --all-namespaces; do sleep 10; done
+    vars: *env-vars
     requires:
      vars: ["cluster", "node"]
     preconditions:
@@ -105,32 +103,6 @@
      - talosctl --context {{.cluster}} config info >/dev/null 2>&1
      - talosctl --context {{.cluster}} --nodes {{.node}} get machineconfig >/dev/null 2>&1
 
-  upgrade-rollout:
-    desc: Rollout Talos upgrade on all nodes
-    cmds:
-      - flux --context {{.cluster}} suspend kustomization --all
-      - kubectl cnpg --context {{.cluster}} maintenance set --reusePVC --all-namespaces
-      - for: { var: nodes, split: "," }
-        task: upgrade
-        vars:
-          cluster: "{{.cluster}}"
-          node: "{{.ITEM}}"
-          rollout: "true"
-      - kubectl cnpg --context {{.cluster}} maintenance unset --reusePVC --all-namespaces
-      - flux --context {{.cluster}} resume kustomization --all
-      - task: :kubernetes:delete-failed-pods
-        vars:
-          cluster: "{{.cluster}}"
-    vars:
-      nodes:
-        sh: talosctl --context {{.cluster}} config info --output json | jq --join-output '[.nodes[]] | join(",")'
-    requires:
-      vars: ["cluster"]
-    preconditions:
-      - test -f {{.KUBERNETES_DIR}}/{{.cluster}}/talosconfig
-      - talosctl --context {{.cluster}} config info >/dev/null 2>&1
-      - talosctl --context {{.cluster}} --nodes {{.nodes}} get machineconfig >/dev/null 2>&1
-
   upgrade-k8s:
     desc: Upgrade the clusters k8s version
     cmd: talosctl --context {{.cluster}} --nodes {{.controller}} upgrade-k8s --to {{.KUBERNETES_VERSION}}
@@ -171,20 +143,16 @@
 
   bootstrap-matchbox:
     desc: Bootstrap required Matchbox configuration to PXE Boot machine
-    dir: "{{.KUBERNETES_DIR}}/{{.cluster}}/bootstrap/talos"
     cmds:
       - for: ["kernel-amd64", "initramfs-amd64.xz"]
        cmd: |
          curl -skL https://factory.talos.dev/image/{{.TALOS_SCHEMATIC_ID}}/{{.TALOS_VERSION}}/{{.ITEM}} | \
            curl -skT - -u "{{.HOME_SERVICE_USER}}:" \
              sftp://{{.HOME_SERVICE_ADDR}}/{{.HOME_SERVICE_MATCHBOX_DIR}}/assets/{{.ITEM}}
-      - find ./assets -type f | xargs -I{} sh -c "sops -d {} | envsubst | curl -skT - -u "{{.HOME_SERVICE_USER}}:" sftp://{{.HOME_SERVICE_ADDR}}/{{.HOME_SERVICE_MATCHBOX_DIR}}/assets/\$(basename {} | sed 's/\.secret\.sops//')"
-      - find ./groups -type f | xargs -I{} curl -skT {} -u "{{.HOME_SERVICE_USER}}:" sftp://{{.HOME_SERVICE_ADDR}}/{{.HOME_SERVICE_MATCHBOX_DIR}}/groups/
-      - find ./profiles -type f | xargs -I{} curl -skT {} -u "{{.HOME_SERVICE_USER}}:" sftp://{{.HOME_SERVICE_ADDR}}/{{.HOME_SERVICE_MATCHBOX_DIR}}/profiles/
+      - find {{.KUBERNETES_DIR}}/{{.cluster}}/bootstrap/talos/assets -type f | xargs -I{} sh -c "sops -d {} | envsubst | curl -skT - -u "{{.HOME_SERVICE_USER}}:" sftp://{{.HOME_SERVICE_ADDR}}/{{.HOME_SERVICE_MATCHBOX_DIR}}/assets/\$(basename {} | sed 's/\.secret\.sops//')"
+      - find {{.KUBERNETES_DIR}}/{{.cluster}}/bootstrap/talos/groups -type f | xargs -I{} curl -skT {} -u "{{.HOME_SERVICE_USER}}:" sftp://{{.HOME_SERVICE_ADDR}}/{{.HOME_SERVICE_MATCHBOX_DIR}}/groups/
+      - find {{.KUBERNETES_DIR}}/{{.cluster}}/bootstrap/talos/profiles -type f | xargs -I{} curl -skT {} -u "{{.HOME_SERVICE_USER}}:" sftp://{{.HOME_SERVICE_ADDR}}/{{.HOME_SERVICE_MATCHBOX_DIR}}/profiles/
       - ssh -l {{.HOME_SERVICE_USER}} {{.HOME_SERVICE_ADDR}} "cd /var/opt/home-service ; go-task restart-matchbox"
-    env:
-      TALOS_VERSION: "{{.TALOS_VERSION}}"
-      TALOS_SCHEMATIC_ID: "{{.TALOS_SCHEMATIC_ID}}"
-      KUBERNETES_VERSION: "{{.KUBERNETES_VERSION}}"
+    vars: *env-vars
     requires:
       vars: ["cluster"]
diff --git a/.taskfiles/Talos/scripts/upgrade.sh b/.taskfiles/Talos/scripts/upgrade.sh
deleted file mode 100755
index c3cb9527566fb..0000000000000
--- a/.taskfiles/Talos/scripts/upgrade.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env bash
-
-CLUSTER="${1}"
-NODE="${2}"
-TALOS_STANZA="${3}"
-ROLLOUT="${4:-false}"
-
-FROM_VERSION=$(kubectl --context "${CLUSTER}" get node "${NODE}" --output jsonpath='{.metadata.labels.feature\.node\.kubernetes\.io/system-os_release\.VERSION_ID}')
-TO_VERSION=${TALOS_STANZA##*:}
-
-echo "Checking if Talos needs to be upgraded on node '${NODE}' in cluster '${CLUSTER}' ..."
-if [ "${FROM_VERSION}" == "${TO_VERSION}" ]; then
-    echo "Talos is already up to date on version '${FROM_VERSION}', skipping upgrade ..."
-    exit 0
-fi
-
-echo "Waiting for all jobs to complete before upgrading Talos on node '${NODE}' in cluster '${CLUSTER}' ..."
-until kubectl --context "${CLUSTER}" wait --timeout=5m \
-    --for=condition=Complete jobs --all --all-namespaces;
-do
-    echo "Waiting for all jobs to complete before upgrading Talos on node '${NODE}' in cluster '${CLUSTER}' ..."
-    sleep 10
-done
-
-if [ "${ROLLOUT}" != "true" ]; then
-    echo "Suspending Flux Kustomizations in cluster '${CLUSTER}' ..."
-    flux --context "${CLUSTER}" suspend kustomization --all
-    echo "Setting CNPG maintenance mode in cluster '${CLUSTER}' ..."
-    kubectl cnpg --context "${CLUSTER}" maintenance set --reusePVC --all-namespaces
-fi
-
-echo "Upgrading Talos on node '${NODE}' in cluster '${CLUSTER}' to ${TO_VERSION}..."
-talosctl --context "${CLUSTER}" --nodes "${NODE}" upgrade \
-    --image="factory.talos.dev/installer/${TALOS_STANZA}" \
-    --wait=true --timeout=10m --preserve=true
-
-echo "Waiting for Talos to be healthy on node '${NODE}' in cluster '${CLUSTER}' ..."
-talosctl --context "${CLUSTER}" --nodes "${NODE}" health \
-    --wait-timeout=10m --server=false
-
-echo "Waiting for Ceph health to be OK on node '${NODE}' in cluster '${CLUSTER}' ..."
-until kubectl --context "${CLUSTER}" wait --timeout=5m \
-    --for=jsonpath=.status.ceph.health=HEALTH_OK cephcluster \
-    --all --all-namespaces;
-do
-    echo "Waiting for Ceph health to be OK on node '${NODE}' in cluster '${CLUSTER}' ..."
-    sleep 10
-done
-
-if [ "${ROLLOUT}" != "true" ]; then
-    echo "Resuming Flux Kustomizations in cluster '${CLUSTER}' ..."
-    flux --context "${CLUSTER}" resume kustomization --all
-    echo "Unsetting CNPG maintenance mode in cluster '${CLUSTER}' ..."
-    kubectl cnpg --context "${CLUSTER}" maintenance unset --reusePVC --all-namespaces
-fi
diff --git a/.taskfiles/VolSync/Taskfile.yaml b/.taskfiles/VolSync/Taskfile.yaml
index ff025048efed9..4df3e05478a83 100644
--- a/.taskfiles/VolSync/Taskfile.yaml
+++ b/.taskfiles/VolSync/Taskfile.yaml
@@ -8,7 +8,7 @@ version: "3"
 # 3. Applications are deployed as either a Kubernetes Deployment or StatefulSet
 # 4. Each application only has one PVC that is being replicated
 
-x-env: &env
+x-env-vars: &env-vars
   app: "{{.app}}"
   claim: "{{.claim}}"
   controller: "{{.controller}}"
@@ -34,7 +34,7 @@
       - flux --context {{.cluster}} {{.state}} kustomization volsync
       - flux --context {{.cluster}} -n {{.ns}} {{.state}} helmrelease volsync
      - kubectl --context {{.cluster}} -n {{.ns}} scale deployment volsync --replicas {{if eq "suspend" .state}}0{{else}}1{{end}}
-    env: *env
+    env: *env-vars
     vars:
       ns: '{{.ns | default "volsync-system"}}'
       state: '{{index .MATCH 0}}'
@@ -54,7 +54,7 @@
       - kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=1m
       - kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container main
       - kubectl --context {{.cluster}} -n {{.ns}} delete job {{.job}}
-    env: *env
+    env: *env-vars
     requires:
       vars: ["cluster", "app"]
     vars:
@@ -79,7 +79,7 @@
       - kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container minio
       - kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container r2
       - kubectl --context {{.cluster}} -n {{.ns}} delete job {{.job}}
-    env: *env
+    env: *env-vars
     requires:
       vars: ["cluster", "app"]
     vars:
@@ -103,7 +103,7 @@
       - kubectl --context {{.cluster}} -n {{.ns}} patch replicationsources {{.app}} --type merge -p '{"spec":{"trigger":{"manual":"{{.now}}"}}}'
       - bash {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh {{.job}} {{.ns}} {{.cluster}}
       - kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=120m
-    env: *env
+    env: *env-vars
     requires:
       vars: ["cluster", "app"]
     vars:
@@ -128,11 +128,11 @@
         app: Application to restore (required)
         previous: Previous number of snapshots to restore (default: 2)
     cmds:
-      - { task: .suspend, vars: *env }
-      - { task: .wipe, vars: *env }
-      - { task: .restore, vars: *env }
-      - { task: .resume, vars: *env }
-    env: *env
+      - { task: .suspend, vars: *env-vars }
+      - { task: .wipe, vars: *env-vars }
+      - { task: .restore, vars: *env-vars }
+      - { task: .resume, vars: *env-vars }
+    env: *env-vars
     requires:
       vars: ["cluster", "app"]
     vars:
@@ -170,7 +170,7 @@
        cmd: |
          {{- $items := (split "/" .ITEM) }}
          kubectl --context {{.cluster}} delete volumesnapshot -n {{ $items._0 }} {{ $items._1 }}
-    env: *env
+    env: *env-vars
     requires:
       vars: ["cluster"]
     vars:
@@ -189,7 +189,7 @@
       - flux --context {{.cluster}} -n {{.ns}} suspend helmrelease {{.app}}
       - kubectl --context {{.cluster}} -n {{.ns}} scale {{.controller}} --replicas 0
       - kubectl --context {{.cluster}} -n {{.ns}} wait pod --for delete --selector="app.kubernetes.io/name={{.app}}" --timeout=2m
-    env: *env
+    env: *env-vars
 
   # Wipe the PVC of all data
   .wipe:
@@ -200,7 +200,7 @@
       - kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=120m
       - kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container main
       - kubectl --context {{.cluster}} -n {{.ns}} delete job {{.job}}
-    env: *env
+    env: *env-vars
     vars:
       job: volsync-wipe-{{.app}}
 
@@ -212,7 +212,7 @@
       - bash {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh {{.job}} {{.ns}} {{.cluster}}
       - kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=120m
       - kubectl --context {{.cluster}} -n {{.ns}} delete replicationdestination {{.job}}
-    env: *env
+    env: *env-vars
     vars:
       job: volsync-dst-{{.app}}
 
@@ -222,4 +222,4 @@
     cmds:
       - flux --context {{.cluster}} -n {{.ns}} resume helmrelease {{.app}}
       - flux --context {{.cluster}} -n flux-system resume kustomization {{.app}}
-    env: *env
+    env: *env-vars
diff --git a/kubernetes/main/apps/system-upgrade/system-upgrade-controller/ks.yaml b/kubernetes/main/apps/system-upgrade/system-upgrade-controller/ks.yaml
index 2055ae7827517..a67a1729e8546 100644
--- a/kubernetes/main/apps/system-upgrade/system-upgrade-controller/ks.yaml
+++ b/kubernetes/main/apps/system-upgrade/system-upgrade-controller/ks.yaml
@@ -46,6 +46,7 @@ spec:
   timeout: 5m
   postBuild:
     substitute:
+      TALOS_SCHEMATIC_ID: d715f723f882b1e1e8063f1b89f237dcc0e3bd000f9f970243af59c8baae0100
       # renovate: datasource=docker depName=ghcr.io/siderolabs/installer
       TALOS_VERSION: v1.7.3
       # renovate: datasource=docker depName=ghcr.io/siderolabs/kubelet
diff --git a/kubernetes/main/apps/system-upgrade/system-upgrade-controller/plans/talos.yaml b/kubernetes/main/apps/system-upgrade/system-upgrade-controller/plans/talos.yaml
index 7d9274a4bde36..44bd1649b127b 100644
--- a/kubernetes/main/apps/system-upgrade/system-upgrade-controller/plans/talos.yaml
+++ b/kubernetes/main/apps/system-upgrade/system-upgrade-controller/plans/talos.yaml
@@ -43,6 +43,6 @@ spec:
       args:
         - --nodes=$(NODE_IP)
         - upgrade
-        - --image=factory.talos.dev/installer/d715f723f882b1e1e8063f1b89f237dcc0e3bd000f9f970243af59c8baae0100:$(SYSTEM_UPGRADE_PLAN_LATEST_VERSION)
+        - --image=factory.talos.dev/installer/${TALOS_SCHEMATIC_ID}:$(SYSTEM_UPGRADE_PLAN_LATEST_VERSION)
        - --preserve=true
        - --wait=false
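
Note: with this change the Talos/Kubernetes versions and the schematic ID are no longer pinned in the Taskfile; the tasks resolve them at runtime from the system-upgrade-controller ks.yaml via yq. A quick sanity check of what the tasks will pick up (a sketch, not part of the change itself; it assumes yq v4, running from the repo root, and the "main" cluster used by the paths in this diff):

    # expect v1.7.3, the schematic ID above, and v1.30.1 respectively
    yq 'select(document_index == 1).spec.postBuild.substitute.TALOS_VERSION' kubernetes/main/apps/system-upgrade/system-upgrade-controller/ks.yaml
    yq 'select(document_index == 1).spec.postBuild.substitute.TALOS_SCHEMATIC_ID' kubernetes/main/apps/system-upgrade/system-upgrade-controller/ks.yaml
    yq 'select(document_index == 1).spec.postBuild.substitute.KUBERNETES_VERSION' kubernetes/main/apps/system-upgrade/system-upgrade-controller/ks.yaml

With upgrade-rollout and upgrade.sh removed, a single-node upgrade would be driven directly by the reworked task, e.g. (hypothetical node address; the "talos" namespace assumes the usual include of .taskfiles/Talos):

    task talos:upgrade cluster=main node=10.0.0.10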