From ad39602cfe634f61f5579daab389681d1226f194 Mon Sep 17 00:00:00 2001 From: Tope Emmanuel Date: Tue, 26 Sep 2023 21:11:11 +0100 Subject: [PATCH] Added paas to aks backup and restore WHY: It is essential that the db is restored to aks HOW: By adding the backup and restore function to workflow --- ...as_to_aks_db_backup_and_restore_manual.yml | 172 +++++++++ Makefile | 4 + bin/konduit.sh | 327 ++++++++++++++++++ terraform/aks/config/development.tfvars.json | 1 + terraform/aks/config/production.tfvars.json | 1 + terraform/aks/config/review.tfvars.json | 1 + terraform/aks/config/staging.tfvars.json | 1 + terraform/aks/variables.tf | 3 + 8 files changed, 510 insertions(+) create mode 100644 .github/workflows/paas_to_aks_db_backup_and_restore_manual.yml create mode 100755 bin/konduit.sh diff --git a/.github/workflows/paas_to_aks_db_backup_and_restore_manual.yml b/.github/workflows/paas_to_aks_db_backup_and_restore_manual.yml new file mode 100644 index 0000000000..7bdcd521f4 --- /dev/null +++ b/.github/workflows/paas_to_aks_db_backup_and_restore_manual.yml @@ -0,0 +1,172 @@ +name: Backup and restore Postgres DB from PAAS to AKS + +on: + workflow_dispatch: + inputs: + environment: + description: Environment + type: choice + options: + - development + - staging + - production + +env: + BACKUP_ARTIFACT_NAME: ${{ inputs.environment }}-backup + +jobs: + backup: + name: Backup from PAAS + runs-on: ubuntu-latest + environment: ${{ inputs.environment }}_aks + + steps: + - uses: actions/checkout@v4 + + - uses: Azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + + - uses: DFE-Digital/github-actions/install-postgres-client@master + + - name: Set AKS environment name + id: set_aks_env_name + run: | + case "${{ inputs.environment }}" in + development) + echo "ENVIRONMENT_ABR=dev" >> $GITHUB_ENV + echo "TFVARS_NAME=dev" >> $GITHUB_ENV + echo "KEY_VAULT_NAME=s189t01-gse-dv-inf-kv" >> $GITHUB_OUTPUT + ;; + staging) + echo "ENVIRONMENT_ABR=staging" >> $GITHUB_ENV + echo 
"TFVARS_NAME=staging" >> $GITHUB_ENV + echo "KEY_VAULT_NAME=s189t01-gse-stg-inf-kv" >> $GITHUB_OUTPUT + ;; + production) + echo "ENVIRONMENT_ABR=prod" >> $GITHUB_ENV + echo "TFVARS_NAME=production" >> $GITHUB_ENV + echo "KEY_VAULT_NAME=s189p01-gse-pd-inf-kv" >> $GITHUB_OUTPUT + ;; + *) + echo "unknown cluster" + ;; + esac + + + - name: Set environment variables + shell: bash + run: | + tf_vars_file=terraform/aks/config/${{ inputs.environment }}.tfvars.json + echo "KEY_VAULT_NAME=$(jq -r '.infra_key_vault_name' ${tf_vars_file})" >> $GITHUB_ENV + echo "PAAS_SPACE=$(jq -r '.paas_space' ${tf_vars_file})" >> $GITHUB_ENV + + - name: Retrieve Cloudfoundry credentials from KV + uses: azure/CLI@v1 + id: fetch-cf-creds + with: + inlineScript: | + SECRET_VALUE=$(az keyvault secret show --name "PAAS-USERNAME" --vault-name "${{ env.KEY_VAULT_NAME}}" --query "value" -o tsv) + echo "::add-mask::$SECRET_VALUE" + echo "PAAS-USER=$SECRET_VALUE" >> $GITHUB_OUTPUT + + SECRET_VALUE=$(az keyvault secret show --name "PAAS-PASSWORD" --vault-name "${{ env.KEY_VAULT_NAME}}" --query "value" -o tsv) + echo "::add-mask::$SECRET_VALUE" + echo "PAAS-PASSWORD=$SECRET_VALUE" >> $GITHUB_OUTPUT + + - uses: DFE-Digital/github-actions/setup-cf-cli@master + with: + CF_USERNAME: ${{ steps.fetch-cf-creds.outputs.PAAS-USER }} + CF_PASSWORD: ${{ steps.fetch-cf-creds.outputs.PAAS-PASSWORD }} + CF_SPACE_NAME: ${{ env.PAAS_SPACE }} + INSTALL_CONDUIT: true + + - name: Backup database + run: | + cf conduit school-experience-${{ env.ENVIRONMENT_ABR }}-pg-common-svc -- pg_dump -E utf8 --clean --compress=1 --if-exists --no-owner --no-privileges --verbose -f backup.sql.gz + + - name: Upload backup + uses: actions/upload-artifact@v3 + with: + name: ${{ env.BACKUP_ARTIFACT_NAME }} + path: backup.sql.gz + retention-days: 1 + + + + restore: + name: Restore to AKS + runs-on: ubuntu-latest + needs: backup + + environment: ${{ inputs.environment }}_aks + env: + KEY_VAULT_NAME: ${{ needs.backup.outputs.KEY_VAULT_NAME }} + 
+ steps: + - uses: actions/checkout@v4 + + - uses: Azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + + - name: Set environment variables + shell: bash + run: | + tf_vars_file=terraform/aks/config/${{ inputs.environment }}.tfvars.json + + - run: | + test_cluster_rg=s189t01-tsc-ts-rg + test_cluster_name=s189t01-tsc-test-aks + + prod_cluster_rg=s189p01-tsc-pd-rg + prod_cluster_name=s189p01-tsc-production-aks + + case "${{ inputs.environment }}" in + development) + echo "in development with rg = $test_cluster_rg clustername = $test_cluster_name" >> $GITHUB_ENV + echo "cluster_rg=$test_cluster_rg" >> $GITHUB_ENV + echo "cluster_name=$test_cluster_name" >> $GITHUB_ENV + echo "app_name=get-school-experience-development" >> $GITHUB_ENV + echo "key_vault_name=s189t01-gse-dv-inf-kv" >> $GITHUB_ENV + ;; + staging) + echo "cluster_rg=$test_cluster_rg" >> $GITHUB_ENV + echo "cluster_name=$test_cluster_name" >> $GITHUB_ENV + echo "app_name=get-school-experience-staging" >> $GITHUB_ENV + echo "key_vault_name=s189t01-gse-stg-inf-kv" >> $GITHUB_ENV + ;; + production) + echo "cluster_rg=$prod_cluster_rg" >> $GITHUB_ENV + echo "cluster_name=$prod_cluster_name" >> $GITHUB_ENV + echo "app_name=get-school-experience-production" >> $GITHUB_ENV + echo "key_vault_name=s189p01-gse-pd-inf-kv" >> $GITHUB_ENV + ;; + *) + echo "unknown cluster" + ;; + esac + + - uses: azure/setup-kubectl@v3 + + - run: | + az aks get-credentials -g ${{ env.cluster_rg }} -n ${{ env.cluster_name }} + make bin/konduit.sh + + - name: Download backup + uses: actions/download-artifact@v3 + with: + name: ${{ env.BACKUP_ARTIFACT_NAME }} + + - name: Restore database + run: bin/konduit.sh -i backup.sql.gz -c -k ${{ env.key_vault_name }} -d gse-${{ inputs.environment }} get-school-experience-${{ inputs.environment }} -- psql + + + - name: Remove PaaS event triggers + shell: bash + run: | + bin/konduit.sh -k ${{ env.key_vault_name }} -d gse-${{ inputs.environment }} get-school-experience-${{ 
inputs.environment }} -- psql -c 'drop event trigger forbid_ddl_reader; drop event trigger make_readable; drop event trigger reassign_owned;' + + - uses: geekyeggo/delete-artifact@v2 + with: + name: ${{ env.BACKUP_ARTIFACT_NAME }} diff --git a/Makefile b/Makefile index 395c2de8e5..92bb7fe4a5 100644 --- a/Makefile +++ b/Makefile @@ -160,6 +160,10 @@ terraform-apply: terraform-init terraform-apply-aks: terraform-init-aks terraform -chdir=terraform/aks apply -var-file "config/${CONFIG}.tfvars.json" ${AUTO_APPROVE} +bin/konduit.sh: + curl -s https://raw.githubusercontent.com/DFE-Digital/teacher-services-cloud/main/scripts/konduit.sh -o bin/konduit.sh \ + && chmod +x bin/konduit.sh + terraform-destroy: terraform-init terraform -chdir=terraform/paas destroy -var-file=${DEPLOY_ENV}.env.tfvars ${AUTO_APPROVE} diff --git a/bin/konduit.sh b/bin/konduit.sh new file mode 100755 index 0000000000..15743f9a9a --- /dev/null +++ b/bin/konduit.sh @@ -0,0 +1,327 @@ +#!/bin/bash +# Connect to a backend service via an app instance +# + +# TODO +# +# - usually more than one redis service. Set default and how to override? +# - test against azure redis service when available +# - confirm before running if interactive, a flag to run without confirmation? +# + +help() { + echo + echo "Script to connect to a k8 backing service via an app service" + echo + + echo "Syntax:" + echo " konduit [-a|-c|-h|-i file-name|-p postgres-var|-r redis-var|-t timeout] app-name -- command [args]" + echo " Connect to the default database for app-name" + echo + echo "or konduit [-a|-c|-h|-i file-name|-r redis-var|-t timeout] -d db-name -k key-vault app-name -- command [args]" + echo " Connect to a specific database from app-name" + echo " Requires a secret containing the DB URL in the specified Azure KV," + echo " with name {db-name}-database-url" + echo + echo "options:" + echo " -a Backend is an AKS service. Default is Azure backing service." + echo " -c Input file is compressed. Requires -i." 
+ echo " -d db-name Database name, required if connecting to a db other than the app default." + echo " -i file-name Input file for a restore. Only valid for command psql." + echo " -k key-vault Key vault that holds the Azure secret containing the DB URL." + echo " The secret {db-name}-database-url must exist in this vault," + echo " and contain a full connection URL. The URL is in the format:" + echo " postgres://ADMIN_USER:URLENCODED(ADMIN_PASSWORD)@POSTGRES_SERVER_NAME-psql.postgres.database.azure.com:5432/DB_NAME." + echo " The ADMIN_PASSWORD can be url encoded using terraform console " + echo " using CMD: urlencode(ADMIN_PASSWORD)" + echo " -p postgres-var Variable for postgres [defaults to DATABASE_URL if not set]" + echo " Only valid for commands psql, pg_dump or pg_restore" + echo " -r redis-var Variable for redis cache [defaults to REDIS_URL if not set]" + echo " Only valid for command redis-cli" + echo " -t timeout Timeout in seconds. Default is 28800 but 3600 for psql, pg_dump or pg_restore commands." + echo " -h Print this help." + echo + echo "parameters:" + echo " app-name app name to connect to." + echo " command command to run." + echo " valid commands are psql, pg_dump, pg_restore or redis-cli" + echo " args args for the command" +} + +init_setup() { + if [ "${RUNCMD}" != "psql" ] && [ "${RUNCMD}" != "pg_dump" ] && [ "${RUNCMD}" != "pg_restore" ] && [ "${RUNCMD}" != "redis-cli" ]; then + echo + echo "Error: invalid command ${RUNCMD}" + echo "Only valid options are psql, pg_dump, pg_restore or redis-cli" + help + exit 1 + fi + + if [ "${Timeout}" = "" ]; then + # Default timeout for psql/pg_dump/pg_restore set to 8 hours. Increase if required. + # This is to allow for long running queries or backups. + # The timeout is reset for each command run. + # The timeout can be overridden with the -t option. 
+ TMOUT=28800 # 8 hour timeout default for nc tunnel + if [ "${RUNCMD}" = "psql" ] && [ "${Inputfile}" != "" ]; then + # Default timeout for restore set to 1 hour. Increase if required. + TMOUT=3600 + elif [ "${RUNCMD}" = "pg_dump" ] || [ "${RUNCMD}" = "pg_restore" ]; then + # Default timeout for backup set to 1 hour. Increase if required. + TMOUT=3600 + fi + else + TMOUT="${Timeout}" + fi + + # If an input file is given, check it exists and is readable + if [ "${Inputfile}" != "" ] && [ ! -r "${Inputfile}" ]; then + echo "Error: invalid input file" + exit 1 + fi + + # Settings dependant on AKS or Azure backing service + if [ "${AKS}" = "" ]; then + # redis backing service requires TLS set for redis-cli + TLS="--tls" + REDIS_PORT=6380 + else + # redis aks service does not use TLS + TLS="" + REDIS_PORT=6379 + fi + + # Set default Redis var if not set + if [ "${Redis}" = "" ]; then + Redis="REDIS_URL" + fi + + # Set default Postgres var if not set + if [ "${Postgres}" = "" ]; then + Postgres="DATABASE_URL" + fi + + # Get the deployment namespace + NAMESPACE=$(kubectl get deployments -A | grep "${INSTANCE} " | awk '{print $1}') + + # Set service ports + DB_PORT=5432 +} + +check_instance() { + if [ "$INSTANCE" = "" ]; then + echo "Error: Must provide instance name as parameter e.g. apply-qa, apply-review-1234" + exit 1 + fi + # make sure it's LC + INSTANCE=$(echo "${INSTANCE}" | tr '[:upper:]' '[:lower:]') + # Lets check the container exists and we can connect to it first + if ! 
kubectl -n "${NAMESPACE}" exec -i deployment/"${INSTANCE}" -- echo; then + echo "Error: Container does not exist or connection cannot be established" + exit 1 + fi +} + +set_ports() { + # Get a random DEST port for the k8 container + # so there is minimal conflict between users + DEST_PORT=0 + until [ $DEST_PORT -gt 1024 ]; do + DEST_PORT=$RANDOM + done + + # Get a random LOCAL port + # so we can have more than one session if wanted + LOCAL_PORT=0 + until [ $LOCAL_PORT -gt 1024 ]; do + LOCAL_PORT=$RANDOM + nc -z 127.0.0.1 $LOCAL_PORT 2>/dev/null && LOCAL_PORT=0 # try again if it's in use + done +} + +set_db_psql() { + PORT=${DB_PORT} + # Get DB settings + # Either from the app DATABASE_URL or the AZURE KV secret + # + # DATABASE_URL format (K8_URL/KV_URL) + # for backing service + # postgres://ADMIN_USER:ADMIN_PASSWORD@s999t01-someapp-rv-review-99999-psql.postgres.database.azure.com:5432/someapp-postgres-review-99999 + # for k8 pod + # postgres://ADMIN_USER:ADMIN_PASSWORD@someapp-postgres-review-99999:5432/someapp-postgres-review-99999 + # + if [ "${DBName}" = "" ]; then + # If an input file is given, check it exists and is readable + K8_URL=$(echo "echo \$${Postgres}" | kubectl -n "${NAMESPACE}" exec -i deployment/"${INSTANCE}" -- sh) + DB_URL=$(echo "${K8_URL}" | sed "s/@[^~]*\//@127.0.0.1:${LOCAL_PORT}\//g") + DB_NAME=$(echo "${K8_URL}" | awk -F"@" '{print $2}' | awk -F":" '{print $1}') + else + KV_URL=$(az keyvault secret show --name "${DBName}"-database-url --vault-name "${KV}" | jq -r .value) + DB_URL=$(echo "${KV_URL}" | sed "s/@[^~]*\//@127.0.0.1:${LOCAL_PORT}\//g") + DB_NAME=$(echo "${KV_URL}" | awk -F"@" '{print $2}' | awk -F":" '{print $1}') + fi + + if [ "${KV_URL}" = "" ] && [ "${K8_URL}" = "" ] || [ "${DB_URL}" = "" ] || [ "${DB_NAME}" = "" ]; then + echo "Error: invalid DB settings" + exit 1 + fi +} + +set_db_redis() { + PORT=${REDIS_PORT} + # Get DB settings + # Either from the app REDIS_URL or the AZURE KV secret + # + # REDIS_URL (queue) or 
REDIS_CACHE_URL (cache) format (K8_URL/KV_URL) + # for backing service + # rediss://:somepassword=@s9999t99-att-env-redis-service.redis.cache.windows.net:6380/0 + # + # for k8 pod + # redis://someapp-redis-review-99999:6379/0 + # + # Not tested from an azure backing service + + if [ "${DBName}" = "" ]; then + K8_URL=$(echo "echo \$${Redis}" | kubectl -n "${NAMESPACE}" exec -i deployment/"${INSTANCE}" -- sh) + if [ "${AKS}" = "" ]; then + DB_URL=$(echo "${K8_URL}" | sed "s/@[^~]*\//@127.0.0.1:${LOCAL_PORT}\//g" | sed "s/rediss:\/\//rediss:\/\/default/g") + DB_NAME=$(echo "${K8_URL}" | awk -F"@" '{print $2}' | awk -F":" '{print $1}') + else + DB_URL=$(echo "$K8_URL" | sed "s/\/\/[^~]*/\/\/127.0.0.1:${LOCAL_PORT}\//g") + DB_NAME=$(echo "$K8_URL" | awk -F"/" '{print $3}' | awk -F":" '{print $1}') + fi + else + KV_URL=$(az keyvault secret show --name "${DBName}"-database-url --vault-name "${KV}" | jq -r .value) + if [ "${AKS}" = "" ]; then + DB_URL=$(echo "${KV_URL}" | sed "s/@[^~]*\//@127.0.0.1:${LOCAL_PORT}\//g" | sed "s/rediss:\/\//rediss:\/\/default/g") + else + DB_URL=$(echo "${KV_URL}" | sed "s/\/\/[^~]*/\/\/127.0.0.1:${LOCAL_PORT}\//g") + fi + DB_NAME="${DBName}" + fi + + if [ "${KV_URL}" = "" ] && [ "${K8_URL}" = "" ] || [ "${DB_URL}" = "" ] || [ "${DB_NAME}" = "" ]; then + echo "Error: invalid DB settings" + exit 1 + fi +} + +open_tunnels() { + # Open netcat tunnel between k8 deployment and postgres database + # Timeout of 8 hours set for an interactive session + # Testing for kubectl deployment with multiple replicas always hit the same pod (the first one?), + # will have to revisit if it becomes an issue + echo 'nc -v -lk -p '${DEST_PORT}' -w '${TMOUT}' -e /usr/bin/nc -w '${TMOUT} "${DB_NAME}" "${PORT}" | kubectl -n "${NAMESPACE}" exec -i deployment/"${INSTANCE}" -- sh & + + # Open local tunnel to k8 deployment + kubectl port-forward -n "${NAMESPACE}" deployment/"${INSTANCE}" ${LOCAL_PORT}:${DEST_PORT} & +} + +run_psql() { + if [ "$Inputfile" = "" ]; then + 
psql -d "$DB_URL" --no-password "${OTHERARGS}" + elif [ "$CompressedInput" = "" ]; then + psql -d "$DB_URL" --no-password <"$Inputfile" + else + gzip -d --to-stdout "${Inputfile}" | psql -d "$DB_URL" --no-password + fi +} + +run_pgdump() { + if [ "${OTHERARGS}" = "" ]; then + echo "ERROR: Must supply arguments for pg_dump" + exit 1 + fi + pg_dump -d "$DB_URL" --no-password ${OTHERARGS} +} + +run_pg_restore() { + if [ "${OTHERARGS}" = "" ]; then + echo "ERROR: Must supply arguments for pg_restore" + exit 1 + fi + pg_restore -d "$DB_URL" --no-password ${OTHERARGS} +} + +cleanup() { + unset DB_URL DB_NAME K8_URL + pkill -15 -f "kubectl port-forward.*${LOCAL_PORT}" + sleep 3 # let the port-forward finish + kubectl -n "${NAMESPACE}" exec -i deployment/"${INSTANCE}" -- pkill -15 -f "nc -v -lk -p ${DEST_PORT}" +} + +# Get the options +while getopts "ahcd:i:k:r:p:t:" option; do + case $option in + a) + AKS="True" + ;; + c) + CompressedInput="True" + ;; + d) + DBName=$OPTARG + ;; + k) + KV=$OPTARG + ;; + i) + Inputfile=$OPTARG + ;; + p) + Postgres=$OPTARG + ;; + r) + Redis=$OPTARG + ;; + t) + Timeout=$OPTARG + ;; + h) + help + exit + ;; + \?) + echo "Error: Invalid option" + exit 1 + ;; + esac +done +shift "$((OPTIND - 1))" +INSTANCE=$1 +# $2 is -- +RUNCMD=$3 +shift 3 +OTHERARGS=$* + +### +### Main +### + +init_setup +check_instance +set_ports +# Get DB settings and set the CMD to run +case $RUNCMD in +psql) + set_db_psql + CMD="run_psql" + ;; +pg_dump) + set_db_psql + CMD="run_pgdump" + ;; +pg_restore) + set_db_psql + CMD="run_pg_restore" + ;; +redis-cli) + set_db_redis + CMD="redis-cli -u $DB_URL $TLS ${OTHERARGS}" + ;; +esac +open_tunnels >/dev/null 2>&1 +sleep 5 # Need to allow the connections to open +$CMD # Run the command +echo Running cleanup... 
+cleanup >/dev/null 2>&1 # Cleanup on completion diff --git a/terraform/aks/config/development.tfvars.json b/terraform/aks/config/development.tfvars.json index 09858ed267..36dcf7006a 100644 --- a/terraform/aks/config/development.tfvars.json +++ b/terraform/aks/config/development.tfvars.json @@ -3,6 +3,7 @@ "namespace": "git-development", "environment": "development", "key_vault_resource_group": "s189t01-gse-dv-rg", + "paas_space" : "get-into-teaching", "infra_key_vault_name": "s189t01-gse-dv-inf-kv", "azure_enable_backup_storage": false, "enable_monitoring": false, diff --git a/terraform/aks/config/production.tfvars.json b/terraform/aks/config/production.tfvars.json index a93d383467..01a9ad3288 100644 --- a/terraform/aks/config/production.tfvars.json +++ b/terraform/aks/config/production.tfvars.json @@ -5,6 +5,7 @@ "azure_enable_backup_storage": true, "enable_monitoring": true, "infra_key_vault_name": "s189p01-gse-pd-inf-kv", + "paas_space" : "get-into-teaching-production", "statuscake_password_name": "SC-PASSWORD", "sidekiq_replicas" : 0, "sidekiq_memory_max" : "2Gi", diff --git a/terraform/aks/config/review.tfvars.json b/terraform/aks/config/review.tfvars.json index 7c9a74cbc6..11c3172af3 100644 --- a/terraform/aks/config/review.tfvars.json +++ b/terraform/aks/config/review.tfvars.json @@ -3,6 +3,7 @@ "namespace": "git-development", "azure_enable_backup_storage": false, "enable_monitoring": false, + "paas_space" : "get-into-teaching", "deploy_redis": false, "deploy_postgres": false, "key_vault_name": "s189t01-gse-rv-app-kv", diff --git a/terraform/aks/config/staging.tfvars.json b/terraform/aks/config/staging.tfvars.json index 83418ddfef..eb55318fe5 100644 --- a/terraform/aks/config/staging.tfvars.json +++ b/terraform/aks/config/staging.tfvars.json @@ -4,6 +4,7 @@ "environment": "staging", "infra_key_vault_name": "s189t01-gse-stg-inf-kv", "azure_enable_backup_storage": false, + "paas_space" : "get-into-teaching-test", "key_vault_resource_group": 
"s189t01-gse-stg-rg", "enable_monitoring": false, "sidekiq_replicas" : 2, diff --git a/terraform/aks/variables.tf b/terraform/aks/variables.tf index 82f189a7f9..ba0aa7860b 100644 --- a/terraform/aks/variables.tf +++ b/terraform/aks/variables.tf @@ -13,6 +13,9 @@ variable "azure_credentials_json" { default = null description = "JSON containing the service principal authentication key when running in automation" } +variable "paas_space" { + description = "the space for corresponding paas app" +} variable "azure_resource_prefix" { description = "Standard resource prefix. Usually s189t01 (test) or s189p01 (production)" }