From 96fcca2aed396b3910e78589f048d1f45301f9a6 Mon Sep 17 00:00:00 2001 From: lavData Date: Thu, 28 Dec 2023 16:26:31 +0700 Subject: [PATCH] feat: more config ttl for error job pod --- charts/airbyte-pod-sweeper/README.md | 117 +++++++++--------- .../templates/configmap.yaml | 11 ++ .../templates/deployment.yaml | 2 + charts/airbyte-pod-sweeper/values.yaml | 1 + 4 files changed, 73 insertions(+), 58 deletions(-) diff --git a/charts/airbyte-pod-sweeper/README.md b/charts/airbyte-pod-sweeper/README.md index 83e2dfc9e5b..0c907e35b6c 100644 --- a/charts/airbyte-pod-sweeper/README.md +++ b/charts/airbyte-pod-sweeper/README.md @@ -12,63 +12,64 @@ Helm chart to deploy airbyte-pod-sweeper ## Values -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| affinity | object | `{}` | | -| containerSecurityContext | object | `{}` | | -| enabled | bool | `true` | | -| extraVolumeMounts | list | `[]` | | -| extraVolumes | list | `[]` | | -| global.database.secretName | string | `""` | | -| global.database.secretValue | string | `""` | | -| global.jobs.kube.annotations | object | `{}` | | +| Key | Type | Default | Description | +|---------------------------------------------------|------|---------|-------------| +| affinity | object | `{}` | | +| containerSecurityContext | object | `{}` | | +| enabled | bool | `true` | | +| extraVolumeMounts | list | `[]` | | +| extraVolumes | list | `[]` | | +| global.database.secretName | string | `""` | | +| global.database.secretValue | string | `""` | | +| global.jobs.kube.annotations | object | `{}` | | | global.jobs.kube.main_container_image_pull_secret | string | `""` | | -| global.jobs.kube.nodeSelector | object | `{}` | | -| global.jobs.kube.tolerations | list | `[]` | | -| global.jobs.resources.limits | object | `{}` | | -| global.jobs.resources.requests | object | `{}` | | -| global.logs.accessKey.existingSecret | string | `""` | | -| global.logs.accessKey.existingSecretKey | string | `""` | | -| 
global.logs.accessKey.password | string | `"minio"` | | -| global.logs.externalMinio.enabled | bool | `false` | | -| global.logs.externalMinio.host | string | `"localhost"` | | -| global.logs.externalMinio.port | int | `9000` | | -| global.logs.gcs.bucket | string | `""` | | -| global.logs.gcs.credentials | string | `""` | | -| global.logs.gcs.credentialsJson | string | `""` | | -| global.logs.minio.enabled | bool | `true` | | -| global.logs.s3.bucket | string | `"airbyte-dev-logs"` | | -| global.logs.s3.bucketRegion | string | `""` | | -| global.logs.s3.enabled | bool | `false` | | -| global.logs.secretKey.existingSecret | string | `""` | | -| global.logs.secretKey.existingSecretKey | string | `""` | | -| global.logs.secretKey.password | string | `"minio123"` | | -| global.secretName | string | `""` | | -| global.serviceAccountName | string | `"airbyte-admin"` | | -| image.pullPolicy | string | `"IfNotPresent"` | | -| image.repository | string | `"bitnami/kubectl"` | | -| image.tag | string | `"latest"` | | -| livenessProbe.enabled | bool | `true` | | -| livenessProbe.failureThreshold | int | `3` | | -| livenessProbe.initialDelaySeconds | int | `5` | | -| livenessProbe.periodSeconds | int | `30` | | -| livenessProbe.successThreshold | int | `1` | | -| livenessProbe.timeoutSeconds | int | `1` | | -| namespace | string | `""` | | -| nodeSelector | object | `{}` | | -| podAnnotations | object | `{}` | | -| podLabels | object | `{}` | | -| readinessProbe.enabled | bool | `true` | | -| readinessProbe.failureThreshold | int | `3` | | -| readinessProbe.initialDelaySeconds | int | `5` | | -| readinessProbe.periodSeconds | int | `30` | | -| readinessProbe.successThreshold | int | `1` | | -| readinessProbe.timeoutSeconds | int | `1` | | -| replicaCount | int | `1` | | -| resources.limits | object | `{}` | | -| resources.requests | object | `{}` | | -| timeToDeletePods.running | string | `""` | | -| timeToDeletePods.succeeded | int | `120` | | -| 
timeToDeletePods.unsuccessful | int | `1440` | | -| tolerations | list | `[]` | | +| global.jobs.kube.nodeSelector | object | `{}` | | +| global.jobs.kube.tolerations | list | `[]` | | +| global.jobs.resources.limits | object | `{}` | | +| global.jobs.resources.requests | object | `{}` | | +| global.logs.accessKey.existingSecret | string | `""` | | +| global.logs.accessKey.existingSecretKey | string | `""` | | +| global.logs.accessKey.password | string | `"minio"` | | +| global.logs.externalMinio.enabled | bool | `false` | | +| global.logs.externalMinio.host | string | `"localhost"` | | +| global.logs.externalMinio.port | int | `9000` | | +| global.logs.gcs.bucket | string | `""` | | +| global.logs.gcs.credentials | string | `""` | | +| global.logs.gcs.credentialsJson | string | `""` | | +| global.logs.minio.enabled | bool | `true` | | +| global.logs.s3.bucket | string | `"airbyte-dev-logs"` | | +| global.logs.s3.bucketRegion | string | `""` | | +| global.logs.s3.enabled | bool | `false` | | +| global.logs.secretKey.existingSecret | string | `""` | | +| global.logs.secretKey.existingSecretKey | string | `""` | | +| global.logs.secretKey.password | string | `"minio123"` | | +| global.secretName | string | `""` | | +| global.serviceAccountName | string | `"airbyte-admin"` | | +| image.pullPolicy | string | `"IfNotPresent"` | | +| image.repository | string | `"bitnami/kubectl"` | | +| image.tag | string | `"latest"` | | +| livenessProbe.enabled | bool | `true` | | +| livenessProbe.failureThreshold | int | `3` | | +| livenessProbe.initialDelaySeconds | int | `5` | | +| livenessProbe.periodSeconds | int | `30` | | +| livenessProbe.successThreshold | int | `1` | | +| livenessProbe.timeoutSeconds | int | `1` | | +| namespace | string | `""` | | +| nodeSelector | object | `{}` | | +| podAnnotations | object | `{}` | | +| podLabels | object | `{}` | | +| readinessProbe.enabled | bool | `true` | | +| readinessProbe.failureThreshold | int | `3` | | +| 
readinessProbe.initialDelaySeconds | int | `5` | | +| readinessProbe.periodSeconds | int | `30` | | +| readinessProbe.successThreshold | int | `1` | | +| readinessProbe.timeoutSeconds | int | `1` | | +| replicaCount | int | `1` | | +| resources.limits | object | `{}` | | +| resources.requests | object | `{}` | | +| timeToDeletePods.running | string | `""` | | +| timeToDeletePods.error | string | `""` | | +| timeToDeletePods.succeeded | int | `120` | | +| timeToDeletePods.unsuccessful | int | `1440` | | +| tolerations | list | `[]` | | diff --git a/charts/airbyte-pod-sweeper/templates/configmap.yaml b/charts/airbyte-pod-sweeper/templates/configmap.yaml index 96ce9350871..d352120c5b5 100644 --- a/charts/airbyte-pod-sweeper/templates/configmap.yaml +++ b/charts/airbyte-pod-sweeper/templates/configmap.yaml @@ -44,6 +44,13 @@ data: NON_SUCCESS_DATE=`date -d $NON_SUCCESS_DATE_STR +%s` echo "Will sweep unsuccessful pods from before ${NON_SUCCESS_DATE_STR}" fi + + if [ -n "${ERROR_TTL_MINUTES}" ]; then + # Dedicated time window for pods in Error status + ERROR_DATE_STR=`date -d "now - ${ERROR_TTL_MINUTES} minutes" --utc -Ins` + ERROR_DATE=`date -d $ERROR_DATE_STR +%s` + echo "Will sweep error pods from before ${ERROR_DATE_STR}" + fi ( IFS=$'\n' for POD in `get_job_pods`; do @@ -65,6 +72,10 @@ data: if [ "$POD_DATE" -lt "$NON_SUCCESS_DATE" ]; then delete_pod "$POD_NAME" "$POD_STATUS" "$POD_DATE_STR" fi + elif [ -n "${ERROR_TTL_MINUTES}" ] && [ "$POD_STATUS" = "Error" ]; then + if [ "$POD_DATE" -lt "$ERROR_DATE" ]; then + delete_pod "$POD_NAME" "$POD_STATUS" "$POD_DATE_STR" + fi fi done ) diff --git a/charts/airbyte-pod-sweeper/templates/deployment.yaml b/charts/airbyte-pod-sweeper/templates/deployment.yaml index 229027d7ca4..6cb4499d557 100644 --- a/charts/airbyte-pod-sweeper/templates/deployment.yaml +++ b/charts/airbyte-pod-sweeper/templates/deployment.yaml @@ -50,6 +50,8 @@ spec: value: "{{ .Values.timeToDeletePods.succeeded }}" - name: UNSUCCESSFUL_TTL_MINUTES 
value: "{{ .Values.timeToDeletePods.unsuccessful }}" + - name: ERROR_TTL_MINUTES + value: "{{ .Values.timeToDeletePods.error }}" {{- if .Values.containerSecurityContext }} securityContext: {{- toYaml .Values.containerSecurityContext | nindent 10 }} {{- end }} diff --git a/charts/airbyte-pod-sweeper/values.yaml b/charts/airbyte-pod-sweeper/values.yaml index c11303f2496..292b71f4d1f 100644 --- a/charts/airbyte-pod-sweeper/values.yaml +++ b/charts/airbyte-pod-sweeper/values.yaml @@ -221,6 +221,7 @@ extraVolumes: [] ## podSweeper.timeToDeletePods.unsuccessful Time to remove pods on neither running nor succeeded status (minutes). timeToDeletePods: running: "" + error: "" succeeded: 120 unsuccessful: 1440