diff --git a/docs/operator-manual/argocd-cmd-params-cm.yaml b/docs/operator-manual/argocd-cmd-params-cm.yaml index d4a754f0e44b95..8216a84d7276de 100644 --- a/docs/operator-manual/argocd-cmd-params-cm.yaml +++ b/docs/operator-manual/argocd-cmd-params-cm.yaml @@ -58,6 +58,12 @@ data: controller.sharding.algorithm: legacy # Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit. controller.kubectl.parallelism.limit: "20" + # The maximum number of retries for each request + controller.k8sclient.retry.max: "0" + # The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold + controller.k8sclient.retry.base.backoff: "100" + # Grace period in seconds for ignoring consecutive errors while communicating with repo server. + controller.repo.error.grace.period.seconds: "180" ## Server properties # Listen on given address for incoming connections (default "0.0.0.0") @@ -72,6 +78,10 @@ data: server.rootpath: "" # Directory path that contains additional static assets server.staticassets: "/shared/app" + # The maximum number of retries for each request + server.k8sclient.retry.max: "0" + # The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold + server.k8sclient.retry.base.backoff: "100" # Set the logging format. One of: text|json (default "text") server.log.format: "text" diff --git a/docs/operator-manual/high_availability.md b/docs/operator-manual/high_availability.md index eaa000b5d96d50..ef3964e2761a5f 100644 --- a/docs/operator-manual/high_availability.md +++ b/docs/operator-manual/high_availability.md @@ -229,3 +229,91 @@ spec: path: my-application # ... 
``` + +## Rate Limiting Application Reconciliations + +To prevent high controller resource usage or sync loops caused either by misbehaving apps or by other environment-specific factors, +we can configure rate limits on the workqueues used by the application controller. There are two types of rate limits that can be configured: + + * Global rate limits + * Per item rate limits + +The final rate limiter uses a combination of both and calculates the final backoff as `max(globalBackoff, perItemBackoff)`. + +### Global rate limits + + This is enabled by default. It is a simple bucket-based rate limiter that limits the number of items that can be queued per second. +This is useful to prevent a large number of apps from being queued at the same time. + +To configure the bucket limiter you can set the following environment variables: + + * `WORKQUEUE_BUCKET_SIZE` - The number of items that can be queued in a single burst. Defaults to 500. + * `WORKQUEUE_BUCKET_QPS` - The number of items that can be queued per second. Defaults to 50. + +### Per item rate limits + + By default this returns a fixed base delay/backoff value, but it can be configured to return exponentially increasing values; read on to understand how it works. +The per item rate limiter limits the number of times a particular item can be queued. This is based on exponential backoff where the backoff time for an item keeps increasing exponentially +if it is queued multiple times in a short period, but the backoff is reset automatically if a configured `cool down` period has elapsed since the last time the item was queued. + +To configure the per item limiter you can set the following environment variables: + + * `WORKQUEUE_FAILURE_COOLDOWN_NS` : The cool down period in nanoseconds; once this period has elapsed for an item, the backoff is reset. Exponential backoff is disabled if set to 0 (default), eg.
values : 10 * 10^9 (=10s) + * `WORKQUEUE_BASE_DELAY_NS` : The base delay in nanoseconds, this is the initial backoff used in the exponential backoff formula. Defaults to 1000 (=1μs) + * `WORKQUEUE_MAX_DELAY_NS` : The max delay in nanoseconds, this is the max backoff limit. Defaults to 3 * 10^9 (=3s) + * `WORKQUEUE_BACKOFF_FACTOR` : The backoff factor, this is the factor by which the backoff is increased for each retry. Defaults to 1.5 + +The formula used to calculate the backoff time for an item, where `numRequeue` is the number of times the item has been queued +and `lastRequeueTime` is the time at which the item was last queued: + +- When `WORKQUEUE_FAILURE_COOLDOWN_NS` != 0 : + +``` +backoff = time.Since(lastRequeueTime) >= WORKQUEUE_FAILURE_COOLDOWN_NS ? + WORKQUEUE_BASE_DELAY_NS : + min( + WORKQUEUE_MAX_DELAY_NS, + WORKQUEUE_BASE_DELAY_NS * WORKQUEUE_BACKOFF_FACTOR ^ (numRequeue) + ) +``` + +- When `WORKQUEUE_FAILURE_COOLDOWN_NS` = 0 : + +``` +backoff = WORKQUEUE_BASE_DELAY_NS +``` + +## HTTP Request Retry Strategy + +In scenarios where network instability or transient server errors occur, the retry strategy ensures the robustness of HTTP communication by automatically resending failed requests. It uses a combination of maximum retries and backoff intervals to prevent overwhelming the server or thrashing the network. + +### Configuring Retries + +The retry logic can be fine-tuned with the following environment variables: + +* `ARGOCD_K8SCLIENT_RETRY_MAX` - The maximum number of retries for each request. The request will be dropped after this count is reached. Defaults to 0 (no retries). +* `ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF` - The initial backoff delay on the first retry attempt in ms. Subsequent retries will double this backoff time up to a maximum threshold. Defaults to 100ms. + +### Backoff Strategy + +The backoff strategy employed is a simple exponential backoff without jitter. 
The backoff time increases exponentially with each retry attempt until a maximum backoff duration is reached. + +The formula for calculating the backoff time is: + +``` +backoff = min(retryWaitMax, baseRetryBackoff * (2 ^ retryAttempt)) +``` +Where `retryAttempt` starts at 0 and increments by 1 for each subsequent retry. + +### Maximum Wait Time + +There is a cap on the backoff time to prevent excessive wait times between retries. This cap is defined by: + +`retryWaitMax` - The maximum duration to wait before retrying. This ensures that retries happen within a reasonable timeframe. Defaults to 10 seconds. + +### Non-Retriable Conditions + +Not all HTTP responses are eligible for retries. The following conditions will not trigger a retry: + +* Responses with a status code indicating client errors (4xx) except for 429 Too Many Requests. +* Responses with the status code 501 Not Implemented. diff --git a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml index 270fa05bcc62e0..560e4e0b6e3227 100644 --- a/manifests/base/application-controller/argocd-application-controller-statefulset.yaml +++ b/manifests/base/application-controller/argocd-application-controller-statefulset.yaml @@ -155,6 +155,18 @@ spec: name: argocd-cmd-params-cm key: controller.kubectl.parallelism.limit optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.k8sclient.retry.max + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: controller.k8sclient.retry.base.backoff + optional: true image: quay.io/argoproj/argocd:latest imagePullPolicy: Always name: argocd-application-controller diff --git a/manifests/base/server/argocd-server-deployment.yaml b/manifests/base/server/argocd-server-deployment.yaml index 
66c6ed384b1d2d..b09891d26e5295 100644 --- a/manifests/base/server/argocd-server-deployment.yaml +++ b/manifests/base/server/argocd-server-deployment.yaml @@ -227,6 +227,18 @@ spec: name: argocd-cmd-params-cm key: server.enable.proxy.extension optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: server.k8sclient.retry.max + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + name: argocd-cmd-params-cm + key: server.k8sclient.retry.base.backoff + optional: true volumeMounts: - name: ssh-known-hosts mountPath: /app/config/ssh diff --git a/manifests/core-install.yaml b/manifests/core-install.yaml index 4b622103369fe2..2f14c79a5c9dcc 100644 --- a/manifests/core-install.yaml +++ b/manifests/core-install.yaml @@ -19451,6 +19451,18 @@ spec: key: controller.kubectl.parallelism.limit name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always name: argocd-application-controller diff --git a/manifests/ha/install.yaml b/manifests/ha/install.yaml index 1176c55fab318c..bf7a755d660fb9 100644 --- a/manifests/ha/install.yaml +++ b/manifests/ha/install.yaml @@ -20995,6 +20995,18 @@ spec: key: server.enable.proxy.extension name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 
imagePullPolicy: Always livenessProbe: @@ -21241,7 +21253,19 @@ spec: key: controller.kubectl.parallelism.limit name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always name: argocd-application-controller ports: diff --git a/manifests/ha/namespace-install.yaml b/manifests/ha/namespace-install.yaml index cb163091236d1f..c2d0391b659acf 100644 --- a/manifests/ha/namespace-install.yaml +++ b/manifests/ha/namespace-install.yaml @@ -2501,6 +2501,18 @@ spec: key: server.enable.proxy.extension name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always livenessProbe: @@ -2747,6 +2759,18 @@ spec: key: controller.kubectl.parallelism.limit name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always name: argocd-application-controller diff --git
a/manifests/install.yaml b/manifests/install.yaml index 62710c81a51785..2ed7d2dc50800d 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -20050,7 +20050,19 @@ spec: key: server.enable.proxy.extension name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always livenessProbe: httpGet: @@ -20296,7 +20308,19 @@ spec: key: controller.kubectl.parallelism.limit name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always name: argocd-application-controller ports: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index 0b54e80724ef84..0915a687152e84 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -1556,6 +1556,18 @@ spec: key: server.enable.proxy.extension name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: server.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: +
configMapKeyRef: + key: server.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always livenessProbe: @@ -1802,6 +1814,18 @@ spec: key: controller.kubectl.parallelism.limit name: argocd-cmd-params-cm optional: true + - name: ARGOCD_K8SCLIENT_RETRY_MAX + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.max + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF + valueFrom: + configMapKeyRef: + key: controller.k8sclient.retry.base.backoff + name: argocd-cmd-params-cm + optional: true image: quay.io/argoproj/argocd:v2.8.8 imagePullPolicy: Always name: argocd-application-controller diff --git a/pkg/apis/application/v1alpha1/types.go b/pkg/apis/application/v1alpha1/types.go index 2eb5166c7d1eab..e37a12b656c65b 100644 --- a/pkg/apis/application/v1alpha1/types.go +++ b/pkg/apis/application/v1alpha1/types.go @@ -35,11 +35,11 @@ import ( "k8s.io/client-go/tools/clientcmd/api" "sigs.k8s.io/yaml" - "github.com/argoproj/argo-cd/v2/util/env" - "github.com/argoproj/argo-cd/v2/common" "github.com/argoproj/argo-cd/v2/util/collections" + "github.com/argoproj/argo-cd/v2/util/env" "github.com/argoproj/argo-cd/v2/util/helm" + utilhttp "github.com/argoproj/argo-cd/v2/util/http" "github.com/argoproj/argo-cd/v2/util/security" ) @@ -2850,6 +2850,12 @@ func SetK8SConfigDefaults(config *rest.Config) error { config.Timeout = K8sServerSideTimeout config.Transport = tr + maxRetries := env.ParseInt64FromEnv(utilhttp.EnvRetryMax, 0, 1, math.MaxInt64) + if maxRetries > 0 { + backoffDurationMS := env.ParseInt64FromEnv(utilhttp.EnvRetryBaseBackoff, 100, 1, math.MaxInt64) + backoffDuration := time.Duration(backoffDurationMS) * time.Millisecond + config.WrapTransport = utilhttp.WithRetry(maxRetries, backoffDuration) + } return nil } diff --git a/util/http/http.go b/util/http/http.go index 919e57a89e2960..7c13c71fde223b 100644 --- a/util/http/http.go +++ 
b/util/http/http.go
@@ -1,21 +1,35 @@
 package http
 
 import (
+	"bytes"
 	"fmt"
+	"io"
 	"math"
 	"net/http"
 	"net/http/httputil"
 	"strconv"
 	"strings"
+	"time"
 
-	"github.com/argoproj/argo-cd/v2/util/env"
+	log "github.com/sirupsen/logrus"
+	"k8s.io/client-go/transport"
 
 	"github.com/argoproj/argo-cd/v2/common"
-
-	log "github.com/sirupsen/logrus"
+	"github.com/argoproj/argo-cd/v2/util/env"
 )
 
-const maxCookieLength = 4093
+const (
+	maxCookieLength = 4093
+
+	// limit size of the resp to 512KB
+	respReadLimit = int64(524288)
+	// retryWaitMax is the upper bound for the wait between two retries.
+	retryWaitMax = time.Duration(10) * time.Second
+	// EnvRetryMax names the environment variable holding the maximum number of retries per request.
+	EnvRetryMax = "ARGOCD_K8SCLIENT_RETRY_MAX"
+	// EnvRetryBaseBackoff names the environment variable holding the initial retry backoff in milliseconds.
+	EnvRetryBaseBackoff = "ARGOCD_K8SCLIENT_RETRY_BASE_BACKOFF"
+)
 
 // max number of chunks a cookie can be broken into. To be compatible with
 // widest range of browsers, you shouldn't create more than 30 cookies per domain
@@ -160,3 +174,98 @@ func (rt *TransportWithHeader) RoundTrip(r *http.Request) (*http.Response, error
 	}
 	return rt.RoundTripper.RoundTrip(r)
 }
+
+// WithRetry returns a transport.WrapperFunc whose RoundTripper retries failed
+// requests up to maxRetries times. baseRetryBackoff is the wait before the
+// first retry; it doubles after each retry and is capped at retryWaitMax.
+func WithRetry(maxRetries int64, baseRetryBackoff time.Duration) transport.WrapperFunc {
+	return func(rt http.RoundTripper) http.RoundTripper {
+		return &retryTransport{
+			inner:      rt,
+			maxRetries: maxRetries,
+			backoff:    baseRetryBackoff,
+		}
+	}
+}
+
+// retryTransport is an http.RoundTripper that repeats a request through inner
+// when an attempt fails or yields a retriable response.
+type retryTransport struct {
+	inner      http.RoundTripper
+	maxRetries int64
+	backoff    time.Duration
+}
+
+// isRetriable reports whether resp warrants another attempt: status 429, status
+// 0, or any status >= 500 except 501. A nil response is not retriable.
+func isRetriable(resp *http.Response) bool {
+	if resp == nil {
+		return false
+	}
+	if resp.StatusCode == http.StatusTooManyRequests {
+		return true
+	}
+	if resp.StatusCode == 0 || (resp.StatusCode >= 500 && resp.StatusCode != http.StatusNotImplemented) {
+		return true
+	}
+	return false
+}
+
+// RoundTrip sends req through the wrapped transport. An attempt that returns an
+// error or a retriable response (see isRetriable) is repeated up to maxRetries
+// times, waiting t.backoff before the first retry and twice the previous wait
+// before each later one, never longer than retryWaitMax. The outcome of the
+// last attempt is returned unchanged. If the request context is done during a
+// wait, the context error is returned instead.
+func (t *retryTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+	// Buffer the body once so that every attempt can replay it from the start.
+	hasBody := req.Body != nil && req.Body != http.NoBody
+	var bodyBytes []byte
+	if hasBody {
+		var readErr error
+		bodyBytes, readErr = io.ReadAll(req.Body)
+		// The caller's body has been consumed; release it before going on.
+		_ = req.Body.Close()
+		if readErr != nil {
+			// A partially read body must not be sent as if it were complete.
+			return nil, fmt.Errorf("buffering request body for retries: %w", readErr)
+		}
+	}
+
+	var resp *http.Response
+	var err error
+	backoff := t.backoff
+	for i := 0; i <= int(t.maxRetries); i++ {
+		if hasBody {
+			// Body-less requests keep their nil/NoBody body untouched.
+			req.Body = io.NopCloser(bytes.NewReader(bodyBytes))
+		}
+		resp, err = t.inner.RoundTrip(req)
+		if i == int(t.maxRetries) || (err == nil && !isRetriable(resp)) {
+			break
+		}
+		if resp != nil && resp.Body != nil {
+			// This response is discarded; drain and close its body.
+			drainBody(resp.Body)
+		}
+		if backoff > retryWaitMax {
+			backoff = retryWaitMax
+		}
+		select {
+		case <-time.After(backoff):
+		case <-req.Context().Done():
+			return nil, req.Context().Err()
+		}
+		backoff *= 2
+	}
+	return resp, err
+}
+
+// drainBody discards up to respReadLimit bytes of body and then closes it. A
+// read error is logged as a warning, not returned.
+func drainBody(body io.ReadCloser) {
+	defer body.Close()
+	if _, err := io.Copy(io.Discard, io.LimitReader(body, respReadLimit)); err != nil {
+		log.Warnf("error reading response body: %s", err.Error())
+	}
+}