# values.yaml
global:
# -- (string) If using images from the Deepgram Quay image repositories,
# or another private registry to which your cluster doesn't have default access,
# you will need to provide a pre-configured K8s Secret
# with image repository credentials. See chart docs for more details.
pullSecretRef:
# -- (string) Name of the pre-configured K8s Secret containing your Deepgram
# self-hosted API key. See chart docs for more details.
deepgramSecretRef:
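# A minimal sketch of these two references, assuming hypothetical Secrets
# named "dg-regcred" and "dg-self-hosted-api-key" that were created in the
# release namespace beforehand:
#
# pullSecretRef: "dg-regcred"
# deepgramSecretRef: "dg-self-hosted-api-key"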
# -- Additional labels to add to all Deepgram resources
additionalLabels: {}
# -- When an API or Engine container is signaled to shut down via Kubernetes sending a SIGTERM
# signal, the container will stop listening on its port, and no new requests will be routed
# to that container. However, the container will continue to run until all existing
# batch or streaming requests have completed, after which it will gracefully shut down.
#
# Batch requests should finish within 10-15 minutes, but streaming requests can proceed indefinitely.
#
# outstandingRequestGracePeriod defines the period (in seconds) after which Kubernetes will forcibly
# shut down the container, terminating any outstanding connections. 1800 sec / 60 sec/min = 30 min
outstandingRequestGracePeriod: 1800
# -- Configuration options for horizontal scaling of Deepgram
# services. Use either static replica counts (`replicas`) or autoscaling (`auto`), not both.
# @default -- ``
scaling:
# -- Number of replicas to set during initial installation.
# @default -- ``
replicas:
api: 1
engine: 1
# -- Enable pod autoscaling based on system load/traffic.
# @default -- ``
auto:
enabled: false
api:
metrics:
# -- Scale the API deployment to maintain this Engine-to-API pod ratio
engineToApiRatio: 4
# -- (list) If you have custom metrics you would like to scale with, you may add them here.
# See the [k8s docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
# for how to structure a list of metrics
custom:
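# A sketch of one possible list item, following the HPA v2 metric format
# from the k8s docs linked above (the metric name "requests_per_second" is
# a hypothetical example, not one exported by this chart):
#
# custom:
#   - type: Pods
#     pods:
#       metric:
#         name: requests_per_second
#       target:
#         type: AverageValue
#         averageValue: "100"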
# -- [Configurable scaling behavior](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior)
# @default -- "*See values.yaml file for default*"
behavior:
scaleDown:
policies:
- type: Pods
value: 1
periodSeconds: 60
- type: Percent
value: 25
periodSeconds: 60
engine:
# -- Minimum number of Engine replicas.
minReplicas: 1
# -- Maximum number of Engine replicas.
maxReplicas: 10
metrics:
# -- If `engine.concurrencyLimit.activeRequests` is set, this variable will
# define the ratio of current active requests to maximum active requests at which
# the Engine pods will scale. Setting this value too close to 1.0 may lead to a situation where
# the cluster is at max capacity and rejects incoming requests. Setting the ratio too close to 0.0
# will over-optimistically scale your cluster and increase compute costs unnecessarily.
requestCapacityRatio:
speechToText:
batch:
# -- (int) Scale the Engine pods based on a static desired number of speech-to-text batch requests per pod
requestsPerPod:
streaming:
# -- (int) Scale the Engine pods based on a static desired number of speech-to-text streaming requests per pod
requestsPerPod:
textToSpeech:
batch:
# -- (int) Scale the Engine pods based on a static desired number of text-to-speech batch requests per pod
requestsPerPod:
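# For example, to scale on a target of 30 concurrent speech-to-text streams
# per Engine pod (an illustrative number; tune it to your hardware), you
# could set:
#
# speechToText:
#   streaming:
#     requestsPerPod: 30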
# -- If you have custom metrics you would like to scale with, you may add them here.
# See the [k8s docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
# for how to structure a list of metrics
custom: []
# -- [Configurable scaling behavior](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior)
# @default -- "*See values.yaml file for default*"
behavior:
scaleDown:
policies:
- type: Pods
value: 1
periodSeconds: 60
- type: Percent
value: 25
periodSeconds: 60
api:
# -- namePrefix is the prefix to apply to the name of all K8s objects
# associated with the Deepgram API containers.
namePrefix: "deepgram-api"
image:
# -- path configures the image path to use for creating API containers.
# You may change this from the public Quay image path if you have imported
# Deepgram images into a private container registry.
path: quay.io/deepgram/self-hosted-api
# -- pullPolicy configures how the Kubelet attempts to pull the Deepgram API image
pullPolicy: IfNotPresent
# -- tag defines which Deepgram release to use for API containers
tag: release-250130
# -- Additional labels to add to API resources
additionalLabels: {}
# -- (object) Additional annotations to add to the API deployment
additionalAnnotations:
updateStrategy:
rollingUpdate:
# -- The maximum number of API pods, relative to the number of replicas,
# that can go offline during a rolling update. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-unavailable)
# for more details.
maxUnavailable: 0
# -- The maximum number of extra API pods that can be created during a rollingUpdate,
# relative to the number of replicas. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-surge)
# for more details.
maxSurge: 1
# -- Configure resource limits per API container. See
# [Deepgram's documentation](https://developers.deepgram.com/docs/self-hosted-deployment-environments#api)
# for more details.
# @default -- ``
resources:
requests:
memory: "4Gi"
cpu: "2000m"
limits:
memory: "8Gi"
cpu: "4000m"
# -- Readiness probe customization for API pods.
# @default -- ``
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 1
# -- Liveness probe customization for API pods.
# @default -- ``
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
# -- [Affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity)
# to apply for API pods.
affinity: {}
# -- [Tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
# to apply to API pods.
tolerations: []
# -- [Security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for API pods.
securityContext: {}
serviceAccount:
# -- Specifies whether to create a default service account for the Deepgram API Deployment.
create: true
# -- (string) Allows providing a custom service account name for the API component.
# If left empty, the default service account name will be used.
# If specified, and `api.serviceAccount.create = true`, this defines the name of the default service account.
# If specified, and `api.serviceAccount.create = false`, this provides the name of a preconfigured service account
# you wish to attach to the API deployment.
name:
# -- Configure how the API will listen for your requests
# @default -- ``
server:
# -- baseUrl is the prefix for requests to the API.
baseUrl: "/v1"
# -- host is the IP address to listen on. You will want to listen
# on all interfaces to interact with other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on.
port: 8080
# -- callbackConnTimeout configures how long to wait for a connection to a callback URL.
# See [Deepgram's callback documentation](https://developers.deepgram.com/docs/callback)
# for more details. The value should be a humantime duration.
callbackConnTimeout: "1s"
# -- callbackTimeout configures how long to wait for a response from a callback URL.
# See [Deepgram's callback documentation](https://developers.deepgram.com/docs/callback)
# for more details. The value should be a humantime duration.
callbackTimeout: "10s"
# -- fetchConnTimeout configures how long to wait for a connection to a fetch URL.
# The value should be a humantime duration.
# A fetch URL is a URL passed in an inference request from which a payload should be
# downloaded.
fetchConnTimeout: "1s"
# -- fetchTimeout configures how long to wait for a response from a fetch URL.
# The value should be a humantime duration.
# A fetch URL is a URL passed in an inference request from which a payload should be
# downloaded.
fetchTimeout: "60s"
# -- Specify custom DNS resolution options.
# @default -- ``
resolver:
# -- nameservers allows for specifying custom domain name server(s).
# A valid list item's format is "{IP} {PORT} {PROTOCOL (tcp or udp)}",
# e.g. `"127.0.0.1 53 udp"`.
nameservers: []
# -- (int) maxTTL sets the DNS TTL value if specifying a custom DNS nameserver.
maxTTL:
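# For example, to resolve via a hypothetical in-cluster DNS service at
# 10.96.0.10 and cap record TTLs at 60 seconds:
#
# nameservers:
#   - "10.96.0.10 53 udp"
# maxTTL: 60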
# -- Enable ancillary features
# @default -- ``
features:
# -- Enables entity detection on pre-recorded audio
# *if* a valid entity detection model is available.
# *WARNING*: Beta functionality.
entityDetection: false
# -- Enables entity-based redaction on pre-recorded audio
# *if* a valid entity detection model is available.
# *WARNING*: Beta functionality.
entityRedaction: false
# -- If the API is receiving requests faster than Engine can process them, a request
# queue will form. By default, this queue is stored in memory. Under high load,
# the queue may grow too large and cause Out-Of-Memory errors. To avoid this,
# set a diskBufferPath to buffer the request queue's overflow to disk.
#
# WARN: This is only intended to temporarily buffer requests during high load.
# If there is not enough Engine capacity to process the queued requests over time,
# the queue (and response time) will grow indefinitely.
diskBufferPath:
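# For example, to buffer overflow to a hypothetical path on an attached
# volume:
#
# diskBufferPath: "/dg/request-buffer"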
# -- driverPool configures the backend pool of speech engines (generically referred to as
# "drivers" here). The API will load-balance among drivers in the standard
# pool; if one standard driver fails, the next one will be tried.
# @default -- ``
driverPool:
# -- standard is the main driver pool to use.
# @default -- ``
standard:
# -- timeoutBackoff is the factor to increase the timeout by
# for each additional retry (for exponential backoff).
timeoutBackoff: 1.2
# -- retrySleep defines the initial sleep period (in humantime duration)
# before attempting a retry.
retrySleep: "2s"
# -- retryBackoff is the factor to increase the retrySleep
# by for each additional retry (for exponential backoff).
retryBackoff: 1.6
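# With the defaults above, the sleep before each successive retry grows
# geometrically: 2s, then 2s * 1.6 = 3.2s, then 3.2s * 1.6 = 5.12s, and so on.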
# -- Maximum response size to deserialize from a driver (in bytes).
# The default is 1 GiB, expressed in bytes.
maxResponseSize: "1073741824"
engine:
# -- namePrefix is the prefix to apply to the name of all K8s objects
# associated with the Deepgram Engine containers.
namePrefix: "deepgram-engine"
image:
# -- path configures the image path to use for creating Engine containers.
# You may change this from the public Quay image path if you have imported
# Deepgram images into a private container registry.
path: quay.io/deepgram/self-hosted-engine
# -- pullPolicy configures how the Kubelet attempts to pull the Deepgram Engine image
pullPolicy: IfNotPresent
# -- tag defines which Deepgram release to use for Engine containers
tag: release-250130
# -- Additional labels to add to Engine resources
additionalLabels: {}
# -- (object) Additional annotations to add to the Engine deployment
additionalAnnotations:
updateStrategy:
rollingUpdate:
# -- The maximum number of Engine pods, relative to the number of replicas,
# that can go offline during a rolling update. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-unavailable)
# for more details.
maxUnavailable: 0
# -- The maximum number of extra Engine pods that can be created during a rollingUpdate,
# relative to the number of replicas. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-surge)
# for more details.
maxSurge: 1
# -- Configure resource limits per Engine container. See
# [Deepgram's documentation](https://developers.deepgram.com/docs/self-hosted-deployment-environments#engine)
# for more details.
# @default -- ``
resources:
requests:
memory: "30Gi"
cpu: "4000m"
# -- gpu maps to the nvidia.com/gpu resource parameter
gpu: 1
limits:
memory: "40Gi"
cpu: "8000m"
# -- gpu maps to the nvidia.com/gpu resource parameter
gpu: 1
# -- The startupProbe combination of `periodSeconds` and `failureThreshold` allows
# time for the container to load all models and start listening for incoming requests.
#
# Model load time can be affected by hardware I/O speeds, as well as network speeds
# if you are using a network volume mount for the models.
#
# If you are hitting the failure threshold before models are finished loading, you may
# want to extend the startup probe. However, this will also extend the time it takes
# to detect a pod that can't establish a network connection to validate its license.
# @default -- ``
startupProbe:
# -- periodSeconds defines how often to execute the probe.
periodSeconds: 10
# -- failureThreshold defines how many unsuccessful startup probe attempts
# are allowed before the container will be marked as Failed
failureThreshold: 60
# -- Readiness probe customization for Engine pods.
# @default -- ``
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 1
# -- Liveness probe customization for Engine pods.
# @default -- ``
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
# -- [Affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity)
# to apply for Engine pods.
affinity: {}
# -- [Tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
# to apply to Engine pods.
tolerations: []
# -- [Security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for Engine pods.
securityContext: {}
serviceAccount:
# -- Specifies whether to create a default service account for the Deepgram Engine Deployment.
create: true
# -- (string) Allows providing a custom service account name for the Engine component.
# If left empty, the default service account name will be used.
# If specified, and `engine.serviceAccount.create = true`, this defines the name of the default service account.
# If specified, and `engine.serviceAccount.create = false`, this provides the name of a preconfigured service account
# you wish to attach to the Engine deployment.
name:
concurrencyLimit:
# -- (int) activeRequests limits the number of active requests handled by
# a single Engine container.
# If additional requests beyond the limit are sent, the API container
# forming the request will try a different Engine pod. If no Engine pods
# are able to accept the request, the API will return a 429 HTTP response
# to the client. The `nil` default means no limit will be set.
activeRequests:
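# For example, to cap each Engine pod at 400 concurrent requests (an
# illustrative number; size it to your GPU's measured capacity):
#
# activeRequests: 400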
# -- Configure Engine containers to listen for requests from API containers.
# @default -- ``
server:
# -- host is the IP address to listen on for inference requests.
# You will want to listen on all interfaces to interact with
# other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on for inference requests
port: 8080
# -- metricsServer exposes an endpoint on each Engine container
# for reporting inference-specific system metrics.
# See https://developers.deepgram.com/docs/metrics-guide#deepgram-engine
# for more details.
# @default -- ``
metricsServer:
# -- host is the IP address to listen on for metrics requests.
# You will want to listen on all interfaces to interact with
# other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on for metrics requests
port: 9991
modelManager:
volumes:
customVolumeClaim:
# -- You may manually create your own PersistentVolume and PersistentVolumeClaim to store and
# expose model files to the Deepgram Engine. Configure your storage beforehand,
# and enable it here.
# Note: Make sure the PV and PVC accessModes are set to `ReadWriteMany` or `ReadOnlyMany`
enabled: false
# -- (string) Name of your pre-configured PersistentVolumeClaim
name:
# -- Name of the directory within your pre-configured PersistentVolume
# where the models are stored
modelsDirectory: "/"
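# A sketch assuming a hypothetical pre-created RWX PersistentVolumeClaim
# named "dg-models-pvc" whose volume stores model files under /models:
#
# customVolumeClaim:
#   enabled: true
#   name: "dg-models-pvc"
#   modelsDirectory: "/models"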
aws:
efs:
# -- Whether to use an [AWS Elastic File System](https://aws.amazon.com/efs/)
# to store Deepgram models for use by Engine containers.
# This option requires your cluster to be running in
# [AWS EKS](https://aws.amazon.com/eks/).
enabled: false
# -- Name prefix for the resources associated with the model storage in AWS EFS.
namePrefix: dg-models
# -- (string) FileSystemId of existing AWS Elastic File System where
# Deepgram model files will be persisted.
# You can find it using the AWS CLI:
# ```
# $ aws efs describe-file-systems --query "FileSystems[*].FileSystemId"
# ```
fileSystemId:
# -- Whether to force a fresh download of all model links provided,
# even if models are already present in EFS.
forceDownload: false
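# For example, with a hypothetical file system ID of the kind returned by
# the `aws efs describe-file-systems` query above:
#
# efs:
#   enabled: true
#   fileSystemId: "fs-0123456789abcdef0"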
gcp:
gpd:
# -- Whether to use a [GKE Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/persistent-volumes)
# to store Deepgram models for use by Engine containers.
# This option requires your cluster to be running in
# [GCP GKE](https://cloud.google.com/kubernetes-engine).
# See the GKE documentation on
# [using pre-existing persistent disks](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/preexisting-pd).
enabled: false
# -- Name prefix for the resources associated with the model storage in GCP GPD.
namePrefix: dg-models
# -- The storageClassName of the existing persistent disk.
storageClassName: "standard-rwo"
# -- The size of your pre-existing persistent disk.
storageCapacity: "40G"
# -- The identifier of your pre-existing persistent disk.
# The format is projects/{project_id}/zones/{zone_name}/disks/{disk_name} for Zonal persistent disks,
# or projects/{project_id}/regions/{region_name}/disks/{disk_name} for Regional persistent disks.
volumeHandle: ""
fsType: "ext4"
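# For example, a hypothetical zonal disk identifier following the format
# described above:
#
# volumeHandle: "projects/my-project/zones/us-central1-a/disks/dg-models-disk"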
models:
# -- Deprecated field for automatically downloading models. The functionality is still
# supported, but migrating to `engine.modelManager.models.add` is strongly recommended.
links: []
# -- Links to your Deepgram models to automatically download
# into storage backing a persistent volume.
# **Automatic model management is currently supported for AWS EFS volumes only.**
# Insert each model link provided to you by your Deepgram
# Account Representative.
add: []
# -- To remove a model from storage (and reduce the number of models loaded by
# Engine on startup), move a link from the `engine.modelManager.models.add` section
# to this section. You can also use a model name instead of the full link to designate
# it for removal.
# **Automatic model management is currently supported for AWS EFS volumes only.**
remove: []
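# A sketch of adding one model and removing another (placeholder values;
# substitute the actual links provided by your Deepgram Account
# Representative):
#
# add:
#   - "<model-link-from-deepgram>"
# remove:
#   - "<model-name-to-remove>"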
# -- chunking defines the size of audio chunks to process in seconds.
# Adjusting these values will affect both inference performance and accuracy
# of results. Please contact your Deepgram Account Representative if you
# want to adjust any of these values.
# @default -- ``
chunking:
speechToText:
batch:
# -- (float) minDuration is the minimum audio duration for a STT chunk size for a batch request
minDuration:
# -- (float) maxDuration is the maximum audio duration for a STT chunk size for a batch request
maxDuration:
streaming:
# -- (float) minDuration is the minimum audio duration for a STT chunk size for a streaming request
minDuration:
# -- (float) maxDuration is the maximum audio duration for a STT chunk size for a streaming request
maxDuration:
# -- step defines how often to return interim results, in seconds.
# This value may be lowered to increase the frequency of interim results.
# However, this also causes a significant decrease in the number of concurrent
# streams supported by a single GPU. Please contact your Deepgram Account
# representative for more details.
step: 1.0
halfPrecision:
# -- Engine will automatically enable half precision operations if your GPU supports
# them. You can explicitly enable or disable this behavior with the state parameter,
# which supports `"enabled"`, `"disabled"`, and `"auto"`.
state: "auto"
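# For example, to force half precision off on a GPU with poor FP16
# performance:
#
# halfPrecision:
#   state: "disabled"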
# -- Configuration options for the optional
# [Deepgram License Proxy](https://developers.deepgram.com/docs/license-proxy).
# @default -- ``
licenseProxy:
# -- The License Proxy is optional, but highly recommended to be deployed in production
# to enable highly available environments.
enabled: false
# -- If the License Proxy is deployed, one replica should be sufficient to
# support many API/Engine pods.
# Highly available environments may wish to deploy a second replica to ensure
# uptime, which can be toggled with this option.
deploySecondReplica: false
# -- Even with a License Proxy deployed, API and Engine pods can be configured to keep the
# upstream `license.deepgram.com` license server as a fallback licensing option if the
# License Proxy is unavailable.
# Disable this option if you are restricting API/Engine Pod network access for security reasons,
# and only the License Proxy should send egress traffic to the upstream license server.
keepUpstreamServerAsBackup: true
# -- namePrefix is the prefix to apply to the name of all K8s objects
# associated with the Deepgram License Proxy containers.
namePrefix: "deepgram-license-proxy"
image:
# -- path configures the image path to use for creating License Proxy containers.
# You may change this from the public Quay image path if you have imported
# Deepgram images into a private container registry.
path: quay.io/deepgram/self-hosted-license-proxy
# -- tag defines which Deepgram release to use for License Proxy containers
tag: release-250130
# -- pullPolicy configures how the Kubelet attempts to pull the Deepgram
# License Proxy image
pullPolicy: IfNotPresent
# -- Additional labels to add to License Proxy resources
additionalLabels: {}
# -- (object) Additional annotations to add to the LicenseProxy deployment
additionalAnnotations:
updateStrategy:
# -- For the LicenseProxy, we only expose maxSurge and not maxUnavailable.
# This is to avoid accidentally having all LicenseProxy pods go offline during upgrades,
# which could impact the entire cluster's connection to the Deepgram License Server.
# @default -- ``
rollingUpdate:
# -- The maximum number of extra License Proxy pods that can be created during a rollingUpdate,
# relative to the number of replicas. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-surge)
# for more details.
maxSurge: 1
# -- Configure resource limits per License Proxy container. See
# [Deepgram's documentation](https://developers.deepgram.com/docs/license-proxy#system-requirements)
# for more details.
# @default -- ``
resources:
requests:
memory: "1Gi"
cpu: "1000m"
limits:
memory: "8Gi"
cpu: "2000m"
# -- Readiness probe customization for License Proxy pods.
# @default -- ``
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 1
# -- Liveness probe customization for License Proxy pods.
# @default -- ``
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
# -- [Affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity)
# to apply for License Proxy pods.
affinity: {}
# -- [Tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
# to apply to License Proxy pods.
tolerations: []
# -- [Security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for License Proxy pods.
securityContext: {}
serviceAccount:
# -- Specifies whether to create a default service account for the Deepgram License Proxy Deployment.
create: true
# -- (string) Allows providing a custom service account name for the LicenseProxy component.
# If left empty, the default service account name will be used.
# If specified, and `licenseProxy.serviceAccount.create = true`, this defines the name of the default service account.
# If specified, and `licenseProxy.serviceAccount.create = false`, this provides the name of a preconfigured service account
# you wish to attach to the License Proxy deployment.
name:
# -- Configure how the license proxy will listen for licensing requests.
# @default -- ``
server:
# -- host is the IP address to listen on. You will want to listen
# on all interfaces to interact with other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on.
port: 8443
# -- baseUrl is the prefix for incoming license verification requests.
baseUrl: "/"
# -- statusPort is the port to listen on for the status/health endpoint.
statusPort: 8080
# -- Passthrough values for [NVIDIA GPU Operator Helm chart](https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml)
# You may use the NVIDIA GPU Operator to manage installation of NVIDIA drivers and the container toolkit on nodes with attached GPUs.
# @default -- ``
gpu-operator:
# -- Whether to install the NVIDIA GPU Operator to manage driver and/or container toolkit installation.
# See the list of [supported Operating Systems](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#supported-operating-systems-and-kubernetes-platforms)
# to verify compatibility with your cluster/nodes. Disable this option if your cluster/nodes are not compatible.
# If disabled, you will need to self-manage NVIDIA software installation on all nodes where you want
# to schedule Deepgram Engine pods.
enabled: true
driver:
# -- Whether to install NVIDIA drivers on nodes where a NVIDIA GPU is detected.
# If your Kubernetes nodes run a base image that comes with NVIDIA drivers pre-configured,
# disable this option, but keep the parent `gpu-operator` and sibling `toolkit`
# options enabled.
enabled: true
# -- NVIDIA driver version to install.
version: "550.54.15"
toolkit:
# -- Whether to install the NVIDIA container toolkit on nodes where a NVIDIA GPU is detected.
enabled: true
# -- NVIDIA container toolkit to install. The default `ubuntu` image tag for the
# toolkit requires a dynamic runtime link to a version of GLIBC that may not be
# present on nodes running older Linux distribution releases, such as Ubuntu 22.04.
# Therefore, we specify the `ubi8` image, which statically links the GLIBC library
# and avoids this issue.
version: v1.15.0-ubi8
cluster-autoscaler:
# -- Set to `true` to enable node autoscaling with AWS EKS. Not needed for GKE, as autoscaling is enabled by a
# [CLI option on cluster creation](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-autoscaler#creating_a_cluster_with_autoscaling).
enabled: false
rbac:
serviceAccount:
# -- Name of the IAM Service Account with the [necessary permissions](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md#permissions)
name: cluster-autoscaler-sa
annotations:
# -- (string) Replace with the AWS Role ARN configured for the Cluster Autoscaler.
# See the [Deepgram AWS EKS guide](https://developers.deepgram.com/docs/aws-k8s#creating-a-cluster)
# or [Cluster Autoscaler AWS documentation](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md#permissions)
# for details.
eks.amazonaws.com/role-arn:
autoDiscovery:
# -- (string) Name of your AWS EKS cluster. Using the [Cluster Autoscaler](https://github.com/kubernetes/autoscaler)
# on AWS requires knowledge of certain cluster metadata.
clusterName:
# -- (string) Region of your AWS EKS cluster. Using the [Cluster Autoscaler](https://github.com/kubernetes/autoscaler)
# on AWS requires knowledge of certain cluster metadata.
awsRegion:
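# A sketch with hypothetical cluster metadata (replace the account ID, role
# name, cluster name, and region with your own):
#
# rbac:
#   serviceAccount:
#     annotations:
#       eks.amazonaws.com/role-arn: "arn:aws:iam::123456789012:role/ClusterAutoscalerRole"
# autoDiscovery:
#   clusterName: "my-eks-cluster"
# awsRegion: "us-west-2"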
# -- Passthrough values for [Prometheus k8s stack Helm chart](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack).
# Prometheus (and its adapter) should be configured when scaling.auto is enabled.
# You may choose to use the installation/configuration bundled in this Helm chart,
# or you may configure an existing Prometheus installation in your cluster to expose
# the needed values.
# See source Helm chart for explanation of available values. Default values provided in this chart are used
# to provide pod autoscaling for Deepgram pods.
# @default -- ``
kube-prometheus-stack:
# -- (bool) Normally, this chart will be installed if `scaling.auto.enabled` is true. However, if you wish
# to manage the Prometheus adapter in your cluster on your own and not as part of the Deepgram Helm chart,
# you can force it to not be installed by setting this to `false`.
includeDependency:
fullnameOverride: "dg-prometheus-stack"
prometheus:
prometheusSpec:
additionalScrapeConfigs:
- job_name: "dg_engine_metrics"
scrape_interval: "2s"
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- "{{ .Release.Namespace }}"
relabel_configs:
- source_labels: [__meta_kubernetes_service_name]
regex: "(.*)-metrics"
action: keep
- source_labels: [__meta_kubernetes_endpoint_port_name]
regex: "metrics"
action: keep
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: service
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
prometheusOperator:
enabled: true
alertmanager:
enabled: false
grafana:
enabled: true
nodeExporter:
enabled: false
kube-state-metrics:
enabled: true
metricLabelsAllowlist:
- namespaces=[{{ .Release.Namespace }}],deployments=[app]
# -- Passthrough values for [Prometheus Adapter Helm chart](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-adapter).
# Prometheus, and its adapter here, should be configured when scaling.auto is enabled.
# You may choose to use the installation/configuration bundled in this Helm chart,
# or you may configure an existing Prometheus installation in your cluster to expose
# the needed values.
# See source Helm chart for explanation of available values. Default values provided in this chart are used
# to provide pod autoscaling for Deepgram pods.
# @default -- ``
prometheus-adapter:
# -- Normally, this chart will be installed if `scaling.auto.enabled` is true. However, if you wish
# to manage the Prometheus adapter in your cluster on your own and not as part of the Deepgram Helm chart,
# you can force it to not be installed by setting this to `false`.
includeDependency:
prometheus:
url: http://dg-prometheus-stack-prometheus.{{ .Release.Namespace }}.svc
rules:
default: false
external:
- name:
as: "engine_active_requests_stt_streaming"
seriesQuery: 'engine_active_requests{kind="stream"}'
metricsQuery: 'avg(engine_active_requests{kind="stream"})'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_active_requests_stt_batch"
seriesQuery: 'engine_active_requests{kind="batch"}'
metricsQuery: 'avg(engine_active_requests{kind="batch"})'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_active_requests_tts_batch"
seriesQuery: 'engine_active_requests{kind="tts"}'
metricsQuery: 'avg(engine_active_requests{kind="tts"})'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_estimated_stream_capacity"
seriesQuery: 'engine_active_requests{kind="stream"}'
metricsQuery: 'avg_over_time((sum(engine_active_requests{kind="stream"}) / sum(engine_estimated_stream_capacity))[1m:1m])'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_requests_active_to_max_ratio"
seriesQuery: "engine_max_active_requests"
metricsQuery: "avg_over_time((sum(engine_active_requests) / sum(engine_max_active_requests))[1m:1m])"
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_to_api_pod_ratio"
seriesQuery: 'kube_deployment_labels{label_app="deepgram-engine"}'
metricsQuery: '(sum(kube_deployment_status_replicas and on(deployment) kube_deployment_labels{label_app="deepgram-engine"})) / (sum(kube_deployment_status_replicas and on(deployment) kube_deployment_labels{label_app="deepgram-api"}))'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }