# values.yaml
global:
# -- (string) If using images from the Deepgram Quay image repositories,
# or another private registry to which your cluster doesn't have default access,
# you will need to provide a pre-configured K8s Secret
# with image repository credentials. See chart docs for more details.
pullSecretRef:
# -- (string) Name of the pre-configured K8s Secret containing your Deepgram
# self-hosted API key. See chart docs for more details.
deepgramSecretRef:
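# A minimal sketch of these two references, assuming hypothetical Secrets
# named "dg-regcred" and "dg-self-hosted-api-key" that were created in the
# release namespace beforehand:
#
# pullSecretRef: "dg-regcred"
# deepgramSecretRef: "dg-self-hosted-api-key"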
# -- Additional labels to add to all Deepgram resources
additionalLabels: {}
# -- When an API or Engine container is signaled to shut down via Kubernetes sending a SIGTERM
# signal, the container will stop listening on its port, and no new requests will be routed
# to that container. However, the container will continue to run until all existing
# batch or streaming requests have completed, after which it will gracefully shut down.
#
# Batch requests should finish within 10-15 minutes, but streaming requests can proceed indefinitely.
#
# outstandingRequestGracePeriod defines the period (in seconds) after which Kubernetes will forcibly
# shut down the container, terminating any outstanding connections. 1800 sec / 60 sec/min = 30 min
outstandingRequestGracePeriod: 1800
# -- Configuration options for horizontal scaling of Deepgram
# services. Use either static replica counts (`replicas`) or autoscaling (`auto`), not both.
# @default -- ``
scaling:
# -- Number of replicas to set during initial installation.
# @default -- ``
replicas:
api: 1
engine: 1
# -- Enable pod autoscaling based on system load/traffic.
# @default -- ``
auto:
enabled: false
api:
metrics:
# -- Scale the API deployment to maintain this Engine-to-API pod ratio
engineToApiRatio: 4
# -- (list) If you have custom metrics you would like to scale with, you may add them here.
# See the [k8s docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
# for how to structure a list of metrics
custom:
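# A sketch of one possible list item, following the HPA v2 metric format
# from the k8s docs linked above (the metric name "requests_per_second" is
# a hypothetical example, not one exported by this chart):
#
# custom:
#   - type: Pods
#     pods:
#       metric:
#         name: requests_per_second
#       target:
#         type: AverageValue
#         averageValue: "100"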
# -- [Configurable scaling behavior](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior)
# @default -- "*See values.yaml file for default*"
behavior:
scaleDown:
policies:
- type: Pods
value: 1
periodSeconds: 60
- type: Percent
value: 25
periodSeconds: 60
engine:
# -- Minimum number of Engine replicas.
minReplicas: 1
# -- Maximum number of Engine replicas.
maxReplicas: 10
metrics:
# -- If `engine.concurrencyLimit.activeRequests` is set, this variable will
# define the ratio of current active requests to maximum active requests at which
# the Engine pods will scale. Setting this value too close to 1.0 may lead to a situation where
# the cluster is at max capacity and rejects incoming requests. Setting the ratio too close to 0.0
# will over-optimistically scale your cluster and increase compute costs unnecessarily.
requestCapacityRatio:
speechToText:
batch:
# -- (int) Scale the Engine pods based on a static desired number of speech-to-text batch requests per pod
requestsPerPod:
streaming:
# -- (int) Scale the Engine pods based on a static desired number of speech-to-text streaming requests per pod
requestsPerPod:
textToSpeech:
batch:
# -- (int) Scale the Engine pods based on a static desired number of text-to-speech batch requests per pod
requestsPerPod:
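# For example, to scale on a target of 30 concurrent speech-to-text streams
# per Engine pod (an illustrative number; tune it to your hardware), you
# could set:
#
# speechToText:
#   streaming:
#     requestsPerPod: 30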
# -- If you have custom metrics you would like to scale with, you may add them here.
# See the [k8s docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
# for how to structure a list of metrics
custom: []
# -- [Configurable scaling behavior](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior)
# @default -- "*See values.yaml file for default*"
behavior:
scaleDown:
policies:
- type: Pods
value: 1
periodSeconds: 60
- type: Percent
value: 25
periodSeconds: 60
api:
# -- namePrefix is the prefix to apply to the name of all K8s objects
# associated with the Deepgram API containers.
namePrefix: "deepgram-api"
image:
# -- path configures the image path to use for creating API containers.
# You may change this from the public Quay image path if you have imported
# Deepgram images into a private container registry.
path: quay.io/deepgram/self-hosted-api
# -- pullPolicy configures how the Kubelet attempts to pull the Deepgram API image
pullPolicy: IfNotPresent
# -- tag defines which Deepgram release to use for API containers
tag: release-250130
# -- Additional labels to add to API resources
additionalLabels: {}
# -- (object) Additional annotations to add to the API deployment
additionalAnnotations:
updateStrategy:
rollingUpdate:
# -- The maximum number of API pods, relative to the number of replicas,
# that can go offline during a rolling update. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-unavailable)
# for more details.
maxUnavailable: 0
# -- The maximum number of extra API pods that can be created during a rollingUpdate,
# relative to the number of replicas. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-surge)
# for more details.
maxSurge: 1
# -- Configure resource limits per API container. See
# [Deepgram's documentation](https://developers.deepgram.com/docs/self-hosted-deployment-environments#api)
# for more details.
# @default -- ``
resources:
requests:
memory: "4Gi"
cpu: "2000m"
limits:
memory: "8Gi"
cpu: "4000m"
# -- Readiness probe customization for API pods.
# @default -- ``
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 1
# -- Liveness probe customization for API pods.
# @default -- ``
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
# -- [Affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity)
# to apply for API pods.
affinity: {}
# -- [Tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
# to apply to API pods.
tolerations: []
# -- [Security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for API pods.
securityContext: {}
serviceAccount:
# -- Specifies whether to create a default service account for the Deepgram API Deployment.
create: true
# -- (string) Allows providing a custom service account name for the API component.
# If left empty, the default service account name will be used.
# If specified, and `api.serviceAccount.create = true`, this defines the name of the default service account.
# If specified, and `api.serviceAccount.create = false`, this provides the name of a preconfigured service account
# you wish to attach to the API deployment.
name:
# -- Configure how the API will listen for your requests
# @default -- ``
server:
# -- baseUrl is the prefix for requests to the API.
baseUrl: "/v1"
# -- host is the IP address to listen on. You will want to listen
# on all interfaces to interact with other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on.
port: 8080
# -- callbackConnTimeout configures how long to wait for a connection to a callback URL.
# See [Deepgram's callback documentation](https://developers.deepgram.com/docs/callback)
# for more details. The value should be a humantime duration.
callbackConnTimeout: "1s"
# -- callbackTimeout configures how long to wait for a response from a callback URL.
# See [Deepgram's callback documentation](https://developers.deepgram.com/docs/callback)
# for more details. The value should be a humantime duration.
callbackTimeout: "10s"
# -- fetchConnTimeout configures how long to wait for a connection to a fetch URL.
# The value should be a humantime duration.
# A fetch URL is a URL passed in an inference request from which a payload should be
# downloaded.
fetchConnTimeout: "1s"
# -- fetchTimeout configures how long to wait for a response from a fetch URL.
# The value should be a humantime duration.
# A fetch URL is a URL passed in an inference request from which a payload should be
# downloaded.
fetchTimeout: "60s"
# -- Specify custom DNS resolution options.
# @default -- ``
resolver:
# -- nameservers allows for specifying custom domain name server(s).
# A valid list item's format is "{IP} {PORT} {PROTOCOL (tcp or udp)}",
# e.g. `"127.0.0.1 53 udp"`.
nameservers: []
# -- (int) maxTTL sets the DNS TTL value if specifying a custom DNS nameserver.
maxTTL:
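# For example, to resolve via a hypothetical in-cluster DNS service at
# 10.96.0.10 and cap record TTLs at 60 seconds:
#
# nameservers:
#   - "10.96.0.10 53 udp"
# maxTTL: 60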
# -- Enable ancillary features
# @default -- ``
features:
# -- Enables entity detection on pre-recorded audio
# *if* a valid entity detection model is available.
# *WARNING*: Beta functionality.
entityDetection: false
# -- Enables entity-based redaction on pre-recorded audio
# *if* a valid entity detection model is available.
# *WARNING*: Beta functionality.
entityRedaction: false
# -- If the API is receiving requests faster than Engine can process them, a request
# queue will form. By default, this queue is stored in memory. Under high load,
# the queue may grow too large and cause Out-Of-Memory errors. To avoid this,
# set a diskBufferPath to buffer the request queue's overflow to disk.
#
# WARN: This is only intended to temporarily buffer requests during high load.
# If there is not enough Engine capacity to process the queued requests over time,
# the queue (and response time) will grow indefinitely.
diskBufferPath:
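# For example, to buffer overflow to a hypothetical path on an attached
# volume:
#
# diskBufferPath: "/dg/request-buffer"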
# -- driverPool configures the backend pool of speech engines (generically referred to as
# "drivers" here). The API will load-balance among drivers in the standard
# pool; if one standard driver fails, the next one will be tried.
# @default -- ``
driverPool:
# -- standard is the main driver pool to use.
# @default -- ``
standard:
# -- timeoutBackoff is the factor to increase the timeout by
# for each additional retry (for exponential backoff).
timeoutBackoff: 1.2
# -- retrySleep defines the initial sleep period (in humantime duration)
# before attempting a retry.
retrySleep: "2s"
# -- retryBackoff is the factor to increase the retrySleep
# by for each additional retry (for exponential backoff).
retryBackoff: 1.6
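# With the defaults above, the sleep before each successive retry grows
# geometrically: 2s, then 2s * 1.6 = 3.2s, then 3.2s * 1.6 = 5.12s, and so on.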
# -- Maximum response size to deserialize from a driver (in bytes).
# The default is 1 GiB, expressed in bytes.
maxResponseSize: "1073741824"
engine:
# -- namePrefix is the prefix to apply to the name of all K8s objects
# associated with the Deepgram Engine containers.
namePrefix: "deepgram-engine"
image:
# -- path configures the image path to use for creating Engine containers.
# You may change this from the public Quay image path if you have imported
# Deepgram images into a private container registry.
path: quay.io/deepgram/self-hosted-engine
# -- pullPolicy configures how the Kubelet attempts to pull the Deepgram Engine image
pullPolicy: IfNotPresent
# -- tag defines which Deepgram release to use for Engine containers
tag: release-250130
# -- Additional labels to add to Engine resources
additionalLabels: {}
# -- (object) Additional annotations to add to the Engine deployment
additionalAnnotations:
updateStrategy:
rollingUpdate:
# -- The maximum number of Engine pods, relative to the number of replicas,
# that can go offline during a rolling update. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-unavailable)
# for more details.
maxUnavailable: 0
# -- The maximum number of extra Engine pods that can be created during a rollingUpdate,
# relative to the number of replicas. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-surge)
# for more details.
maxSurge: 1
# -- Configure resource limits per Engine container. See
# [Deepgram's documentation](https://developers.deepgram.com/docs/self-hosted-deployment-environments#engine)
# for more details.
# @default -- ``
resources:
requests:
memory: "30Gi"
cpu: "4000m"
# -- gpu maps to the nvidia.com/gpu resource parameter
gpu: 1
limits:
memory: "40Gi"
cpu: "8000m"
# -- gpu maps to the nvidia.com/gpu resource parameter
gpu: 1
# -- The startupProbe combination of `periodSeconds` and `failureThreshold` allows
# time for the container to load all models and start listening for incoming requests.
#
# Model load time can be affected by hardware I/O speeds, as well as network speeds
# if you are using a network volume mount for the models.
#
# If you are hitting the failure threshold before models are finished loading, you may
# want to extend the startup probe. However, this will also extend the time it takes
# to detect a pod that can't establish a network connection to validate its license.
# @default -- ``
startupProbe:
# -- periodSeconds defines how often to execute the probe.
periodSeconds: 10
# -- failureThreshold defines how many unsuccessful startup probe attempts
# are allowed before the container will be marked as Failed
failureThreshold: 60
# -- Readiness probe customization for Engine pods.
# @default -- ``
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 1
# -- Liveness probe customization for Engine pods.
# @default -- ``
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
# -- [Affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity)
# to apply for Engine pods.
affinity: {}
# -- [Tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
# to apply to Engine pods.
tolerations: []
# -- [Security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for Engine pods.
securityContext: {}
serviceAccount:
# -- Specifies whether to create a default service account for the Deepgram Engine Deployment.
create: true
# -- (string) Allows providing a custom service account name for the Engine component.
# If left empty, the default service account name will be used.
# If specified, and `engine.serviceAccount.create = true`, this defines the name of the default service account.
# If specified, and `engine.serviceAccount.create = false`, this provides the name of a preconfigured service account
# you wish to attach to the Engine deployment.
name:
concurrencyLimit:
# -- (int) activeRequests limits the number of active requests handled by
# a single Engine container.
# If additional requests beyond the limit are sent, the API container
# forming the request will try a different Engine pod. If no Engine pods
# are able to accept the request, the API will return a 429 HTTP response
# to the client. The `nil` default means no limit will be set.
activeRequests:
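# For example, to cap each Engine pod at 400 concurrent requests (an
# illustrative number; size it to your GPU's measured capacity):
#
# activeRequests: 400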
# -- Configure Engine containers to listen for requests from API containers.
# @default -- ``
server:
# -- host is the IP address to listen on for inference requests.
# You will want to listen on all interfaces to interact with
# other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on for inference requests
port: 8080
# -- metricsServer exposes an endpoint on each Engine container
# for reporting inference-specific system metrics.
# See https://developers.deepgram.com/docs/metrics-guide#deepgram-engine
# for more details.
# @default -- ``
metricsServer:
# -- host is the IP address to listen on for metrics requests.
# You will want to listen on all interfaces to interact with
# other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on for metrics requests
port: 9991
modelManager:
volumes:
customVolumeClaim:
# -- You may manually create your own PersistentVolume and PersistentVolumeClaim to store and
# expose model files to the Deepgram Engine. Configure your storage beforehand,
# and enable it here.
# Note: Make sure the PV and PVC accessModes are set to `ReadWriteMany` or `ReadOnlyMany`
enabled: false
# -- (string) Name of your pre-configured PersistentVolumeClaim
name:
# -- Name of the directory within your pre-configured PersistentVolume
# where the models are stored
modelsDirectory: "/"
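# A sketch assuming a hypothetical pre-created RWX PersistentVolumeClaim
# named "dg-models-pvc" whose volume stores model files under /models:
#
# customVolumeClaim:
#   enabled: true
#   name: "dg-models-pvc"
#   modelsDirectory: "/models"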
aws:
efs:
# -- Whether to use an [AWS Elastic File System](https://aws.amazon.com/efs/)
# to store Deepgram models for use by Engine containers.
# This option requires your cluster to be running in
# [AWS EKS](https://aws.amazon.com/eks/).
enabled: false
# -- Name prefix for the resources associated with the model storage in AWS EFS.
namePrefix: dg-models
# -- (string) FileSystemId of existing AWS Elastic File System where
# Deepgram model files will be persisted.
# You can find it using the AWS CLI:
# ```
# $ aws efs describe-file-systems --query "FileSystems[*].FileSystemId"
# ```
fileSystemId:
# -- Whether to force a fresh download of all model links provided,
# even if models are already present in EFS.
forceDownload: false
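# For example, with a hypothetical file system ID of the kind returned by
# the `aws efs describe-file-systems` query above:
#
# efs:
#   enabled: true
#   fileSystemId: "fs-0123456789abcdef0"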
gcp:
gpd:
# -- Whether to use a [GKE Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/persistent-volumes)
# to store Deepgram models for use by Engine containers.
# This option requires your cluster to be running in
# [GCP GKE](https://cloud.google.com/kubernetes-engine).
# See the GKE documentation on
# [using pre-existing persistent disks](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/preexisting-pd).
enabled: false
# -- Name prefix for the resources associated with the model storage in GCP GPD.
namePrefix: dg-models
# -- The storageClassName of the existing persistent disk.
storageClassName: "standard-rwo"
# -- The size of your pre-existing persistent disk.
storageCapacity: "40G"
# -- The identifier of your pre-existing persistent disk.
# The format is projects/{project_id}/zones/{zone_name}/disks/{disk_name} for Zonal persistent disks,
# or projects/{project_id}/regions/{region_name}/disks/{disk_name} for Regional persistent disks.
volumeHandle: ""
fsType: "ext4"
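# For example, a hypothetical zonal disk identifier following the format
# described above:
#
# volumeHandle: "projects/my-project/zones/us-central1-a/disks/dg-models-disk"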
models:
# -- Deprecated field for automatically downloading models. The functionality is still
# supported, but migrating to `engine.modelManager.models.add` is strongly recommended.
links: []
# -- Links to your Deepgram models to automatically download
# into storage backing a persistent volume.
# **Automatic model management is currently supported for AWS EFS volumes only.**
# Insert each model link provided to you by your Deepgram
# Account Representative.
add: []
# -- To remove a model from storage (and reduce the number of models loaded by
# Engine on startup), move a link from the `engine.modelManager.models.add` section
# to this section. You can also use a model name instead of the full link to designate
# it for removal.
# **Automatic model management is currently supported for AWS EFS volumes only.**
remove: []
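# A sketch of adding one model and removing another (placeholder values;
# substitute the actual links provided by your Deepgram Account
# Representative):
#
# add:
#   - "<model-link-from-deepgram>"
# remove:
#   - "<model-name-to-remove>"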
# -- chunking defines the size of audio chunks to process in seconds.
# Adjusting these values will affect both inference performance and accuracy
# of results. Please contact your Deepgram Account Representative if you
# want to adjust any of these values.
# @default -- ``
chunking:
speechToText:
batch:
# -- (float) minDuration is the minimum audio duration for a STT chunk size for a batch request
minDuration:
# -- (float) maxDuration is the maximum audio duration for a STT chunk size for a batch request
maxDuration:
streaming:
# -- (float) minDuration is the minimum audio duration for a STT chunk size for a streaming request
minDuration:
# -- (float) maxDuration is the maximum audio duration for a STT chunk size for a streaming request
maxDuration:
# -- step defines how often to return interim results, in seconds.
# This value may be lowered to increase the frequency of interim results.
# However, this also causes a significant decrease in the number of concurrent
# streams supported by a single GPU. Please contact your Deepgram Account
# representative for more details.
step: 1.0
halfPrecision:
# -- Engine will automatically enable half precision operations if your GPU supports
# them. You can explicitly enable or disable this behavior with the state parameter,
# which supports `"enabled"`, `"disabled"`, and `"auto"`.
state: "auto"
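# For example, to force half precision off on a GPU with poor FP16
# performance:
#
# halfPrecision:
#   state: "disabled"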
# -- Configuration options for the optional
# [Deepgram License Proxy](https://developers.deepgram.com/docs/license-proxy).
# @default -- ``
licenseProxy:
# -- The License Proxy is optional, but highly recommended to be deployed in production
# to enable highly available environments.
enabled: false
# -- If the License Proxy is deployed, one replica should be sufficient to
# support many API/Engine pods.
# Highly available environments may wish to deploy a second replica to ensure
# uptime, which can be toggled with this option.
deploySecondReplica: false
# -- Even with a License Proxy deployed, API and Engine pods can be configured to keep the
# upstream `license.deepgram.com` license server as a fallback licensing option if the
# License Proxy is unavailable.
# Disable this option if you are restricting API/Engine Pod network access for security reasons,
# and only the License Proxy should send egress traffic to the upstream license server.
keepUpstreamServerAsBackup: true
# -- namePrefix is the prefix to apply to the name of all K8s objects
# associated with the Deepgram License Proxy containers.
namePrefix: "deepgram-license-proxy"
image:
# -- path configures the image path to use for creating License Proxy containers.
# You may change this from the public Quay image path if you have imported
# Deepgram images into a private container registry.
path: quay.io/deepgram/self-hosted-license-proxy
# -- tag defines which Deepgram release to use for License Proxy containers
tag: release-250130
# -- pullPolicy configures how the Kubelet attempts to pull the Deepgram
# License Proxy image
pullPolicy: IfNotPresent
# -- Additional labels to add to License Proxy resources
additionalLabels: {}
# -- (object) Additional annotations to add to the LicenseProxy deployment
additionalAnnotations:
updateStrategy:
# -- For the LicenseProxy, we only expose maxSurge and not maxUnavailable.
# This is to avoid accidentally having all LicenseProxy pods go offline during upgrades,
# which could impact the entire cluster's connection to the Deepgram License Server.
# @default -- ``
rollingUpdate:
# -- The maximum number of extra License Proxy pods that can be created during a rollingUpdate,
# relative to the number of replicas. See the
# [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#max-surge)
# for more details.
maxSurge: 1
# -- Configure resource limits per License Proxy container. See
# [Deepgram's documentation](https://developers.deepgram.com/docs/license-proxy#system-requirements)
# for more details.
# @default -- ``
resources:
requests:
memory: "1Gi"
cpu: "1000m"
limits:
memory: "8Gi"
cpu: "2000m"
# -- Readiness probe customization for License Proxy pods.
# @default -- ``
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 1
# -- Liveness probe customization for License Proxy pods.
# @default -- ``
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 3
# -- [Affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity)
# to apply for License Proxy pods.
affinity: {}
# -- [Tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
# to apply to License Proxy pods.
tolerations: []
# -- [Security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) for License Proxy pods.
securityContext: {}
serviceAccount:
# -- Specifies whether to create a default service account for the Deepgram License Proxy Deployment.
create: true
# -- (string) Allows providing a custom service account name for the LicenseProxy component.
# If left empty, the default service account name will be used.
# If specified, and `licenseProxy.serviceAccount.create = true`, this defines the name of the default service account.
# If specified, and `licenseProxy.serviceAccount.create = false`, this provides the name of a preconfigured service account
# you wish to attach to the License Proxy deployment.
name:
# -- Configure how the license proxy will listen for licensing requests.
# @default -- ``
server:
# -- host is the IP address to listen on. You will want to listen
# on all interfaces to interact with other pods in the cluster.
host: "0.0.0.0"
# -- port to listen on.
port: 8443
# -- baseUrl is the prefix for incoming license verification requests.
baseUrl: "/"
# -- statusPort is the port to listen on for the status/health endpoint.
statusPort: 8080
# -- Passthrough values for [NVIDIA GPU Operator Helm chart](https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml)
# You may use the NVIDIA GPU Operator to manage installation of NVIDIA drivers and the container toolkit on nodes with attached GPUs.
# @default -- ``
gpu-operator:
# -- Whether to install the NVIDIA GPU Operator to manage driver and/or container toolkit installation.
# See the list of [supported Operating Systems](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html#supported-operating-systems-and-kubernetes-platforms)
# to verify compatibility with your cluster/nodes. Disable this option if your cluster/nodes are not compatible.
# If disabled, you will need to self-manage NVIDIA software installation on all nodes where you want
# to schedule Deepgram Engine pods.
enabled: true
driver:
# -- Whether to install NVIDIA drivers on nodes where a NVIDIA GPU is detected.
# If your Kubernetes nodes run a base image that comes with NVIDIA drivers pre-configured,
# disable this option, but keep the parent `gpu-operator` and sibling `toolkit`
# options enabled.
enabled: true
# -- NVIDIA driver version to install.
version: "550.54.15"
toolkit:
# -- Whether to install the NVIDIA container toolkit on nodes where a NVIDIA GPU is detected.
enabled: true
# -- NVIDIA container toolkit to install. The default `ubuntu` image tag for the
# toolkit requires a dynamic runtime link to a version of GLIBC that may not be
# present on nodes running older Linux distribution releases, such as Ubuntu 22.04.
# Therefore, we specify the `ubi8` image, which statically links the GLIBC library
# and avoids this issue.
version: v1.15.0-ubi8
cluster-autoscaler:
# -- Set to `true` to enable node autoscaling with AWS EKS. Not needed for GKE, as autoscaling is enabled by a
# [CLI option on cluster creation](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-autoscaler#creating_a_cluster_with_autoscaling).
enabled: false
rbac:
serviceAccount:
# -- Name of the IAM Service Account with the [necessary permissions](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md#permissions)
name: cluster-autoscaler-sa
annotations:
# -- (string) Replace with the AWS Role ARN configured for the Cluster Autoscaler.
# See the [Deepgram AWS EKS guide](https://developers.deepgram.com/docs/aws-k8s#creating-a-cluster)
# or [Cluster Autoscaler AWS documentation](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md#permissions)
# for details.
eks.amazonaws.com/role-arn:
autoDiscovery:
# -- (string) Name of your AWS EKS cluster. Using the [Cluster Autoscaler](https://github.com/kubernetes/autoscaler)
# on AWS requires knowledge of certain cluster metadata.
clusterName:
# -- (string) Region of your AWS EKS cluster. Using the [Cluster Autoscaler](https://github.com/kubernetes/autoscaler)
# on AWS requires knowledge of certain cluster metadata.
awsRegion:
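# A sketch with hypothetical cluster metadata (replace the account ID, role
# name, cluster name, and region with your own):
#
# rbac:
#   serviceAccount:
#     annotations:
#       eks.amazonaws.com/role-arn: "arn:aws:iam::123456789012:role/ClusterAutoscalerRole"
# autoDiscovery:
#   clusterName: "my-eks-cluster"
# awsRegion: "us-west-2"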
# -- Passthrough values for [Prometheus k8s stack Helm chart](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack).
# Prometheus (and its adapter) should be configured when scaling.auto is enabled.
# You may choose to use the installation/configuration bundled in this Helm chart,
# or you may configure an existing Prometheus installation in your cluster to expose
# the needed values.
# See source Helm chart for explanation of available values. Default values provided in this chart are used
# to provide pod autoscaling for Deepgram pods.
# @default -- ``
kube-prometheus-stack:
# -- (bool) Normally, this chart will be installed if `scaling.auto.enabled` is true. However, if you wish
# to manage the Prometheus adapter in your cluster on your own and not as part of the Deepgram Helm chart,
# you can force it to not be installed by setting this to `false`.
includeDependency:
fullnameOverride: "dg-prometheus-stack"
prometheus:
prometheusSpec:
additionalScrapeConfigs:
- job_name: "dg_engine_metrics"
scrape_interval: "2s"
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- "{{ .Release.Namespace }}"
relabel_configs:
- source_labels: [__meta_kubernetes_service_name]
regex: "(.*)-metrics"
action: keep
- source_labels: [__meta_kubernetes_endpoint_port_name]
regex: "metrics"
action: keep
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
- source_labels: [__meta_kubernetes_service_name]
target_label: service
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
prometheusOperator:
enabled: true
alertmanager:
enabled: false
grafana:
enabled: true
nodeExporter:
enabled: false
kube-state-metrics:
enabled: true
metricLabelsAllowlist:
- namespaces=[{{ .Release.Namespace }}],deployments=[app]
# -- Passthrough values for [Prometheus Adapter Helm chart](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-adapter).
# Prometheus, and its adapter here, should be configured when scaling.auto is enabled.
# You may choose to use the installation/configuration bundled in this Helm chart,
# or you may configure an existing Prometheus installation in your cluster to expose
# the needed values.
# See source Helm chart for explanation of available values. Default values provided in this chart are used
# to provide pod autoscaling for Deepgram pods.
# @default -- ``
prometheus-adapter:
# -- Normally, this chart will be installed if `scaling.auto.enabled` is true. However, if you wish
# to manage the Prometheus adapter in your cluster on your own and not as part of the Deepgram Helm chart,
# you can force it to not be installed by setting this to `false`.
includeDependency:
prometheus:
url: http://dg-prometheus-stack-prometheus.{{ .Release.Namespace }}.svc
rules:
default: false
external:
- name:
as: "engine_active_requests_stt_streaming"
seriesQuery: 'engine_active_requests{kind="stream"}'
metricsQuery: 'avg(engine_active_requests{kind="stream"})'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_active_requests_stt_batch"
seriesQuery: 'engine_active_requests{kind="batch"}'
metricsQuery: 'avg(engine_active_requests{kind="batch"})'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_active_requests_tts_batch"
seriesQuery: 'engine_active_requests{kind="tts"}'
metricsQuery: 'avg(engine_active_requests{kind="tts"})'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_estimated_stream_capacity"
seriesQuery: 'engine_active_requests{kind="stream"}'
metricsQuery: 'avg_over_time((sum(engine_active_requests{kind="stream"}) / sum(engine_estimated_stream_capacity))[1m:1m])'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_requests_active_to_max_ratio"
seriesQuery: "engine_max_active_requests"
metricsQuery: "avg_over_time((sum(engine_active_requests) / sum(engine_max_active_requests))[1m:1m])"
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }
service: { resource: "service" }
- name:
as: "engine_to_api_pod_ratio"
seriesQuery: 'kube_deployment_labels{label_app="deepgram-engine"}'
metricsQuery: '(sum(kube_deployment_status_replicas and on(deployment) kube_deployment_labels{label_app="deepgram-engine"})) / (sum(kube_deployment_status_replicas and on(deployment) kube_deployment_labels{label_app="deepgram-api"}))'
resources:
overrides:
namespace: { resource: "namespace" }
pod: { resource: "pod" }