Skip to content
This repository has been archived by the owner on Jul 21, 2023. It is now read-only.

Commit

Permalink
Add autoscaling for intake/aggregate tasks. (#1042)
Browse files Browse the repository at this point in the history
Autoscaling is based on the queue depth: if there is a queue, we'll
scale up. The current policy is to stabilize over 5 minutes, then
add/remove one replica per minute.
  • Loading branch information
branlwyd authored Oct 20, 2021
1 parent d8d61f4 commit 42dd9da
Show file tree
Hide file tree
Showing 9 changed files with 640 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
- uses: actions/checkout@v2
- uses: hashicorp/setup-terraform@v1
with:
terraform_version: 0.14.4
terraform_version: 0.14.8
- name: Terraform fmt
run: terraform fmt --check --recursive
- name: Terraform init
Expand Down
46 changes: 39 additions & 7 deletions terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,14 @@ variable "ingestors" {
type = map(object({
manifest_base_url = string
localities = map(object({
intake_worker_count = number
aggregate_worker_count = number
intake_worker_count = optional(number) # Deprecated: set {min,max}_intake_worker_count instead.
min_intake_worker_count = optional(number)
max_intake_worker_count = optional(number)

aggregate_worker_count = optional(number) # Deprecated: set {min,max}_aggregate_worker_count instead.
min_aggregate_worker_count = optional(number)
max_aggregate_worker_count = optional(number)

peer_share_processor_manifest_base_url = optional(string)
portal_server_manifest_base_url = optional(string)
aggregation_period = optional(string)
Expand Down Expand Up @@ -230,7 +236,7 @@ variable "cluster_settings" {
terraform {
backend "gcs" {}

required_version = ">= 0.14.4"
required_version = ">= 0.14.8"

# https://www.terraform.io/docs/language/expressions/type-constraints.html#experimental-optional-object-type-attributes
experiments = [module_variable_optional_attrs]
Expand Down Expand Up @@ -324,6 +330,9 @@ provider "kubernetes" {
host = local.kubernetes_cluster.endpoint
cluster_ca_certificate = base64decode(local.kubernetes_cluster.certificate_authority_data)
token = local.kubernetes_cluster.token
experiments {
manifest_resource = true
}
}

provider "helm" {
Expand Down Expand Up @@ -420,8 +429,22 @@ locals {
kubernetes_namespace = kubernetes_namespace.namespaces[pair[0]].metadata[0].name
packet_decryption_key_kubernetes_secret = kubernetes_secret.ingestion_packet_decryption_keys[pair[0]].metadata[0].name
ingestor_manifest_base_url = var.ingestors[pair[1]].manifest_base_url
intake_worker_count = var.ingestors[pair[1]].localities[pair[0]].intake_worker_count
aggregate_worker_count = var.ingestors[pair[1]].localities[pair[0]].aggregate_worker_count
min_intake_worker_count = coalesce(
var.ingestors[pair[1]].localities[pair[0]].min_intake_worker_count,
var.ingestors[pair[1]].localities[pair[0]].intake_worker_count
)
max_intake_worker_count = coalesce(
var.ingestors[pair[1]].localities[pair[0]].max_intake_worker_count,
var.ingestors[pair[1]].localities[pair[0]].intake_worker_count
)
min_aggregate_worker_count = coalesce(
var.ingestors[pair[1]].localities[pair[0]].min_aggregate_worker_count,
var.ingestors[pair[1]].localities[pair[0]].aggregate_worker_count
)
max_aggregate_worker_count = coalesce(
var.ingestors[pair[1]].localities[pair[0]].max_aggregate_worker_count,
var.ingestors[pair[1]].localities[pair[0]].aggregate_worker_count
)
peer_share_processor_manifest_base_url = coalesce(
var.ingestors[pair[1]].localities[pair[0]].peer_share_processor_manifest_base_url,
var.default_peer_share_processor_manifest_base_url
Expand Down Expand Up @@ -536,8 +559,10 @@ module "data_share_processors" {
facilitator_image = var.facilitator_image
facilitator_version = var.facilitator_version
container_registry = var.container_registry
intake_worker_count = each.value.intake_worker_count
aggregate_worker_count = each.value.aggregate_worker_count
min_intake_worker_count = each.value.min_intake_worker_count
max_intake_worker_count = each.value.max_intake_worker_count
min_aggregate_worker_count = each.value.min_aggregate_worker_count
max_aggregate_worker_count = each.value.max_aggregate_worker_count
eks_oidc_provider = var.use_aws ? module.eks[0].oidc_provider : { url = "", arn = "" }
gcp_workload_identity_pool_provider = local.gcp_workload_identity_pool_provider
}
Expand Down Expand Up @@ -659,6 +684,13 @@ module "portal_server_resources" {
depends_on = [module.gke]
}

# Instantiates the local ./modules/custom_metrics module, passing through the
# deployment environment and the AWS toggle.
module "custom_metrics" {
source = "./modules/custom_metrics"
environment = var.environment
use_aws = var.use_aws
# When use_aws is false, an object with empty url/arn strings is passed in
# place of the EKS OIDC provider. NOTE(review): module.eks is indexed with
# [0], so it is presumably count-gated on use_aws -- confirm.
eks_oidc_provider = var.use_aws ? module.eks[0].oidc_provider : { url = "", arn = "" }
}

# The monitoring module is disabled for now because it needs some AWS tweaks
# (wire up an EBS volume for metrics storage and forward SQS metrics into
# Prometheus). I'm commenting it out instead of putting in a count variable
Expand Down
8 changes: 7 additions & 1 deletion terraform/modules/account_mapping/account_mapping.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ variable "aws_iam_role_name" {
default = ""
}

# List of policy ARNs assigned to the managed_policy_arns argument of
# aws_iam_role.iam_role in this module. Defaults to an empty list.
variable "aws_iam_role_managed_policy_arns" {
type = list(string)
default = []
}

variable "eks_oidc_provider" {
type = object({
arn = string
Expand Down Expand Up @@ -81,7 +86,8 @@ resource "google_service_account" "account" {
resource "aws_iam_role" "iam_role" {
count = var.aws_iam_role_name != "" ? 1 : 0

name = var.aws_iam_role_name
name = var.aws_iam_role_name
managed_policy_arns = var.aws_iam_role_managed_policy_arns

assume_role_policy = jsonencode({
Version = "2012-10-17"
Expand Down
Loading

0 comments on commit 42dd9da

Please sign in to comment.