From dfc5b27108876485986bdb9a79f6d8afa0b5bdde Mon Sep 17 00:00:00 2001 From: Johnny Graettinger Date: Mon, 4 Nov 2024 15:05:12 -0600 Subject: [PATCH] .github: deployment automation for data-plane-controller and agent-api Introduce workflow_dispatch actions for building and deploying these control-plane services to Cloud Run, using workload identity federation. Include Dockerfile infrastructure and entrypoints for placing secrets into expected locations. Minor tweaks to data-plane-controller, adjusting defaults to work better with Cloud Run and also to support IPv4 addresses for Ansible hosts, which is required due to our dependency on Cloud Run (which only supports IPv4, paired with Cloud NAT). --- .github/workflows/deploy-agent-api.yaml | 61 ++++++++++++++++++ .../deploy-data-plane-controller.yaml | 64 +++++++++++++++++++ crates/agent/Dockerfile | 31 +++++++++ crates/agent/entrypoint.sh | 11 ++++ crates/automations/Cargo.toml | 4 +- crates/automations/src/server.rs | 6 +- crates/data-plane-controller/Dockerfile | 49 ++++++++++++++ crates/data-plane-controller/entrypoint.sh | 40 ++++++++++++ crates/data-plane-controller/src/lib.rs | 7 +- crates/data-plane-controller/src/stack.rs | 4 +- 10 files changed, 268 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/deploy-agent-api.yaml create mode 100644 .github/workflows/deploy-data-plane-controller.yaml create mode 100644 crates/agent/Dockerfile create mode 100755 crates/agent/entrypoint.sh create mode 100644 crates/data-plane-controller/Dockerfile create mode 100755 crates/data-plane-controller/entrypoint.sh diff --git a/.github/workflows/deploy-agent-api.yaml b/.github/workflows/deploy-agent-api.yaml new file mode 100644 index 0000000000..7f6a14f01e --- /dev/null +++ b/.github/workflows/deploy-agent-api.yaml @@ -0,0 +1,61 @@ +name: Deploy agent-api + +on: + workflow_dispatch: {} + # TODO(johnny): Remove after merging. 
+ push: + branches: [johnny/dpc-cd] + +env: + CARGO_INCREMENTAL: 0 # Faster from-scratch builds. + +jobs: + build: + runs-on: ubuntu-24.04 + permissions: + # Permissions required of the Github token in order for + # federated identity and authorization to work. + contents: read + id-token: write + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + lfs: true + + - uses: supabase/setup-cli@v1 + - run: supabase start + + - name: Build `agent` + run: cargo build --release -p agent + + - run: mv target/release/agent crates/agent/ + + - name: Authenticate with GCP Workload Identity Federation + uses: google-github-actions/auth@v2 + with: + service_account: cd-github-actions@estuary-control.iam.gserviceaccount.com + workload_identity_provider: projects/1084703453822/locations/global/workloadIdentityPools/github-actions/providers/github-actions-provider + + - name: Update Cloud Run service `agent-api` + uses: google-github-actions/deploy-cloudrun@v2 + with: + service: agent-api + project_id: estuary-control + region: us-central1 + source: crates/agent/ + timeout: 10m + + env_vars: |- + BUILDS_ROOT=gs://estuary-control/builds/ + DATABASE_CA=/etc/db-ca.crt + DATABASE_URL=postgresql://postgres@db.eyrcnmuzzyriypdajwdk.supabase.co:5432/postgres + NO_COLOR=1 + + secrets: |- + PGPASSWORD=POSTGRES_PASSWORD:latest + CONTROL_PLANE_DB_CA_CERT=CONTROL_PLANE_DB_CA_CERT:latest + + env_vars_update_strategy: overwrite + secrets_update_strategy: overwrite diff --git a/.github/workflows/deploy-data-plane-controller.yaml b/.github/workflows/deploy-data-plane-controller.yaml new file mode 100644 index 0000000000..93a91cf4a7 --- /dev/null +++ b/.github/workflows/deploy-data-plane-controller.yaml @@ -0,0 +1,64 @@ +name: Deploy data-plane-controller + +on: + workflow_dispatch: {} + # TODO(johnny): Remove after merging. + push: + branches: [johnny/dpc-cd] + +env: + CARGO_INCREMENTAL: 0 # Faster from-scratch builds. 
+ +jobs: + build: + runs-on: ubuntu-24.04 + permissions: + # Permissions required of the Github token in order for + # federated identity and authorization to work. + contents: read + id-token: write + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + lfs: true + + - uses: supabase/setup-cli@v1 + - run: supabase start + + - name: Build `data-plane-controller` + run: cargo build --release -p data-plane-controller + + - run: mv target/release/data-plane-controller crates/data-plane-controller/ + + - name: Authenticate with GCP Workload Identity Federation + uses: google-github-actions/auth@v2 + with: + service_account: cd-github-actions@estuary-control.iam.gserviceaccount.com + workload_identity_provider: projects/1084703453822/locations/global/workloadIdentityPools/github-actions/providers/github-actions-provider + + - name: Update Cloud Run job `data-plane-controller` + uses: google-github-actions/deploy-cloudrun@v2 + with: + job: data-plane-controller + project_id: estuary-control + region: us-central1 + source: crates/data-plane-controller/ + timeout: 2h # Self-cancels after 1 hour, with 1 hour grace period. + + env_vars: |- + DPC_DATABASE_CA=/etc/db-ca.crt + DPC_DATABASE_URL=postgresql://postgres@db.eyrcnmuzzyriypdajwdk.supabase.co:5432/postgres + NO_COLOR=1 + + secrets: |- + CONTROL_PLANE_DB_CA_CERT=CONTROL_PLANE_DB_CA_CERT:latest + DPC_GITHUB_SSH_KEY=DPC_GITHUB_SSH_KEY:latest + DPC_IAM_CREDENTIALS=DPC_IAM_CREDENTIALS:latest + DPC_SERVICE_ACCOUNT=DPC_SERVICE_ACCOUNT:latest + PGPASSWORD=POSTGRES_PASSWORD:latest + VULTR_API_KEY=DPC_VULTR_API_KEY:latest + + env_vars_update_strategy: overwrite + secrets_update_strategy: overwrite diff --git a/crates/agent/Dockerfile b/crates/agent/Dockerfile new file mode 100644 index 0000000000..d645361fd2 --- /dev/null +++ b/crates/agent/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:noble + +# Install required packages. 
+RUN apt update -y \ + && apt install --no-install-recommends -y \ + ca-certificates \ + s3cmd \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install the `sops` CLI. +RUN curl -L -o /usr/local/bin/sops \ + https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64 \ + && chmod +x /usr/local/bin/sops + +# Copy in our local assets. +COPY agent /usr/local/bin/ +COPY entrypoint.sh /usr/local/bin/ + +ENV BIN_DIR /usr/local/bin/ +ENV RUST_LOG=info + +CMD ["/usr/local/bin/entrypoint.sh"] + +# Example of running this container locally: +# docker run --rm --net=host -it \ +# -e CONTROL_PLANE_DB_CA_CERT="$( /etc/db-ca.crt + +exec agent --allow-origin=https://dashboard.estuary.dev --allow-origin=http://localhost:3000 \ No newline at end of file diff --git a/crates/automations/Cargo.toml b/crates/automations/Cargo.toml index 70a7f166ad..8b27b714ff 100644 --- a/crates/automations/Cargo.toml +++ b/crates/automations/Cargo.toml @@ -14,12 +14,12 @@ models = { path = "../models", features = ["sqlx-support"] } anyhow = { workspace = true } futures = { workspace = true } +rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sqlx = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } -tracing-subscriber = { workspace = true } [dev-dependencies] -rand = { workspace = true } +tracing-subscriber = { workspace = true } \ No newline at end of file diff --git a/crates/automations/src/server.rs b/crates/automations/src/server.rs index 92f0055156..05b520bfda 100644 --- a/crates/automations/src/server.rs +++ b/crates/automations/src/server.rs @@ -227,8 +227,12 @@ async fn ready_tasks_iter( // If permits remain, there were not enough tasks to dequeue. // Sleep for up-to `dequeue_interval`, cancelling early if a task completes. if permits.num_permits() != 0 { + // Jitter dequeue by 10% in either direction, to ensure + // distribution of tasks and retries across executors. 
+ let jitter = 0.9 + rand::random::() * 0.2; // [0.9, 1.1) + tokio::select! { - () = tokio::time::sleep(dequeue_interval) => (), + () = tokio::time::sleep(dequeue_interval.mul_f64(jitter)) => (), _ = semaphore.clone().acquire_owned() => (), // Cancel sleep. } } diff --git a/crates/data-plane-controller/Dockerfile b/crates/data-plane-controller/Dockerfile new file mode 100644 index 0000000000..0661de7b8a --- /dev/null +++ b/crates/data-plane-controller/Dockerfile @@ -0,0 +1,49 @@ +FROM ubuntu:noble + +# Install required packages. +RUN apt update -y \ + && apt install --no-install-recommends -y \ + ca-certificates \ + certbot \ + curl \ + git \ + openssh-client \ + python3-certbot-dns-google \ + python3-poetry \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* + +# Install the `pulumi` CLI. +RUN curl -fsSL https://get.pulumi.com/ | bash -s +RUN ln -s /root/.pulumi/bin/pulumi /usr/local/bin/pulumi + +# Install the `sops` CLI. +RUN curl -L -o /usr/local/bin/sops \ + https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64 \ + && chmod +x /usr/local/bin/sops + +# Copy in our local assets. +COPY data-plane-controller /usr/local/bin/ +COPY entrypoint.sh /usr/local/bin/ + +# AWS profile to expect in ~/.aws/credentials +ENV AWS_PROFILE=data-plane-ops +# GCP Service Account JSON credentials path. +ENV GOOGLE_APPLICATION_CREDENTIALS=/etc/data_plane_controller.json +# Disable host-key checks when cloning our git repo. +ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no" + +ENV RUST_LOG=info + +CMD ["/usr/local/bin/entrypoint.sh"] + +# Example of running this container locally: +# docker run --rm --net=host -it \ +# -e CONTROL_PLANE_DB_CA_CERT="$( /etc/db-ca.crt +printf '%s\n' "${DPC_GITHUB_SSH_KEY}" > /root/ssh_key +printf '%s\n' "${DPC_IAM_CREDENTIALS}" > /root/.aws/credentials +printf '%s\n' "${DPC_SERVICE_ACCOUNT}" > ${GOOGLE_APPLICATION_CREDENTIALS} + +# Start background ssh-agent, evaluate output to set variables, and add SSH key. 
+chmod 0400 /root/ssh_key +eval "$(ssh-agent -s)" +ssh-add /root/ssh_key + +# Log out the IP from which we're running. +echo "Current egress IP:" +curl -s -S http://icanhazip.com + +# Start data-plane-controller in the background +data-plane-controller & +DPC_PID=$! + +# Start a background timer to send SIGINT after one hour. +( + sleep 3600 + kill -INT ${DPC_PID} 2>/dev/null || true +) & + +# Wait for data-plane-controller to exit and surface its status. +set +o errexit +wait ${DPC_PID} +DPC_STATUS=${?} + +echo "data-plane-controller exited with status ${DPC_STATUS}" +exit ${DPC_STATUS} \ No newline at end of file diff --git a/crates/data-plane-controller/src/lib.rs b/crates/data-plane-controller/src/lib.rs index 1ee4ee4354..f4f628113f 100644 --- a/crates/data-plane-controller/src/lib.rs +++ b/crates/data-plane-controller/src/lib.rs @@ -17,19 +17,18 @@ pub struct Args { env = "DPC_DATABASE_URL", default_value = "postgres://postgres:postgres@127.0.0.1:5432/postgres" )] - #[serde(skip_serializing)] database_url: url::Url, /// Path to CA certificate of the database. #[clap(long = "database-ca", env = "DPC_DATABASE_CA")] database_ca: Option, /// Number of tasks which may be polled concurrently. - #[clap(long = "concurrency", env = "DPC_CONCURRENCY", default_value = "2")] + #[clap(long = "concurrency", env = "DPC_CONCURRENCY", default_value = "1")] concurrency: u32, /// Interval between polls for dequeue-able tasks when otherwise idle. 
#[clap( long = "dequeue-interval", env = "DPC_DEQUEUE_INTERVAL", - default_value = "5s" + default_value = "10s" )] #[serde(with = "humantime_serde")] #[arg(value_parser = humantime::parse_duration)] @@ -96,7 +95,7 @@ pub async fn run(args: Args) -> anyhow::Result<()> { } let pg_pool = sqlx::postgres::PgPoolOptions::new() - .acquire_timeout(std::time::Duration::from_secs(5)) + .acquire_timeout(std::time::Duration::from_secs(30)) .connect_with(pg_options) .await .context("connecting to database")?; diff --git a/crates/data-plane-controller/src/stack.rs b/crates/data-plane-controller/src/stack.rs index 0ec32aa92a..0f5276d32f 100644 --- a/crates/data-plane-controller/src/stack.rs +++ b/crates/data-plane-controller/src/stack.rs @@ -29,7 +29,7 @@ pub struct DataPlane { pub control_plane_api: url::Url, pub data_buckets: Vec, pub gcp_project: String, - pub ssh_subnets: Vec, + pub ssh_subnets: Vec, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub private_links: Vec, pub deployments: Vec, @@ -86,7 +86,7 @@ pub struct AnsibleRole { #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct AnsibleHost { - pub ansible_host: std::net::Ipv6Addr, + pub ansible_host: std::net::IpAddr, pub ansible_user: String, pub host_fqdn: String, pub local_cert_pem: String,