From 1fb4ebf281791611320d8dc926932c98003bea99 Mon Sep 17 00:00:00 2001 From: leroyjb Date: Fri, 24 Jan 2025 21:52:19 +0000 Subject: [PATCH 1/3] add dws multiclusters example folder --- .../dws-multiclusters-example/README.md | 48 ++++ .../create-clusters.sh | 70 ++++++ .../create-multikueue-kubeconfig.sh | 232 ++++++++++++++++++ .../deploy-multikueue.sh | 70 ++++++ .../dws-multi-worker.yaml | 54 ++++ .../dws-multiclusters-example/dws-multi.yaml | 94 +++++++ .../job-multi-dws-autopilot.yaml | 39 +++ .../dws-multiclusters-example/prom.yaml | 16 ++ 8 files changed, 623 insertions(+) create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md create mode 100755 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh create mode 100755 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-multikueue-kubeconfig.sh create mode 100755 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/deploy-multikueue.sh create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/dws-multi-worker.yaml create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/dws-multi.yaml create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/job-multi-dws-autopilot.yaml create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/prom.yaml diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md new file mode 100644 index 000000000..cc71eb33a --- /dev/null +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md @@ -0,0 +1,48 @@ +# Create Clusters + +``` +./create-clusters.sh +``` + +# Install Kueue + +``` +./deploy-multikueue.sh +``` + +## Validate installation + +``` +kubectl get clusterqueues 
dws-cluster-queue -o jsonpath="{range .status.conditions[?(@.type == \"Active\")]}CQ - Active: {@.status} Reason: {@.reason} Message: {@.message}{'\n'}{end}" +kubectl get admissionchecks sample-dws-multikueue -o jsonpath="{range .status.conditions[?(@.type == \"Active\")]}AC - Active: {@.status} Reason: {@.reason} Message: {@.message}{'\n'}{end}" +kubectl get multikueuecluster multikueue-dws-worker-asia -o jsonpath="{range .status.conditions[?(@.type == \"Active\")]}MC-ASIA - Active: {@.status} Reason: {@.reason} Message: {@.message}{'\n'}{end}" +kubectl get multikueuecluster multikueue-dws-worker-us -o jsonpath="{range .status.conditions[?(@.type == \"Active\")]}MC-US - Active: {@.status} Reason: {@.reason} Message: {@.message}{'\n'}{end}" +kubectl get multikueuecluster multikueue-dws-worker-eu -o jsonpath="{range .status.conditions[?(@.type == \"Active\")]}MC-EU - Active: {@.status} Reason: {@.reason} Message: {@.message}{'\n'}{end}" +``` + +Output : + +``` +CQ - Active: True Reason: Ready Message: Can admit new workloads +AC - Active: True Reason: Active Message: The admission check is active +MC-ASIA - Active: True Reason: Active Message: Connected +MC-US - Active: True Reason: Active Message: Connected +MC-EU - Active: True Reason: Active Message: Connected +``` + +# Launch job + + + +``` +kubectl create -f job-multi-dws-autopilot.yaml +``` + +## Get the status of the job + +``` +kubectl get workloads.kueue.x-k8s.io -o jsonpath='{.items[0].status.admissionChecks}' +``` + +In the output message, you can find where the job is scheduled + diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh new file mode 100755 index 000000000..555f9a8bf --- /dev/null +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +# Copyright 2024 The Kubernetes Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +echo 'Create GKE Autopilot clusters' + +KUEUE_VERSION=v0.8.1 +regions=("europe-west4" "asia-southeast1" "us-east4" "europe-west4") +kubeconfigs=("manager-europe-west4" "worker-asia-southeast1" "worker-us-east4" "worker-europe-west4") # kubectl context names assigned by rename-context below; keep in sync with the kubeconfigs array in deploy-multikueue.sh +PROJECT_ID=$(gcloud config get-value project) +PROJECT_NUMBER=$(gcloud projects describe "$PROJECT_ID" --format="value(projectNumber)") +PREFIX_MANAGER="man" +PREFIX_WORKER="w" +JOBSET_VERSION=v0.6.0 + +# First pass: request creation of every cluster without blocking (--async) +for i in "${!regions[@]}"; do + region="${regions[$i]}" + echo "$region" + # Construct the cluster name: index 0 gets the manager prefix, every other index the worker prefix + if [[ $i -eq 0 ]]; then + cluster_name="$PREFIX_MANAGER-$region" + else + cluster_name="$PREFIX_WORKER-$region" + fi + + #Create the cluster + gcloud container clusters create-auto "$cluster_name" \ + --project "$PROJECT_ID" \ + --region "$region" \ + --release-channel "regular" \ + --async +done +for i in "${!regions[@]}"; do + region="${regions[$i]}" + if [[ $i -eq 0 ]]; then + cluster_name="$PREFIX_MANAGER-$region" + else + cluster_name="$PREFIX_WORKER-$region" + fi + + # opId=$(gcloud container operations list --filter "TARGET=https://container.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/$region/clusters/$cluster_name" --format="value(name)") + #gcloud container operations wait "$opId" --project "$PROJECT_ID" --region "$region" + set +e + until gcloud -q 
container clusters get-credentials "$cluster_name" \ + --project "$PROJECT_ID" \ + --region "$region"; do + echo "GKE Cluster is provisioning. Retrying in 15 seconds..." + sleep 15 + done + set -e + configname="${kubeconfigs[$i]}" + kubectl config rename-context "gke_$PROJECT_ID"_"$region"_"$cluster_name" "$configname" +done diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-multikueue-kubeconfig.sh b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-multikueue-kubeconfig.sh new file mode 100755 index 000000000..75c392b16 --- /dev/null +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-multikueue-kubeconfig.sh @@ -0,0 +1,232 @@ +#!/bin/bash + +# Copyright 2024 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -o errexit +set -o nounset +set -o pipefail + +KUBECONFIG_OUT=${1:-kubeconfig} +MULTIKUEUE_SA=multikueue-sa +NAMESPACE=kueue-system + +# Creating a restricted MultiKueue role, service account and role binding" +kubectl apply -f - <"${KUBECONFIG_OUT}" < Date: Fri, 7 Feb 2025 11:35:04 +0000 Subject: [PATCH 2/3] add terraform support --- .../dws-multiclusters-example/.gitignore | 1 + .../dws-multiclusters-example/README.md | 45 ++++++++++--- .../create-clusters.sh | 4 +- .../deploy-multikueue.sh | 2 +- .../dws-multiclusters-example/dws-multi.yaml | 2 +- .../dws-multiclusters-example/tf/clusters.tf | 67 +++++++++++++++++++ .../dws-multiclusters-example/tf/outputs.tf | 10 +++ .../dws-multiclusters-example/tf/variables.tf | 41 ++++++++++++ .../dws-multiclusters-example/tf/versions.tf | 22 ++++++ 9 files changed, 180 insertions(+), 14 deletions(-) create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/.gitignore create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/clusters.tf create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/outputs.tf create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/variables.tf create mode 100644 tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/versions.tf diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/.gitignore b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/.gitignore new file mode 100644 index 000000000..7cc402c7a --- /dev/null +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/.gitignore @@ -0,0 +1 @@ +*.kubeconfig diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md index cc71eb33a..665aaf9d9 100644 --- 
a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md @@ -1,16 +1,39 @@ -# Create Clusters +# Multikueue-dws-integration + +This repository provides the files needed to demonstrate how to use MultiKueue with Dynamic Workload Scheduler (DWS) on GKE Autopilot. This setup allows you to run workloads across multiple GKE clusters in different regions, automatically leveraging available GPU resources thanks to DWS. + +## Repository Contents + +This repository contains the following files: + +* `create-clusters.sh`: Script to create the required GKE clusters (one manager and three workers). +* `tf/`: Terraform configuration that creates the required GKE clusters (one manager and three workers). You can use it instead of the bash script. +* `deploy-multikueue.sh`: Script to install and configure Kueue and MultiKueue on the clusters. +* `dws-multi-worker.yaml`: Kueue configuration for the worker clusters, including manager configuration. +* `job-multi-dws-autopilot.yaml`: Example job definition to be submitted to the MultiKueue setup. 
+ +## Setup and Usage + +### Create Clusters ``` -./create-clusters.sh +cd tf +terraform init +terraform plan +terraform apply -var project_id= ``` -# Install Kueue +### Install Kueue + +After creating the GKE clusters and updating your kubeconfig files, install the Kueue components: ``` ./deploy-multikueue.sh ``` -## Validate installation +### Validate installation + +Verify the Kueue installation and the connection between the manager and worker clusters: ``` kubectl get clusterqueues dws-cluster-queue -o jsonpath="{range .status.conditions[?(@.type == \"Active\")]}CQ - Active: {@.status} Reason: {@.reason} Message: {@.message}{'\n'}{end}" @@ -20,7 +43,7 @@ kubectl get multikueuecluster multikueue-dws-worker-us -o jsonpath="{range .stat kubectl get multikueuecluster multikueue-dws-worker-eu -o jsonpath="{range .status.conditions[?(@.type == \"Active\")]}MC-EU - Active: {@.status} Reason: {@.reason} Message: {@.message}{'\n'}{end}" ``` -Output : +A successful output should look like this: ``` CQ - Active: True Reason: Ready Message: Can admit new workloads @@ -30,19 +53,21 @@ MC-US - Active: True Reason: Active Message: Connected MC-EU - Active: True Reason: Active Message: Connected ``` -# Launch job - +### Launch job +Submit your job to the Kueue controller, which will run it on a worker cluster with available resources: ``` kubectl create -f job-multi-dws-autopilot.yaml ``` -## Get the status of the job +### Get the status of the job + +To check the job status and see where it's scheduled: ``` -kubectl get workloads.kueue.x-k8s.io -o jsonpath='{.items[0].status.admissionChecks}' +kubectl get workloads.kueue.x-k8s.io -o jsonpath='{range .items[*]}{.status.admissionChecks}{"\n"}{end}' ``` -In the output message, you can find where the job is scheduled +In the output message, you can find where the job is scheduled# diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh 
b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh index 555f9a8bf..c9d46e52a 100755 --- a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/create-clusters.sh @@ -55,8 +55,8 @@ for i in "${!regions[@]}"; do cluster_name="$PREFIX_WORKER-$region" fi - # opId=$(gcloud container operations list --filter "TARGET=https://container.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/$region/clusters/$cluster_name" --format="value(name)") - #gcloud container operations wait "$opId" --project "$PROJECT_ID" --region "$region" + opId=$(gcloud container operations list --filter "TARGET=https://container.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/$region/clusters/$cluster_name" --format="value(name)") + if [[ -n "$opId" ]]; then gcloud container operations wait "$opId" --project "$PROJECT_ID" --region "$region"; fi set +e until gcloud -q container clusters get-credentials "$cluster_name" \ --project "$PROJECT_ID" \ diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/deploy-multikueue.sh b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/deploy-multikueue.sh index 26ea9d61f..019c86df7 100755 --- a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/deploy-multikueue.sh +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/deploy-multikueue.sh @@ -21,7 +21,7 @@ set -o pipefail KUEUE_VERSION=v0.10.0 KFLOW_VERSION=v1.8.0 regions=("europe-west4" "asia-southeast1" "us-east4" "europe-west4") -kubeconfigs=("manager-europe-west4" "worker-asia-southeast1" "worker-us-east4" "worker-eu-west4") +kubeconfigs=("manager-europe-west4" "worker-asia-southeast1" "worker-us-east4" "worker-europe-west4") PROJECT_ID=$(gcloud config get-value project) PROJECT_NUMBER=$(gcloud projects describe $PROJECT_ID --format="value(projectNumber)") PREFIX_MANAGER="man" diff 
--git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/dws-multi.yaml b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/dws-multi.yaml index 0af384315..105d79708 100644 --- a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/dws-multi.yaml +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/dws-multi.yaml @@ -41,7 +41,7 @@ metadata: spec: kubeConfig: locationType: Secret - location: worker-eu-west4-secret + location: worker-europe-west4-secret --- apiVersion: kueue.x-k8s.io/v1beta1 kind: MultiKueueCluster diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/clusters.tf b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/clusters.tf new file mode 100644 index 000000000..662624288 --- /dev/null +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/clusters.tf @@ -0,0 +1,67 @@ +provider "google" { + project = var.project_id +} +# Create network and subnets for each region +resource "google_compute_network" "network" { + name = "dws-network" + auto_create_subnetworks = true + +} + + +resource "google_container_cluster" "autopilot_manager_cluster" { + + + name = "${var.cluster_manager_name_prefix}-${var.location_manager}" + location = var.location_manager + network = google_compute_network.network.id + + enable_autopilot = true + deletion_protection = false + +} +# Create GKE Autopilot worker clusters +resource "google_container_cluster" "autopilot_worker_clusters" { + for_each = { + for region in var.regions_workers : region => { + region = region + name = "${var.cluster_worker_names_prefix}-${region}" # Use prefix and region + } + } + + name = each.value.name + location = each.value.region + network = google_compute_network.network.id # Reference the SINGLE network + enable_autopilot = true + deletion_protection = false + +} + +# Get the kubeconfig for each cluster and update the context 
+resource "null_resource" "update_kubeconfig" { + for_each = google_container_cluster.autopilot_worker_clusters + + provisioner "local-exec" { + command = < { + name = cluster.name + region = cluster.location + network = cluster.network + subnet = cluster.subnetwork + } + } +} diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/variables.tf b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/variables.tf new file mode 100644 index 000000000..65f6a03dd --- /dev/null +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/variables.tf @@ -0,0 +1,41 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +variable "project_id" { + type = string + description = "GCP project ID" +} + +variable "location_manager" { + type = string + description = "Location of GKE cluster" + default = "europe-west4" +} +variable "cluster_manager_name_prefix" { + type = string + description = "Prefix of MultiKueue Manager GKE cluster" + default = "manager" +} + +variable "regions_workers" { + type = list(string) + default = ["europe-west4", "asia-southeast1", "us-east4"] +} + +variable "cluster_worker_names_prefix" { + type = string + description = "Prefix of MultiKueue Workers GKE cluster" + default = "worker" +} + diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/versions.tf b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/versions.tf new file mode 100644 index 000000000..4daa3eca5 --- /dev/null +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/tf/versions.tf @@ -0,0 +1,22 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +terraform { + required_providers { + google = { + source = "hashicorp/google" + } + + } +} From 79c7c1afdc22f2f7b4405edeb5403155dd636788 Mon Sep 17 00:00:00 2001 From: leroyjb Date: Fri, 7 Feb 2025 11:55:05 +0000 Subject: [PATCH 3/3] update README Terraform command --- .../dws-multiclusters-example/README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md index 665aaf9d9..ca1f65387 100644 --- a/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md +++ b/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md @@ -17,10 +17,9 @@ This repository contains the following files: ### Create Clusters ``` -cd tf -terraform init -terraform plan -terraform apply -var project_id= +terraform -chdir=tf init +terraform -chdir=tf plan +terraform -chdir=tf apply -var project_id=YOUR_PROJECT_ID ``` ### Install Kueue @@ -71,3 +70,11 @@ kubectl get workloads.kueue.x-k8s.io -o jsonpath='{range .items[*]}{.status.admi -In the output message, you can find where the job is scheduled# +In the output message, you can find where the job is scheduled. +### Destroy resources + + +``` +terraform -chdir=tf destroy -var project_id=YOUR_PROJECT_ID +``` + +