Skip to content

Commit

Permalink
Merge branch 'GoogleCloudPlatform:main' into gke-secondary-boot-disk-…
Browse files Browse the repository at this point in the history
…image-cache
  • Loading branch information
i-jw authored Feb 12, 2025
2 parents bdbf4f2 + d64781d commit fee5412
Show file tree
Hide file tree
Showing 102 changed files with 5,912 additions and 118 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,19 @@ def metrics_to_scrape(backend: str) -> List[str]:
# It must be populated on the outputs 'metrics' field as 'key':'stats'
# If a value is specified for a given key, it will be populated on the outputs `summary_stats.stats` field as 'value':'stats' as well.
if backend == "vllm":
return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"]
return [
"vllm:gpu_cache_usage_perc",
"vllm:num_requests_waiting",
"vllm:num_requests_running",
"vllm:num_requests_swapped",
"vllm:time_to_first_token_seconds",
"vllm:time_per_output_token_seconds",
"vllm:request_queue_time_seconds",
"vllm:request_inference_time_seconds",
"vllm:request_prompt_tokens",
"vllm:request_generation_tokens",
"vllm:iteration_tokens_total",
]
elif backend == "jetstream":
return [
"jetstream_slots_used_percentage",
Expand Down
78 changes: 77 additions & 1 deletion cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,81 @@ steps:
allowFailure: true
waitFor: ['create gke cluster']

# Fetch credentials for the per-build test cluster with
# `gcloud container clusters get-credentials`.
# KUBECONFIG points gcloud at /workspace/kubeconfig, the same path the
# 'Run shipshape on cluster' step bind-mounts into the scanner container.
- id: 'Generate Kubeconfig'
name: 'gcr.io/cloud-builders/gcloud'
env:
- 'KUBECONFIG=/workspace/kubeconfig'
# NOTE(review): presumably keeps the generated kubeconfig from depending on the gke-gcloud-auth-plugin binary, which the scanner container may not ship — confirm.
- 'USE_GKE_GCLOUD_AUTH_PLUGIN=False'
args:
- 'container'
- 'clusters'
- 'get-credentials'
# Cluster name is built from the commit SHA, PR number and shortened build id.
- 'ml-${SHORT_SHA}-${_PR_NUMBER}-${_BUILD_ID}-cluster'
- '--region=${_REGION}'
- '--project=$PROJECT_ID'
# Best-effort step: marked allowFailure so a failure here is tolerated.
allowFailure: true
# Starts only once the 'test rag' step has completed.
waitFor: ['test rag']

# Stage the inputs for the security scans under /workspace/security_test.
# No `dir` is set on this step, so it runs in Cloud Build's default working
# directory and "." below is the repository checkout.
- id: 'Copy metadata'
  name: 'ubuntu'
  entrypoint: 'bash'
  args:
  - '-c'
  - |
    mkdir -p /workspace/security_test/scan_target /workspace/security_test/allowlist
    # Copy every top-level entry (files and directories) except ./security_test
    # itself, so the scan target is never copied into itself. A single pass is
    # enough: it covers everything a directories-only pre-copy would stage.
    find . -mindepth 1 -maxdepth 1 ! -path "./security_test" -exec cp -r {} /workspace/security_test/scan_target/ \;
    # Hand the staged tree to uid/gid 65532.
    # NOTE(review): presumably the non-root user the scanner image runs as — confirm.
    chown -R 65532:65532 /workspace/security_test/scan_target
    # NOTE(review): when the working directory is /workspace these two copies
    # have identical source and destination paths; verify whether they are
    # still needed.
    cp security_test/config.yaml /workspace/security_test/config.yaml
    cp -r security_test/allowlist/* /workspace/security_test/allowlist/ || echo "Allowlist folder is empty or not found"
  # Best-effort step: marked allowFailure so a failure here is tolerated.
  allowFailure: true
  waitFor: ['Generate Kubeconfig']

# gcr.io/cloud-builders/docker supplies the docker command-line tool; this step uses it to `docker run` the scanner image from within Cloud Build.
# ${_SHIPSHAPE_IMAGE} is the scanner image itself. It is started as a container by the docker builder rather than being used as the step's own builder image.
- name: 'gcr.io/cloud-builders/docker'
id: 'Run shipshape on cluster'
args:
- 'run'
- '--network=cloudbuild'
- '--rm'
# Bind-mount the allowlist folder, the scan config and the kubeconfig written
# by the 'Generate Kubeconfig' step into the scanner container.
- '-v'
- '/workspace/security_test/allowlist:/workspace/security_test/allowlist'
- '-v'
- '/workspace/security_test/config.yaml:/workspace/security_test/config.yaml'
- '-v'
- '/workspace/kubeconfig:/root/.kube/config'
- '${_SHIPSHAPE_IMAGE}'
# Everything after the image name is passed to the scanner as its own flags.
- '--mode=cluster'
- '--allowlist_folder=/workspace/security_test/allowlist'
- '--kube_config_path=/root/.kube/config'
# NOTE(review): units of max_wait_duration are not visible here — confirm against the scanner's documentation.
- '--max_wait_duration=3000'
- '--max_parallel=100'
- '--cluster_scan_config_path=/workspace/security_test/config.yaml'
# Best-effort step: marked allowFailure so a failure here is tolerated.
allowFailure: true
waitFor: ['Copy metadata']


# Run the scanner image in helm mode against the staged copy of the repository.
# Like the cluster scan, it waits only on 'Copy metadata'.
- id: 'Run Shipshape on helm'
name: 'gcr.io/cloud-builders/docker'
args:
- 'run'
- '--network=cloudbuild'
- '--rm'
# Bind-mount the allowlist folder and the staged scan target into the container.
- '-v'
- '/workspace/security_test/allowlist:/workspace/security_test/allowlist'
- '-v'
- '/workspace/security_test/scan_target:/workspace/security_test/scan_target'
- '${_SHIPSHAPE_IMAGE}'
# Everything after the image name is passed to the scanner as its own flags.
- '--mode=helm'
- '--allowlist_folder=/workspace/security_test/allowlist'
- '--scan_path=/workspace/security_test/scan_target'
# NOTE(review): units of max_wait_duration are not visible here — confirm against the scanner's documentation.
- '--max_wait_duration=60'
# Best-effort step: marked allowFailure so a failure here is tolerated.
allowFailure: true
waitFor: ['Copy metadata']

- id: 'cleanup rag'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
Expand All @@ -304,7 +379,7 @@ steps:
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color
allowFailure: true
waitFor: ['test rag']
waitFor: ['Run shipshape on cluster', 'Run Shipshape on helm']

- id: 'cleanup gke cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
Expand Down Expand Up @@ -387,6 +462,7 @@ substitutions:
_USER_NAME: github
_AUTOPILOT_CLUSTER: "false"
_BUILD_ID: ${BUILD_ID:0:8}
_SHIPSHAPE_IMAGE: us-docker.pkg.dev/k8ssecurityvalidation-agent/k8ssecurityvalidation-agent/k8ssecurityvalidation-agent@sha256:cd45e6cd84e9a45462ddbca18c4731fd4e264d517ee98131eb5be4eb57691f44
logsBucket: gs://ai-on-gke-build-logs
options:
substitutionOption: "ALLOW_LOOSE"
Expand Down
11 changes: 11 additions & 0 deletions infrastructure/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ output "ca_certificate" {

}

# Service account reported by whichever GKE cluster module matches the
# (private_cluster, autopilot_cluster) combination. Evaluates to "" when
# create_cluster is false.
output "service_account" {
  sensitive = true
  depends_on = [
    module.private-gke-autopilot-cluster,
    module.private-gke-standard-cluster,
    module.public-gke-autopilot-cluster,
    module.public-gke-standard-cluster,
  ]

  # Decide on create_cluster first, then private vs. public, then autopilot
  # vs. standard; each leaf reads index 0 of the selected module.
  value = !var.create_cluster ? "" : (
    var.private_cluster ? (
      var.autopilot_cluster ? module.private-gke-autopilot-cluster[0].service_account : module.private-gke-standard-cluster[0].service_account
    ) : (
      var.autopilot_cluster ? module.public-gke-autopilot-cluster[0].service_account : module.public-gke-standard-cluster[0].service_account
    )
  )
}

# Passes the private_cluster input variable through as an output.
output "private_cluster" {
value = var.private_cluster
}
4 changes: 4 additions & 0 deletions modules/gke-autopilot-private-cluster/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,8 @@ output "endpoint" {

# Re-exports the ca_certificate output of the nested `gke` module.
output "ca_certificate" {
value = module.gke.ca_certificate
}

# Re-exports the service_account output of the nested `gke` module.
output "service_account" {
value = module.gke.service_account
}
4 changes: 4 additions & 0 deletions modules/gke-autopilot-public-cluster/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,8 @@ output "endpoint" {

# Re-exports the ca_certificate output of the nested `gke` module.
output "ca_certificate" {
value = module.gke.ca_certificate
}

# Re-exports the service_account output of the nested `gke` module.
output "service_account" {
value = module.gke.service_account
}
4 changes: 4 additions & 0 deletions modules/gke-standard-private-cluster/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,8 @@ output "endpoint" {

# Re-exports the ca_certificate output of the nested `gke` module.
output "ca_certificate" {
value = module.gke.ca_certificate
}

# Re-exports the service_account output of the nested `gke` module.
output "service_account" {
value = module.gke.service_account
}
5 changes: 5 additions & 0 deletions modules/gke-standard-public-cluster/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,8 @@ output "endpoint" {
# Re-exports the ca_certificate output of the nested `gke` module.
output "ca_certificate" {
value = module.gke.ca_certificate
}


# Re-exports the service_account output of the nested `gke` module.
output "service_account" {
value = module.gke.service_account
}
4 changes: 2 additions & 2 deletions ray-on-gke/tpu/kuberay-tpu-webhook/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Image URL to use all building/pushing image targets
IMG ?= us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0
IMG ?= us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.2-gke.1

# For europe, use europe-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.0-gke.1
# For europe, use europe-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook

# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
ifeq (,$(shell go env GOBIN))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ spec:
spec:
serviceAccountName: kuberay-tpu-webhook
containers:
- image: us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0
- image: us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.2-gke.1
imagePullPolicy: Always
name: kuberay-tpu-webhook
args:
Expand Down
8 changes: 4 additions & 4 deletions ray-on-gke/tpu/kuberay-tpu-webhook/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ require (
github.com/rogpeppe/go-internal v1.11.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.12.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/term v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.3.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
Expand Down
20 changes: 10 additions & 10 deletions ray-on-gke/tpu/kuberay-tpu-webhook/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=
golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/oauth2 v0.12.0 h1:smVPGxink+n1ZI5pkQa8y6fZT0RW0MgCO5bFpepy4B4=
golang.org/x/oauth2 v0.12.0/go.mod h1:A74bZ3aGXgCY0qaIC9Ahg6Lglin4AMAco8cIv9baba4=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand All @@ -139,23 +139,23 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8=
golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.18.0 h1:k8NLag8AGHnn+PHbl7g43CtqZAwG60vZkLqgyZgIHgQ=
golang.org/x/tools v0.18.0/go.mod h1:GL7B4CwcLLeo59yx/9UWWuNOW1n3VZ4f5axWfML7Lcg=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
Expand Down
4 changes: 2 additions & 2 deletions ray-on-gke/tpu/kuberay-tpu-webhook/helm-chart/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1
version: 0.2.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.2.1"
appVersion: "1.2.2"
2 changes: 1 addition & 1 deletion ray-on-gke/tpu/kuberay-tpu-webhook/helm-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ tpuWebhook:

image:
repository: us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook
tag: v1.2.1-gke.0
tag: v1.2.2-gke.1
pullPolicy: Always

deployment:
Expand Down
19 changes: 15 additions & 4 deletions ray-on-gke/tpu/kuberay-tpu-webhook/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"time"

ray "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
utils "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
admissionv1 "k8s.io/api/admission/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -244,16 +245,26 @@ func extractRayCluster(admissionReview *admissionv1.AdmissionReview) (*ray.RayCl
return &rayCluster, nil
}

// generateHeadlessServiceName returns the expected TPU headless service name for a RayCluster
func generateHeadlessServiceName(clusterName string) string {
serviceName := fmt.Sprintf("%s-%s", clusterName, headlessServiceSuffix)

// Apply the same truncation as in the RayCluster controller when generating the headless service
// name. This is to maintain the up-to 63 char compatibility guarantee for hostnames (RFC 1123).
return utils.CheckName(serviceName)
}

// genDNSHostnames returns list of DNS hostnames for TPU VM hosts as a string
func genDNSHostnames(numOfHosts int32, groupName string, clusterName string, namespace string, replicaIndex int) (string, error) {
if numOfHosts == 0 {
err := errors.New("workerGroupSpec NumOfHosts not set")
return "", err
}
headlessServiceName := generateHeadlessServiceName(clusterName)
hostNames := make([]string, numOfHosts)
// Host names will be of the form {WORKER_GROUP_NAME}-{REPLICA_INDEX}-{HOST_INDEX}.headless-worker-svc
// Host names will be of the form {WORKER_GROUP_NAME}-{REPLICA_INDEX}-{HOST_INDEX}.{CLUSTER_NAME}-headless-worker-svc
for j := 0; j < int(numOfHosts); j++ {
hostNames[j] = fmt.Sprintf("%s-%d-%d.%s-%s", groupName, replicaIndex, j, clusterName, headlessServiceSuffix)
hostNames[j] = fmt.Sprintf("%s-%d-%d.%s", groupName, replicaIndex, j, headlessServiceName)
}
klog.V(1).InfoS("genDNSHostnames", "RayCluster", namespace+"/"+clusterName, "NumOfHosts", numOfHosts, "Replica Index", replicaIndex)
return strings.Join(hostNames, ","), nil
Expand All @@ -268,7 +279,7 @@ func injectHostnames(clusterName string, hostNames string, envPath string, conta
Value: hostNames,
}
subdomainPatch["path"] = subdomainPath
subdomainPatch["value"] = fmt.Sprintf("%s-%s", clusterName, headlessServiceSuffix)
subdomainPatch["value"] = generateHeadlessServiceName(clusterName)
// create new EnvVar array if container.Env is empty, and append hostnames if not
if len(container.Env) == 0 {
hostNamesPatch["path"] = envPath
Expand Down Expand Up @@ -678,7 +689,7 @@ func (t *TPUWebhookServer) mutatePod(admissionReview *admissionv1.AdmissionRevie
return nil, err
}
klog.V(1).InfoS("mutatePod", "RayCluster", namespace+"/"+clusterName, "TPU_WORKER_HOSTNAMES", hostnames)
klog.V(1).InfoS("mutatePod", "RayCluster", namespace+"/"+clusterName, "subdomain", clusterName+"-"+headlessServiceSuffix)
klog.V(1).InfoS("mutatePod", "RayCluster", namespace+"/"+clusterName, "subdomain", generateHeadlessServiceName(clusterName))
injectHostnames(clusterName, hostnames, path, container, &patches)
}
// inject TPU_WORKER_ID
Expand Down
Loading

0 comments on commit fee5412

Please sign in to comment.