
TPU Provisioner: Node pool hash comparison #967

Merged
2 changes: 1 addition & 1 deletion tpu-provisioner/Dockerfile
@@ -1,5 +1,5 @@
 # Build the manager binary
-FROM golang:1.23 as builder
+FROM golang:1.23 AS builder
 ARG TARGETOS
 ARG TARGETARCH
 
49 changes: 28 additions & 21 deletions tpu-provisioner/cmd/main.go
@@ -94,9 +94,9 @@ func main() {
     GCPForceOnDemand bool `envconfig:"GCP_FORCE_ON_DEMAND" default:"false"`
 
     // NodeMinLifespan is the amount of time that should pass between a Node object
-    // creation and a cleanup of that Node. This needs to be long enough to allow
-    // the node to become Ready and for a pending Pod to be scheduled on it.
-    NodeMinLifespan time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"3m"`
+    // creation and a cleanup of that Node. This is mostly irrelevant now that JobSet
+    // existence is checked before deleting a NodePool.
+    NodeMinLifespan time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"10s"`
 
     NodepoolDeletionDelay time.Duration `envconfig:"NODEPOOL_DELETION_DELAY" default:"30s"`
 
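For reference, these settings are parsed with github.com/kelseyhightower/envconfig, whose conventions the struct tags above follow. A minimal, self-contained sketch of how the new 10s default is picked up and can be overridden; the config struct here is a stripped-down stand-in, not the provisioner's actual one:

package main

import (
    "fmt"
    "log"
    "time"

    "github.com/kelseyhightower/envconfig"
)

// config reproduces only the field under discussion.
type config struct {
    NodeMinLifespan time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"10s"`
}

func main() {
    var cfg config
    // Process fills cfg from the environment, falling back to the default
    // tag; e.g. NODE_MIN_LIFESPAN=3m restores the previous behavior.
    if err := envconfig.Process("", &cfg); err != nil {
        log.Fatal(err)
    }
    fmt.Println("node min lifespan:", cfg.NodeMinLifespan)
}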
@@ -198,30 +198,37 @@ func main() {
         "podToNodeLabels", cfg.GCPPodToNodeLabels,
     )
 
+    clusterCtx := cloud.GKEContext{
+        ProjectID:               cfg.GCPProjectID,
+        ClusterLocation:         cfg.GCPClusterLocation,
+        Cluster:                 cfg.GCPCluster,
+        NodeZone:                cfg.GCPZone,
+        NodeServiceAccount:      cfg.GCPNodeServiceAccount,
+        NodeAdditionalNetworks:  cfg.GCPNodeAdditionalNetworks,
+        NodeSecondaryDisk:       cfg.GCPNodeSecondaryDisk,
+        NodeTags:                cfg.GCPNodeTags,
+        NodeDiskType:            cfg.GCPNodeDiskType,
+        NodeConfidentialStorage: cfg.GCPNodeConfidentialStorage,
+        NodeBootDiskKMSKey:      cfg.GCPNodeBootDiskKMSKey,
+        PodToNodeLabels:         cfg.GCPPodToNodeLabels,
+        NodeSecureBoot:          cfg.GCPNodeSecureBoot,
+        ForceOnDemand:           cfg.GCPForceOnDemand,
+    }
+
     containers, err := containerv1beta1.NewService(context.Background() /*, option.WithCredentials(creds)*/)
     if err != nil {
         setupLog.Error(err, "unable to create gke client")
         os.Exit(1)
     }
+    nodePoolsService := &cloud.GKENodePoolService{
+        ClusterContext: clusterCtx,
+        Service:        containers,
+    }
 
     provider = &cloud.GKE{
-        Service: containers,
-        ClusterContext: cloud.GKEContext{
-            ProjectID:               cfg.GCPProjectID,
-            ClusterLocation:         cfg.GCPClusterLocation,
-            Cluster:                 cfg.GCPCluster,
-            NodeZone:                cfg.GCPZone,
-            NodeServiceAccount:      cfg.GCPNodeServiceAccount,
-            NodeAdditionalNetworks:  cfg.GCPNodeAdditionalNetworks,
-            NodeSecondaryDisk:       cfg.GCPNodeSecondaryDisk,
-            NodeTags:                cfg.GCPNodeTags,
-            NodeDiskType:            cfg.GCPNodeDiskType,
-            NodeConfidentialStorage: cfg.GCPNodeConfidentialStorage,
-            NodeBootDiskKMSKey:      cfg.GCPNodeBootDiskKMSKey,
-            PodToNodeLabels:         cfg.GCPPodToNodeLabels,
-            NodeSecureBoot:          cfg.GCPNodeSecureBoot,
-            ForceOnDemand:           cfg.GCPForceOnDemand,
-        },
-        Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
+        NodePools:      nodePoolsService,
+        ClusterContext: clusterCtx,
+        Recorder:       mgr.GetEventRecorderFor("tpu-provisioner"),
     }
 case "mock":
     provider = &cloud.Mock{}
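The net effect of this refactor is that node pool operations now sit behind cloud.GKENodePoolService rather than a raw GKE API client embedded in cloud.GKE. One benefit of a seam like this is testability; the interface below is a hypothetical illustration of the pattern only — the actual method set of GKENodePoolService is not shown in this diff:

package cloud

import containerv1beta1 "google.golang.org/api/container/v1beta1"

// nodePoolManager is a hypothetical narrow interface that a provider
// could depend on instead of a concrete GKE client. GKENodePoolService
// would satisfy it, and tests could substitute the fake below.
type nodePoolManager interface {
    Create(np *containerv1beta1.NodePool) error
    Delete(name string) error
}

// fakeNodePools records calls instead of reaching GKE.
type fakeNodePools struct{ created []string }

func (f *fakeNodePools) Create(np *containerv1beta1.NodePool) error {
    f.created = append(f.created, np.Name)
    return nil
}

func (f *fakeNodePools) Delete(name string) error { return nil }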
18 changes: 11 additions & 7 deletions tpu-provisioner/examples/jobset.yaml
@@ -9,7 +9,7 @@ spec:
   failurePolicy:
     maxRestarts: 3
   replicatedJobs:
   - name: workers
-    replicas: 3 # set to number of node pools
+    replicas: 1 # set to number of node pools
     template:
       spec:
         backoffLimit: 0
@@ -21,8 +21,15 @@
           spec:
             restartPolicy: Never
             nodeSelector:
-              cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
+              cloud.google.com/gke-tpu-accelerator: tpu-v5p-slice
               cloud.google.com/gke-tpu-topology: 2x2x2
+              cloud.google.com/gke-spot: "true"
+              abc: xyz
+            tolerations:
+            - key: cloud.google.com/gke-spot
+              operator: Equal
+              value: "true"
+              effect: NoSchedule
             containers:
             - name: tpu-job
               image: python:3.8
@@ -31,11 +38,8 @@
               securityContext:
                 privileged: true
               command:
-              - bash
-              - -c
-              - |
-                pip install 'jax[tpu]' -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
-                python -c 'import jax; print("TPU cores:", jax.device_count())'
+              - "sleep"
+              - "600"
               resources:
                 requests:
                   google.com/tpu: 4
2 changes: 2 additions & 0 deletions tpu-provisioner/internal/cloud/common.go
@@ -22,6 +22,8 @@ const (
     LabelJobSetName      = keyPrefix + "tpu-provisioner-jobset-name"
     LabelJobSetNamespace = keyPrefix + "tpu-provisioner-jobset-namespace"
 
+    LabelNodePoolHash = keyPrefix + "tpu-provisioner-nodepool-hash"
+
     LabelProvisionerNodepoolID = "provisioner-nodepool-id"
 
     // AnnotationCopyLabels is a comma-separated list of labels to copy from the Pod to the node pool config (Nodes).
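The new LabelNodePoolHash key is what the PR title refers to: the provisioner can stamp each node pool with a hash of its desired configuration and later compare hashes instead of diffing every field. The PR description is not shown here, so the following Go sketch is an assumption about how such a comparison could work, not the repo's actual implementation:

package cloud

import (
    "encoding/json"
    "fmt"
    "hash/fnv"

    containerv1beta1 "google.golang.org/api/container/v1beta1"
)

// nodePoolHash returns a short, stable hash of the node pool's desired
// config. Hashing the JSON encoding is an illustrative choice; any
// deterministic serialization of the relevant fields would do.
func nodePoolHash(np *containerv1beta1.NodePool) (string, error) {
    b, err := json.Marshal(np)
    if err != nil {
        return "", err
    }
    h := fnv.New32a()
    h.Write(b)
    return fmt.Sprintf("%x", h.Sum32()), nil
}

// needsRecreate compares the hash stored in the existing node pool's
// labels (under LabelNodePoolHash) against the hash of the desired spec.
func needsRecreate(existing, desired *containerv1beta1.NodePool) (bool, error) {
    want, err := nodePoolHash(desired)
    if err != nil {
        return false, err
    }
    if existing.Config == nil {
        // No config/labels recorded; treat the pool as stale.
        return true, nil
    }
    return existing.Config.Labels[LabelNodePoolHash] != want, nil
}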