-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Setup Docker for TPU execution and update infra scripts.
I tried to optimize the Docker image size a bit using a staged build, as Ray currently requires a source build of Meson, which requires a Clang installation. Even with this, jax and libtpu are each >250MB installs, so there's no avoiding a large image size at the moment. Still, with this configuration, a v5-32 (the most I could get given GCP's stingy IP address allocation) takes about 50 seconds to run setup-vm.sh and pull the initial image. After the initial pull, new deployments take a few seconds to package up the current source directory. It's still possible to use the `git clone` approach via a volume mount, but the permissions are a bit finicky at that point, and I'm not sure how many options we want to have.
- Loading branch information
Showing
10 changed files
with
290 additions
and
128 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,6 +140,7 @@ dmypy.json | |
/wandb | ||
|
||
# dataset cache files | ||
/cache | ||
*.parquet | ||
ledger.json | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Build stage: clang is only installed here (it is needed to build some
# dependencies from source), so it does not end up in the final image.
FROM python:3.10 AS build
RUN apt-get update && apt-get install -y clang
RUN pip install virtualenv

# venv binaries encode their directory, so we need to setup the venv in the final location
RUN virtualenv -p python3.10 /opt/levanter/.venv
RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html

WORKDIR /tmp

# Copy only the requirements files to cache dependency build/installation.
# COPY is used instead of ADD: these are plain local files, so no archive
# extraction or URL fetching is wanted.
COPY pyproject.toml README.md /tmp/
RUN /opt/levanter/.venv/bin/pip install -e .

# Final stage: a fresh base image that receives only the prepared virtualenv
# and the current source tree.
FROM python:3.10

WORKDIR /opt/levanter
COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60 TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024
COPY . /opt/levanter/

# Setup venv Python as the default
ENV PATH=/opt/levanter/.venv/bin:$PATH
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
#!/usr/bin/python | ||
|
||
""" | ||
Build and deploy the Levanter base image to Artifact Registry. | ||
It is not necessary to run this yourself unless you are deploying a new base image: the launch | ||
script will automatically build and deploy an image based on your current code. | ||
""" | ||
|
||
import argparse | ||
import json | ||
import subprocess | ||
|
||
# Cleanup policies for the Artifact Registry repository. The list is serialized
# to JSON and handed to `gcloud artifacts repositories set-cleanup-policies`.
CLEANUP_POLICY = [
    {
        # Delete rule: matches versions in any tag state that are older than
        # 86400 seconds (one day).
        "name": "delete-stale",
        "action": {"type": "Delete"},
        "condition": {
            "olderThan": "86400s",
            "tagState": "ANY",
        },
    },
    {
        # Keep rule: retain the 5 most recent versions.
        "name": "keep-latest",
        "action": {"type": "Keep"},
        "mostRecentVersions": {
            "keepCount": 5,
        },
    },
]
|
||
|
||
def _run(*args, **kw):
    """Echo a command to stdout, run it, and return its captured output.

    All positional and keyword arguments are forwarded unchanged to
    ``subprocess.check_output``. The first positional argument must be the
    command given as a sequence of strings, since it is joined with spaces
    for the echo line.

    Returns:
        Whatever ``subprocess.check_output`` returns for the given arguments
        (the command's captured stdout).

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    print("Running ", " ".join(args[0]))
    return subprocess.check_output(*args, **kw)
|
||
|
||
def build_and_push_docker_image(project_id, region, repository, image_name):
    """Build the TPU Docker image and push it to Artifact Registry.

    Steps, in order:
      1. Enable the Artifact Registry API.
      2. Create the Docker repository (an "already exists" error is ignored).
      3. Install ``CLEANUP_POLICY`` on the repository.
      4. Grant ``allUsers`` the ``roles/artifactregistry.reader`` role.
      5. Configure Docker authentication for ``<region>-docker.pkg.dev``.
      6. Build ``docker/tpu/Dockerfile`` for linux/amd64, tag the result as
         ``<region>-docker.pkg.dev/<project_id>/<repository>/<image_name>:latest``
         and push it.

    The Dockerfile path and the build context (``.``) are relative, so this
    must be run from the directory that contains ``docker/tpu/Dockerfile``.

    Args:
        project_id: GCP project that owns the repository; passed to every
            project-scoped gcloud call via ``--project``.
        region: Artifact Registry location (also the registry host prefix).
        repository: Artifact Registry repository name.
        image_name: Local image name, also used as the remote image name.

    Raises:
        subprocess.CalledProcessError: if any gcloud/docker command fails,
            other than repository creation failing with ALREADY_EXISTS.
    """
    # Function-scope imports: only the temporary policy file below needs them.
    import os
    import tempfile

    artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"

    # Pin every project-scoped gcloud call to the requested project instead of
    # relying on whatever default project gcloud is configured with.
    gcloud = ["gcloud", "--project", project_id]

    # Activate artifact registry and setup the repository.
    _run([*gcloud, "services", "enable", "artifactregistry.googleapis.com"])

    try:
        _run(
            [
                *gcloud,
                "artifacts",
                "repositories",
                "create",
                # Use the requested repository name rather than a hard-coded
                # one, so later steps operate on the repository created here.
                repository,
                f"--location={region}",
                "--repository-format=docker",
            ],
            # Merge stderr into the captured output so the error text can be
            # inspected below.
            stderr=subprocess.STDOUT,
        )
    except subprocess.CalledProcessError as e:
        # Ignore error if repository already exists.
        if b"ALREADY_EXISTS" not in e.output:
            print("Error creating repository: ", e.output)
            raise

    # Write the policy to a private temporary directory (instead of a fixed
    # path in /tmp) so concurrent runs cannot clobber each other's file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        policy_path = os.path.join(tmp_dir, "cleanup-policy.json")
        with open(policy_path, "w") as f:
            json.dump(CLEANUP_POLICY, f, indent=2)

        _run(
            [
                *gcloud,
                "artifacts",
                "repositories",
                "set-cleanup-policies",
                f"--location={region}",
                f"--policy={policy_path}",
                repository,
            ]
        )

    # Grant public read access ('allUsers') for TPU VMs
    _run(
        [
            *gcloud,
            "artifacts",
            "repositories",
            "add-iam-policy-binding",
            repository,
            "--location",
            region,
            "--member",
            "allUsers",
            "--role",
            "roles/artifactregistry.reader",
        ]
    )

    _run(["gcloud", "auth", "configure-docker", f"{region}-docker.pkg.dev"])
    _run(
        [
            "docker",
            "buildx",
            "build",
            "--platform=linux/amd64",
            "-t",
            image_name,
            "-f",
            "docker/tpu/Dockerfile",
            ".",
        ]
    )

    full_image_name = f"{artifact_repo}/{image_name}:latest"
    _run(["docker", "tag", image_name, full_image_name])
    _run(["docker", "push", full_image_name])
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the target project/region and the
    # repository/image names, then build and push the image.
    parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.")
    parser.add_argument("--project", required=True, help="GCP project ID")
    parser.add_argument("--region", required=True, help="Artifact Registry region (e.g., us-west4)")
    parser.add_argument("--repository", default="levanter", help="Artifact Registry repository name")
    parser.add_argument("--image", default="levanter", help="Docker image name.")
    args = parser.parse_args()

    build_and_push_docker_image(args.project, args.region, args.repository, args.image)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.