-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Setup Docker for TPU execution and update infra scripts.
I tried to optimize the Docker image size a bit using a staged build, as Ray currently requires a source build of Meson, which in turn requires a Clang installation. Even with this, jax & libtpu are each >250MB installs on their own, so there's no avoiding a large image size at the moment. Still, with this configuration, a v5-32 (the most I could get given GCP's stingy IP address allocation) takes about 50 seconds to run setup-vm.sh and pull the initial image. After the initial pull, new deployments take a few seconds to package up the current source directory. It's still possible to use the `git clone` approach via a volume mount, but the permissions are a bit finicky at that point, and I'm not sure how many options we want to have.
- Loading branch information
Showing
14 changed files
with
588 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Build stage: clang is installed only here; the final stage below copies
# nothing but the finished virtualenv out of this stage.
FROM python:3.10 AS build
RUN apt-get update && apt-get install -y clang
RUN pip install virtualenv

# venv binaries encode their directory, so we need to setup the venv in the final location
RUN virtualenv -p python3.10 /opt/levanter/.venv
RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html

# Add only the requirements files to cache dependency build/installation
WORKDIR /tmp
# COPY rather than ADD: these are plain local files, so ADD's archive
# extraction and URL handling are not wanted.
COPY pyproject.toml README.md /tmp/
RUN /opt/levanter/.venv/bin/pip install -e '.[test]'

# Final stage: a fresh python:3.10 base plus the virtualenv built above.
FROM python:3.10

WORKDIR /opt/levanter
COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Base image to layer the current source tree on top of; both values can be
# overridden with --build-arg.
ARG IMAGE=ghcr.io/rjpower/levanter
ARG TAG=latest

FROM ${IMAGE}:${TAG}

# One ENV instruction; every continuation is " \" followed by an indented
# line so adjacent KEY=value pairs are always separated by whitespace.
ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60 \
    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024 \
    RAY_USAGE_STATS_ENABLED=0 \
    PATH=/opt/levanter/.venv/bin:$PATH \
    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests \
    HOME=/home/levanter

WORKDIR /opt/levanter

# Copy the dependency manifests first so the pip layer is only rebuilt when
# they change, then copy the rest of the source tree.
# COPY rather than ADD: these are plain local files/directories.
COPY pyproject.toml README.md /opt/levanter/
RUN pip install -e '.[test]'
COPY . /opt/levanter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import argparse | ||
import os | ||
import subprocess | ||
import typing | ||
|
||
from google.cloud import storage | ||
import yaml | ||
|
||
|
||
def run_command(*args, **kwargs):
    """Echo a command line to stdout, then execute it.

    Args:
        *args: The program and its arguments, passed to `subprocess.check_call`
            as a single argument list (no shell is involved).
        **kwargs: Forwarded unchanged to `subprocess.check_call`.

    Returns:
        The return value of `subprocess.check_call` (0 on success).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    # subprocess accepts path-like arguments, so stringify each one for display
    # rather than letting str.join raise TypeError before the command runs.
    print("Running:", " ".join(map(str, args)))
    return subprocess.check_call(args, **kwargs)
|
||
|
||
def add_ssh_key(ssh_key_filename):
    """Add a private key to the running ssh-agent unless it is already loaded.

    Args:
        ssh_key_filename: Path to the key file handed to `ssh-keygen` and `ssh-add`.

    Raises:
        subprocess.CalledProcessError: If `ssh-keygen -lf` or `ssh-add <file>`
            fails, or if `ssh-add -l` exits with a status other than 0 or 1.
    """
    # `ssh-keygen -lf` prints e.g. "3072 SHA256:... key-name (RSA)"; the second
    # whitespace-separated field is the fingerprint.
    key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1]

    listing = subprocess.run(["ssh-add", "-l"], stdout=subprocess.PIPE, check=False)
    # `ssh-add -l` exits with status 1 when the agent simply holds no
    # identities; that is not an error here, it just means nothing matches.
    if listing.returncode not in (0, 1):
        raise subprocess.CalledProcessError(listing.returncode, listing.args, output=listing.stdout)

    existing_keys = listing.stdout.decode("utf-8").splitlines()
    if any(key_hash in key for key in existing_keys):
        print('Found existing key in ssh-agent, skipping "ssh-add"')
        return

    subprocess.check_call(["ssh-add", ssh_key_filename])
|
||
|
||
def tpu_ssh(tpu_name, zone, *args):
    """Run a command on all workers of a TPU VM through `gcloud ... tpu-vm ssh`.

    The key at ~/.ssh/google_compute_engine is registered with the ssh-agent
    first (via `add_ssh_key`).

    Args:
        tpu_name: Name of the TPU VM passed to gcloud.
        zone: Value for gcloud's `--zone` flag.
        *args: Pieces of the remote command; they are joined with single
            spaces, without any quoting, into the `--command` value.

    Returns:
        Whatever `run_command` returns for the gcloud invocation.
    """
    key_path = os.path.expanduser("~/.ssh/google_compute_engine")
    add_ssh_key(key_path)

    remote_command = " ".join(args)
    gcloud_argv = [
        "gcloud",
        "alpha",
        "compute",
        "tpus",
        "tpu-vm",
        "ssh",
        tpu_name,
        "--worker=all",
        f"--zone={zone}",
        f"--command={remote_command}",
    ]
    return run_command(*gcloud_argv)
|
||
|
||
# Oddly enough, there's no API to simply fetch the current gcloud configuration... | ||
def gcloud_config():
    """Return a dict whose "project" entry is the project of a new `storage.Client`.

    A storage client is constructed solely to read its `project` attribute.
    """
    project = storage.Client().project
    return {"project": project}
|
||
|
||
def add_arg(
    parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
):
    """Register an argument whose default may come from `config` or the environment.

    The lookup key is the first flag with leading dashes removed and remaining
    dashes turned into underscores. Precedence, lowest to highest: the
    `default` parameter, `config[key]`, then the environment variable named
    by the upper-cased key.

    Args:
        parser: Parser that receives the argument.
        config: Mapping of previously saved values, keyed by the derived key.
        flags: Option strings for `add_argument`; the first one determines the key.
        required: Mark the argument as required, but only when no default was resolved.
        default: Fallback default used when neither config nor environment has a value.
        **kw: Extra keyword arguments forwarded to `parser.add_argument`.
    """
    dest = flags[0].lstrip("-").replace("-", "_")

    resolved = config[dest] if dest in config else default
    # The environment takes precedence over both the config and the fallback.
    resolved = os.environ.get(dest.upper(), resolved)

    if resolved is None:
        if required:
            kw["required"] = True
    else:
        kw["default"] = resolved

    parser.add_argument(*flags, **kw)
|
||
|
||
def load_config():
    """Load saved settings from the `.config` YAML file in the current directory.

    Returns:
        The parsed YAML document, or an empty dict when `.config` does not
        exist or contains no document.
    """
    if not os.path.exists(".config"):
        return {}

    # Context manager so the file handle is closed deterministically.
    with open(".config", "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.SafeLoader)

    # An empty file parses to None; normalise so callers can always do `key in config`.
    return config if config is not None else {}
Oops, something went wrong.