diff --git a/docker/tpu/Dockerfile b/docker/tpu/Dockerfile.base similarity index 67% rename from docker/tpu/Dockerfile rename to docker/tpu/Dockerfile.base index 496807245..f82682307 100644 --- a/docker/tpu/Dockerfile +++ b/docker/tpu/Dockerfile.base @@ -15,11 +15,4 @@ RUN /opt/levanter/.venv/bin/pip install -e '.[test]' FROM python:3.10 WORKDIR /opt/levanter -COPY --from=build /opt/levanter/.venv /opt/levanter/.venv -ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ - TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\ - RAY_USAGE_STATS_ENABLED=0 - -ADD . /opt/levanter/ -# Setup venv Python as the default -ENV PATH=/opt/levanter/.venv/bin:$PATH \ No newline at end of file +COPY --from=build /opt/levanter/.venv /opt/levanter/.venv \ No newline at end of file diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental new file mode 100644 index 000000000..8ec4cbd05 --- /dev/null +++ b/docker/tpu/Dockerfile.incremental @@ -0,0 +1,16 @@ +ARG REPO_LOCATION=us-west4-docker.pkg.dev/beastmaster-408319/levanter +ARG BASE_VERSION=latest + +FROM ${REPO_LOCATION}/levanter:${BASE_VERSION} + +ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ + TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\ + RAY_USAGE_STATS_ENABLED=0\ + PATH=/opt/levanter/.venv/bin:$PATH + +WORKDIR /opt/levanter + +ADD pyproject.toml README.md /opt/levanter/ +RUN pip install -e '.[test]' + +ADD . /opt/levanter \ No newline at end of file diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index 1012b0d06..1d2a50945 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -87,6 +87,9 @@ Now that you have a TPU VM instance, you can follow the [Getting Started](Gettin ### Launch a GPT-2 Small in unattended mode +You will need a [Docker installation](https://docs.docker.com/engine/install/) +on your development machine to build and run images on TPUs. 
+ First create a configuration file for future launches in your Levanter directory: ``` diff --git a/infra/launch.py b/infra/launch.py index c5d62e252..b8397ec50 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -9,7 +9,7 @@ import yaml -from infra import deploy +from infra import push_docker from google.cloud import storage @@ -21,15 +21,23 @@ def gcloud_config(): } -def _arg_default(config: typing.Dict, key: str, required=False, default=None): - """For argparse: if value is in config, use it, otherwise mark the argument required.""" +def _add_arg( + parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw +): + # Precedence (lowest to highest): config file, then environment, then direct arguments + key = flags[0].lstrip("-").replace("-", "_") if key in config: - return {"default": config[key]} - if required: - return {"required": True} + default = config[key] + + if key.upper() in os.environ: + default = os.environ[key.upper()] + if default is not None: - return {"default": default} - return {} + kw["default"] = default + elif required: + kw["required"] = True + + parser.add_argument(*flags, **kw) if __name__ == "__main__": @@ -40,16 +48,12 @@ def _arg_default(config: typing.Dict, key: str, required=False, default=None): else: config = {} - parser.add_argument("--tpu", type=str, **_arg_default(config, "tpu", required=True)) - parser.add_argument("--project", type=str, default=gcloud_config()["project"]) - parser.add_argument("--zone", type=str, **_arg_default(config, "zone", required=True)) - parser.add_argument( - "--docker_repository", type=str, **_arg_default(config, "docker_repository", default="levanter") - ) - parser.add_argument( - "--image_name", type=str, **_arg_default(config, "image_name", default=f"levanter-{getpass.getuser()}") - ) - parser.add_argument("--foreground", action="store_true", default=False) + _add_arg(parser, config, ["--tpu"], required=True) + _add_arg(parser, config, ["--zone"], 
required=True) + _add_arg(parser, config, ["--docker-repository"], default="levanter") + _add_arg(parser, config, ["--image-name"], default=f"levanter-{getpass.getuser()}") + _add_arg(parser, config, ["--foreground"], default=False, action="store_true") + _add_arg(parser, config, ["--project"], default=gcloud_config()["project"]) parser.add_argument( - "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items() + "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items()) ) @@ -67,8 +71,8 @@ def _arg_default(config: typing.Dict, key: str, required=False, default=None): foreground = args.foreground env = {k: v for k, v in args.env} - if not "WANDB_PROJECT" in env: - env.append["WANDB_PROJECT"] = "levanter" + if "WANDB_PROJECT" not in env: + env["WANDB_PROJECT"] = "levanter" if command[0] == "--": command = command[1:] @@ -76,7 +80,7 @@ def _arg_default(config: typing.Dict, key: str, required=False, default=None): # make an image tag based on the unix timestamp to ensure we always pull the latest image image_tag = int(time.time()) - full_image_id = deploy.push_to_gcp( + full_image_id = push_docker.push_to_gcp( project_id=project, region=region, repository=docker_repository, diff --git a/infra/launch.sh b/infra/launch.sh new file mode 100755 index 000000000..055891350 --- /dev/null +++ b/infra/launch.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# This script is used for launching on TPU pods (or other direct run environments) via remote ssh with a virtual env +set -e +umask 000 +LEV_ROOT=$(dirname "$(readlink -f "$0")")/.. + +# figure out venv, first check if we wrote a path in infra/venv_path +if [ ! -d "$VENV" ] && [ -f "$LEV_ROOT/infra/venv_path.txt" ]; then + VENV=$(cat "$LEV_ROOT"/infra/venv_path.txt) +fi + +# if we still don't have a venv, we'll look in our default +if [ ! -d "$VENV" ]; then + VENV=/files/venv32 +fi + +if [ ! 
-d "$VENV" ]; then + VENV=~/files/venv310 +fi + +source "$VENV/bin/activate" + +PYTHONPATH=${LEV_ROOT}:${LEV_ROOT}/src:${LEV_ROOT}/examples:$PYTHONPATH nohup "$@" >& "$HOME/log-$(hostname).log" & diff --git a/infra/deploy.py b/infra/push_docker.py similarity index 69% rename from infra/deploy.py rename to infra/push_docker.py index 90fdae6b6..a0354c975 100644 --- a/infra/deploy.py +++ b/infra/push_docker.py @@ -1,7 +1,7 @@ #!/usr/bin/python """ -Build and deploy the Levanter base image to Artifact Registry. +Build and deploy the Levanter base image to Artifact Registry or Docker Hub. It is not necessary to run this yourself unless you are deploying a new base image: the launch script will automatically build and deploy an image based on your current code. @@ -11,7 +11,7 @@ import json import subprocess -CLEANUP_POLICY = [ +GCP_CLEANUP_POLICY = [ { "name": "delete-stale", "action": {"type": "Delete"}, @@ -47,7 +47,7 @@ def configure_gcp_docker(project_id, region, repository): "artifacts", "repositories", "create", - "levanter", + repository, f"--location={region}", "--repository-format=docker", ], @@ -60,7 +60,7 @@ def configure_gcp_docker(project_id, region, repository): raise with open("/tmp/cleanup-policy.json", "w") as f: - json.dump(CLEANUP_POLICY, f, indent=2) + json.dump(GCP_CLEANUP_POLICY, f, indent=2) _run( [ @@ -109,12 +109,9 @@ def configure_gcp_docker(project_id, region, repository): _run(["gcloud", "auth", "configure-docker", f"{region}-docker.pkg.dev"]) -def push_to_gcp(project_id, region, repository, image_name, image_tag="latest") -> str: +def build_docker(docker_file, image_name, image_tag) -> str: - """Builds a Docker image, enables artifact access, and pushes to Artifact Registry.""" + """Builds a Docker image from `docker_file` and returns its local `image_name:image_tag` reference.""" - artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" - configure_gcp_docker(project_id, region, repository) - _run( [ "docker", @@ -124,46 +121,56 @@ def 
push_to_gcp(project_id, region, repository, image_name, image_tag="latest") "-t", 
f"{image_name}:{image_tag}", "-f", - "docker/tpu/Dockerfile", + docker_file, ".", ] ) - full_image_name = f"{artifact_repo}/{image_name}:{image_tag}" - _run(["docker", "tag", image_name, full_image_name]) - _run(["docker", "push", full_image_name]) + return f"{image_name}:{image_tag}" - return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{image_tag}" +# Disabled until we can figure out how Docker hub organizations work +# def push_to_docker_hub(local_image, target_image, image_tag): +# """Pushes a local Docker image to Docker Hub.""" +# local_image = build_docker(local_image, image_tag) -def push_to_docker_hub(image_name): - """Builds a Docker image and pushes to Docker hub.""" +# _run(["docker", "tag", local_image, f"{target_image}:{image_tag}"]) +# _run(["docker", "push", f"{target_image}:{image_tag}"]) +# return target_image - _run( - [ - "docker", - "buildx", - "build", - "--platform=linux/amd64", - "-t", - image_name, - "-f", - "docker/tpu/Dockerfile", - ".", - ] - ) - full_image_name = f"levanter/{image_name}:latest" - _run(["docker", "tag", full_image_name]) +def push_to_gcp( + project_id, region, repository, image_name, image_tag, docker_file="docker/tpu/Dockerfile.incremental" +) -> str: + """Builds the image from `docker_file`, pushes it to Artifact Registry, and returns the full remote image name.""" + + local_image = build_docker(docker_file=docker_file, image_name=image_name, image_tag=image_tag) + + artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" + + full_image_name = f"{artifact_repo}/{image_name}:{image_tag}" + _run(["docker", "tag", local_image, full_image_name]) _run(["docker", "push", full_image_name]) + return full_image_name + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.") - parser.add_argument("--project", required=True, help="GCP project ID") - parser.add_argument("--region", required=True, help="Artifact Registry region 
(e.g., us-west4)") + parser.add_argument("--project", help="GCP project ID", required=True) + parser.add_argument("--region", help="Artifact Registry region (e.g., us-west4)", required=True) parser.add_argument("--repository", default="levanter", help="Artifact Registry repository name") parser.add_argument("--image", default="levanter", help="Docker image name.") + parser.add_argument("--image_tag", default="latest", help="Docker image tag.") + args = parser.parse_args() - push_to_gcp(args.project, args.region, args.repository, args.image) + configure_gcp_docker(args.project, args.region, args.repository) + push_to_gcp( + args.project, + args.region, + args.repository, + args.image, + args.image_tag, + docker_file="docker/tpu/Dockerfile.base", + )