Cleanup scripts and split Dockerfiles into base and incremental.
rjpower committed May 28, 2024
1 parent e3c3e55 commit 9d51405
Showing 6 changed files with 108 additions and 62 deletions.
9 changes: 1 addition & 8 deletions docker/tpu/Dockerfile → docker/tpu/Dockerfile.base
@@ -15,11 +15,4 @@ RUN /opt/levanter/.venv/bin/pip install -e '.[test]'
FROM python:3.10

WORKDIR /opt/levanter
COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
RAY_USAGE_STATS_ENABLED=0

ADD . /opt/levanter/
# Setup venv Python as the default
ENV PATH=/opt/levanter/.venv/bin:$PATH
COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
16 changes: 16 additions & 0 deletions docker/tpu/Dockerfile.incremental
@@ -0,0 +1,16 @@
ARG REPO_LOCATION=us-west4-docker.pkg.dev/beastmaster-408319/levanter
ARG BASE_VERSION=latest

FROM ${REPO_LOCATION}/levanter:${BASE_VERSION}

ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
RAY_USAGE_STATS_ENABLED=0\
PATH=/opt/levanter/.venv/bin:$PATH

WORKDIR /opt/levanter

ADD pyproject.toml README.md /opt/levanter/
RUN pip install -e '.[test]'

ADD . /opt/levanter
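The incremental image layers the current checkout on top of a prebuilt base, so routine launches only re-run the cheap dependency install and `ADD` steps. A rough sketch of how the pair might be built by hand (the `local/` tags, build args, and platform flag are assumptions for illustration; `push_docker.py` below drives the real builds):

```bash
# Build the base image once (slow: it builds the full virtualenv).
docker buildx build --platform=linux/amd64 \
    -f docker/tpu/Dockerfile.base -t local/levanter:latest .

# Rebuild only the incremental image when code changes (fast), pointing its
# FROM line at the locally tagged base via the build args.
docker buildx build --platform=linux/amd64 \
    --build-arg REPO_LOCATION=local --build-arg BASE_VERSION=latest \
    -f docker/tpu/Dockerfile.incremental -t local/levanter-code:latest .
```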
3 changes: 3 additions & 0 deletions docs/Getting-Started-TPU-VM.md
@@ -87,6 +87,9 @@ Now that you have a TPU VM instance, you can follow the [Getting Started](Gettin

### Launch a GPT-2 Small in unattended mode

You will need a [Docker installation](https://docs.docker.com/engine/install/)
on your development machine to build and run images on TPUs.

First create a configuration file for future launches in your Levanter directory:

```
46 changes: 25 additions & 21 deletions infra/launch.py
@@ -9,7 +9,7 @@

import yaml

from infra import deploy
from infra import push_docker
from google.cloud import storage


@@ -21,15 +21,23 @@ def gcloud_config():
}


def _arg_default(config: typing.Dict, key: str, required=False, default=None):
"""For argparse: if value is in config, use it, otherwise mark the argument required."""
def _add_arg(
parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
):
# Precedence (lowest to highest): config file, then environment variable, then command-line argument
key = flags[0].lstrip("-").replace("-", "_")
if key in config:
return {"default": config[key]}
if required:
return {"required": True}
default = config[key]

if key.upper() in os.environ:
default = os.environ[key.upper()]

if default is not None:
return {"default": default}
return {}
kw["default"] = default
elif required:
kw["required"] = True

parser.add_argument(*flags, **kw)


if __name__ == "__main__":
@@ -40,16 +48,12 @@ def _arg_default(config: typing.Dict, key: str, required=False, default=None):
else:
config = {}

parser.add_argument("--tpu", type=str, **_arg_default(config, "tpu", required=True))
parser.add_argument("--project", type=str, default=gcloud_config()["project"])
parser.add_argument("--zone", type=str, **_arg_default(config, "zone", required=True))
parser.add_argument(
"--docker_repository", type=str, **_arg_default(config, "docker_repository", default="levanter")
)
parser.add_argument(
"--image_name", type=str, **_arg_default(config, "image_name", default=f"levanter-{getpass.getuser()}")
)
parser.add_argument("--foreground", action="store_true", default=False)
_add_arg(parser, config, ["--tpu"], required=True)
_add_arg(parser, config, ["--zone"], required=True)
_add_arg(parser, config, ["--docker-repository"], default="levanter")
_add_arg(parser, config, ["--image-name"], default=f"levanter-{getpass.getuser()}")
_add_arg(parser, config, ["--foreground"], default=False, action="store_true")
_add_arg(parser, config, ["--project"], default=gcloud_config()["project"])
parser.add_argument(
"-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items()
)
@@ -67,16 +71,16 @@ def _arg_default(config: typing.Dict, key: str, required=False, default=None):
foreground = args.foreground
env = {k: v for k, v in args.env}

if not "WANDB_PROJECT" in env:
env.append["WANDB_PROJECT"] = "levanter"
if "WANDB_PROJECT" not in env:
env["WANDB_PROJECT"] = "levanter"

if command[0] == "--":
command = command[1:]

# make an image tag based on the unix timestamp to ensure we always pull the latest image
image_tag = int(time.time())

full_image_id = deploy.push_to_gcp(
full_image_id = push_docker.push_to_gcp(
project_id=project,
region=region,
repository=docker_repository,
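With `_add_arg`, each flag's value is resolved from the `.config` file first, then from an upper-cased environment variable, and finally from an explicit command-line argument, which always wins. A hypothetical pair of invocations (the TPU name, zone, training module, and config path are placeholders, not taken from this commit):

```bash
# Zone given explicitly on the command line (highest precedence).
python infra/launch.py --tpu my-tpu --zone us-west4-a -- \
    python -m levanter.main.train_lm --config_path config/gpt2_small.yaml

# Zone picked up from the ZONE environment variable instead.
ZONE=us-west4-a python infra/launch.py --tpu my-tpu -- \
    python -m levanter.main.train_lm --config_path config/gpt2_small.yaml
```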
23 changes: 23 additions & 0 deletions infra/launch.sh
@@ -0,0 +1,23 @@
#!/bin/bash
# Launches a command on TPU pods (or other direct-run environments) over remote ssh, inside a virtual env
set -e
umask 000
LEV_ROOT=$(dirname "$(readlink -f $0)")/..

# figure out the venv: first check whether a path was written to infra/venv_path.txt
if [ ! -d "$VENV" ] && [ -f "$LEV_ROOT/infra/venv_path.txt" ]; then
VENV=$(cat "$LEV_ROOT"/infra/venv_path.txt)
fi

# if we still don't have a venv, fall back to the default locations
if [ ! -d "$VENV" ]; then
VENV=/files/venv32
fi

if [ ! -d "$VENV" ]; then
VENV=~/files/venv310
fi

source $VENV/bin/activate

PYTHONPATH=${LEV_ROOT}:${LEV_ROOT}/src:${LEV_ROOT}/examples:$PYTHONPATH nohup "$@" >& "$HOME/log-$(hostname).log" &
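`launch.sh` activates whichever virtualenv it finds (from `$VENV`, `infra/venv_path.txt`, or the hard-coded fallbacks), then runs the given command under `nohup` so it survives the ssh session. A hypothetical invocation on a single host (the venv path, training module, and config path are assumptions, not part of this commit):

```bash
# Activate the discovered venv and run a training command in the background;
# output goes to a per-host log file.
VENV=/files/venv32 ./infra/launch.sh \
    python -m levanter.main.train_lm --config_path config/gpt2_small.yaml
```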
73 changes: 40 additions & 33 deletions infra/deploy.py → infra/push_docker.py
@@ -1,7 +1,7 @@
#!/usr/bin/python

"""
Build and deploy the Levanter base image to Artifact Registry.
Build and deploy the Levanter base image to Artifact Registry or Docker Hub.
It is not necessary to run this yourself unless you are deploying a new base image: the launch
script will automatically build and deploy an image based on your current code.
@@ -11,7 +11,7 @@
import json
import subprocess

CLEANUP_POLICY = [
GCP_CLEANUP_POLICY = [
{
"name": "delete-stale",
"action": {"type": "Delete"},
@@ -47,7 +47,7 @@ def configure_gcp_docker(project_id, region, repository):
"artifacts",
"repositories",
"create",
"levanter",
repository,
f"--location={region}",
"--repository-format=docker",
],
Expand All @@ -60,7 +60,7 @@ def configure_gcp_docker(project_id, region, repository):
raise

with open("/tmp/cleanup-policy.json", "w") as f:
json.dump(CLEANUP_POLICY, f, indent=2)
json.dump(GCP_CLEANUP_POLICY, f, indent=2)

_run(
[
@@ -109,12 +109,9 @@ def configure_gcp_docker(project_id, region, repository):
_run(["gcloud", "auth", "configure-docker", f"{region}-docker.pkg.dev"])


def push_to_gcp(project_id, region, repository, image_name, image_tag="latest") -> str:
def build_docker(docker_file, image_name, image_tag) -> str:
"""Builds a Docker image, enables artifact access, and pushes to Artifact Registry."""

artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"
configure_gcp_docker(project_id, region, repository)

_run(
[
"docker",
@@ -124,46 +121,56 @@ def push_to_gcp(project_id, region, repository, image_name, image_tag="latest")
"-t",
f"{image_name}:{image_tag}",
"-f",
"docker/tpu/Dockerfile",
docker_file,
".",
]
)

full_image_name = f"{artifact_repo}/{image_name}:{image_tag}"
_run(["docker", "tag", image_name, full_image_name])
_run(["docker", "push", full_image_name])
return f"{image_name}:{image_tag}"

return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{image_tag}"

# Disabled until we can figure out how Docker hub organizations work
# def push_to_docker_hub(local_image, target_image, image_tag):
# """Pushes a local Docker image to Docker Hub."""
# local_image = build_docker(local_image, image_tag)

def push_to_docker_hub(image_name):
"""Builds a Docker image and pushes to Docker hub."""
# _run(["docker", "tag", local_image, f"{target_image}:{image_tag}"])
# _run(["docker", "push", f"{target_image}:{image_tag}"])
# return target_image

_run(
[
"docker",
"buildx",
"build",
"--platform=linux/amd64",
"-t",
image_name,
"-f",
"docker/tpu/Dockerfile",
".",
]
)

full_image_name = f"levanter/{image_name}:latest"
_run(["docker", "tag", full_image_name])
def push_to_gcp(
project_id, region, repository, image_name, image_tag, docker_file="docker/tpu/Dockerfile.incremental"
) -> str:
"""Pushes a local Docker image to Artifact Registry."""

local_image = build_docker(docker_file=docker_file, image_name=image_name, image_tag=image_tag)

artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"

full_image_name = f"{artifact_repo}/{image_name}:{image_tag}"
_run(["docker", "tag", local_image, full_image_name])
_run(["docker", "push", full_image_name])

return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{image_tag}"


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.")
parser.add_argument("--project", required=True, help="GCP project ID")
parser.add_argument("--region", required=True, help="Artifact Registry region (e.g., us-west4)")
parser.add_argument("--project", help="GCP project ID", required=True)
parser.add_argument("--region", help="Artifact Registry region (e.g., us-west4)", required=True)
parser.add_argument("--repository", default="levanter", help="Artifact Registry repository name")
parser.add_argument("--image", default="levanter", help="Docker image name.")
parser.add_argument("--image_tag", default="latest", help="Docker image tag.")

args = parser.parse_args()

push_to_gcp(args.project, args.region, args.repository, args.image)
configure_gcp_docker(args.project, args.region, args.repository)
push_to_gcp(
args.project,
args.region,
args.repository,
args.image,
args.image_tag,
docker_file="docker/tpu/Dockerfile.base",
)
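The `__main__` path now configures the registry and rebuilds the base image from `docker/tpu/Dockerfile.base`, while `launch.py` calls `push_to_gcp` with the default `docker/tpu/Dockerfile.incremental` for day-to-day launches. A hedged example of refreshing the base image by hand (the project and region values are placeholders):

```bash
python infra/push_docker.py --project my-gcp-project --region us-west4 \
    --repository levanter --image levanter --image_tag latest
```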
