Cleanup scripts and split Dockerfiles into base and incremental.
rjpower committed May 28, 2024
1 parent e3c3e55 commit 9d51405
Showing 6 changed files with 108 additions and 62 deletions.
9 changes: 1 addition & 8 deletions docker/tpu/Dockerfile → docker/tpu/Dockerfile.base
@@ -15,11 +15,4 @@ RUN /opt/levanter/.venv/bin/pip install -e '.[test]'
FROM python:3.10

WORKDIR /opt/levanter
COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
RAY_USAGE_STATS_ENABLED=0

ADD . /opt/levanter/
# Setup venv Python as the default
ENV PATH=/opt/levanter/.venv/bin:$PATH
COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
16 changes: 16 additions & 0 deletions docker/tpu/Dockerfile.incremental
@@ -0,0 +1,16 @@
ARG REPO_LOCATION=us-west4-docker.pkg.dev/beastmaster-408319/levanter
ARG BASE_VERSION=latest

FROM ${REPO_LOCATION}/levanter:${BASE_VERSION}

ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
RAY_USAGE_STATS_ENABLED=0\
PATH=/opt/levanter/.venv/bin:$PATH

WORKDIR /opt/levanter

ADD pyproject.toml README.md /opt/levanter/
RUN pip install -e '.[test]'

ADD . /opt/levanter
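The incremental image layers the current checkout on top of a prebuilt base, so routine launches only re-run the cheap dependency install and `ADD` steps. A rough sketch of how the pair might be built by hand (the `local/` tags, build args, and platform flag are assumptions for illustration; `push_docker.py` below drives the real builds):

```bash
# Build the base image once (slow: it builds the full virtualenv).
docker buildx build --platform=linux/amd64 \
    -f docker/tpu/Dockerfile.base -t local/levanter:latest .

# Rebuild only the incremental image when code changes (fast), pointing its
# FROM line at the locally tagged base via the build args.
docker buildx build --platform=linux/amd64 \
    --build-arg REPO_LOCATION=local --build-arg BASE_VERSION=latest \
    -f docker/tpu/Dockerfile.incremental -t local/levanter-code:latest .
```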
3 changes: 3 additions & 0 deletions docs/Getting-Started-TPU-VM.md
@@ -87,6 +87,9 @@ Now that you have a TPU VM instance, you can follow the [Getting Started](Gettin

### Launch a GPT-2 Small in unattended mode

You will need a [Docker installation](https://docs.docker.com/engine/install/)
on your development machine to build and run images on TPUs.

First create a configuration file for future launches in your Levanter directory:

```
46 changes: 25 additions & 21 deletions infra/launch.py
@@ -9,7 +9,7 @@

import yaml

from infra import deploy
from infra import push_docker
from google.cloud import storage


@@ -21,15 +21,23 @@ def gcloud_config():
}


def _arg_default(config: typing.Dict, key: str, required=False, default=None):
"""For argparse: if value is in config, use it, otherwise mark the argument required."""
def _add_arg(
parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
):
# Precedence (lowest to highest): config file, then environment variable, then command-line argument
key = flags[0].lstrip("-").replace("-", "_")
if key in config:
return {"default": config[key]}
if required:
return {"required": True}
default = config[key]

if key.upper() in os.environ:
default = os.environ[key.upper()]

if default is not None:
return {"default": default}
return {}
kw["default"] = default
elif required:
kw["required"] = True

parser.add_argument(*flags, **kw)


if __name__ == "__main__":
@@ -40,16 +48,12 @@ def _arg_default(config: typing.Dict, key: str, required=False, default=None):
else:
config = {}

parser.add_argument("--tpu", type=str, **_arg_default(config, "tpu", required=True))
parser.add_argument("--project", type=str, default=gcloud_config()["project"])
parser.add_argument("--zone", type=str, **_arg_default(config, "zone", required=True))
parser.add_argument(
"--docker_repository", type=str, **_arg_default(config, "docker_repository", default="levanter")
)
parser.add_argument(
"--image_name", type=str, **_arg_default(config, "image_name", default=f"levanter-{getpass.getuser()}")
)
parser.add_argument("--foreground", action="store_true", default=False)
_add_arg(parser, config, ["--tpu"], required=True)
_add_arg(parser, config, ["--zone"], required=True)
_add_arg(parser, config, ["--docker-repository"], default="levanter")
_add_arg(parser, config, ["--image-name"], default=f"levanter-{getpass.getuser()}")
_add_arg(parser, config, ["--foreground"], default=False, action="store_true")
_add_arg(parser, config, ["--project"], default=gcloud_config()["project"])
parser.add_argument(
"-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items()
)
@@ -67,16 +71,16 @@ def _arg_default(config: typing.Dict, key: str, required=False, default=None):
foreground = args.foreground
env = {k: v for k, v in args.env}

if not "WANDB_PROJECT" in env:
env.append["WANDB_PROJECT"] = "levanter"
if "WANDB_PROJECT" not in env:
env["WANDB_PROJECT"] = "levanter"

if command[0] == "--":
command = command[1:]

# make an image tag based on the unix timestamp to ensure we always pull the latest image
image_tag = int(time.time())

full_image_id = deploy.push_to_gcp(
full_image_id = push_docker.push_to_gcp(
project_id=project,
region=region,
repository=docker_repository,
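With `_add_arg`, each flag's value is resolved from the `.config` file first, then from an upper-cased environment variable, and finally from an explicit command-line argument, which always wins. A hypothetical pair of invocations (the TPU name, zone, training module, and config path are placeholders, not taken from this commit):

```bash
# Zone given explicitly on the command line (highest precedence).
python infra/launch.py --tpu my-tpu --zone us-west4-a -- \
    python -m levanter.main.train_lm --config_path config/gpt2_small.yaml

# Zone picked up from the ZONE environment variable instead.
ZONE=us-west4-a python infra/launch.py --tpu my-tpu -- \
    python -m levanter.main.train_lm --config_path config/gpt2_small.yaml
```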
23 changes: 23 additions & 0 deletions infra/launch.sh
@@ -0,0 +1,23 @@
#!/bin/bash
# Launches a command on TPU pods (or other direct-run environments) over remote ssh, inside a virtual env
set -e
umask 000
LEV_ROOT=$(dirname "$(readlink -f $0)")/..

# figure out the venv: first check whether a path was written to infra/venv_path.txt
if [ ! -d "$VENV" ] && [ -f "$LEV_ROOT/infra/venv_path.txt" ]; then
VENV=$(cat "$LEV_ROOT"/infra/venv_path.txt)
fi

# if we still don't have a venv, fall back to the default locations
if [ ! -d "$VENV" ]; then
VENV=/files/venv32
fi

if [ ! -d "$VENV" ]; then
VENV=~/files/venv310
fi

source $VENV/bin/activate

PYTHONPATH=${LEV_ROOT}:${LEV_ROOT}/src:${LEV_ROOT}/examples:$PYTHONPATH nohup "$@" >& "$HOME/log-$(hostname).log" &
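`launch.sh` activates whichever virtualenv it finds (from `$VENV`, `infra/venv_path.txt`, or the hard-coded fallbacks), then runs the given command under `nohup` so it survives the ssh session. A hypothetical invocation on a single host (the venv path, training module, and config path are assumptions, not part of this commit):

```bash
# Activate the discovered venv and run a training command in the background;
# output goes to a per-host log file.
VENV=/files/venv32 ./infra/launch.sh \
    python -m levanter.main.train_lm --config_path config/gpt2_small.yaml
```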
73 changes: 40 additions & 33 deletions infra/deploy.py → infra/push_docker.py
@@ -1,7 +1,7 @@
#!/usr/bin/python

"""
Build and deploy the Levanter base image to Artifact Registry.
Build and deploy the Levanter base image to Artifact Registry or Docker Hub.
It is not necessary to run this yourself unless you are deploying a new base image: the launch
script will automatically build and deploy an image based on your current code.
@@ -11,7 +11,7 @@
import json
import subprocess

CLEANUP_POLICY = [
GCP_CLEANUP_POLICY = [
{
"name": "delete-stale",
"action": {"type": "Delete"},
@@ -47,7 +47,7 @@ def configure_gcp_docker(project_id, region, repository):
"artifacts",
"repositories",
"create",
"levanter",
repository,
f"--location={region}",
"--repository-format=docker",
],
Expand All @@ -60,7 +60,7 @@ def configure_gcp_docker(project_id, region, repository):
raise

with open("/tmp/cleanup-policy.json", "w") as f:
json.dump(CLEANUP_POLICY, f, indent=2)
json.dump(GCP_CLEANUP_POLICY, f, indent=2)

_run(
[
@@ -109,12 +109,9 @@ def configure_gcp_docker(project_id, region, repository):
_run(["gcloud", "auth", "configure-docker", f"{region}-docker.pkg.dev"])


def push_to_gcp(project_id, region, repository, image_name, image_tag="latest") -> str:
def build_docker(docker_file, image_name, image_tag) -> str:
"""Builds a Docker image, enables artifact access, and pushes to Artifact Registry."""

artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"
configure_gcp_docker(project_id, region, repository)

_run(
[
"docker",
@@ -124,46 +121,56 @@ def push_to_gcp(project_id, region, repository, image_name, image_tag="latest")
"-t",
f"{image_name}:{image_tag}",
"-f",
"docker/tpu/Dockerfile",
docker_file,
".",
]
)

full_image_name = f"{artifact_repo}/{image_name}:{image_tag}"
_run(["docker", "tag", image_name, full_image_name])
_run(["docker", "push", full_image_name])
return f"{image_name}:{image_tag}"

return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{image_tag}"

# Disabled until we can figure out how Docker hub organizations work
# def push_to_docker_hub(local_image, target_image, image_tag):
# """Pushes a local Docker image to Docker Hub."""
# local_image = build_docker(local_image, image_tag)

def push_to_docker_hub(image_name):
"""Builds a Docker image and pushes to Docker hub."""
# _run(["docker", "tag", local_image, f"{target_image}:{image_tag}"])
# _run(["docker", "push", f"{target_image}:{image_tag}"])
# return target_image

_run(
[
"docker",
"buildx",
"build",
"--platform=linux/amd64",
"-t",
image_name,
"-f",
"docker/tpu/Dockerfile",
".",
]
)

full_image_name = f"levanter/{image_name}:latest"
_run(["docker", "tag", full_image_name])
def push_to_gcp(
project_id, region, repository, image_name, image_tag, docker_file="docker/tpu/Dockerfile.incremental"
) -> str:
"""Pushes a local Docker image to Artifact Registry."""

local_image = build_docker(docker_file=docker_file, image_name=image_name, image_tag=image_tag)

artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"

full_image_name = f"{artifact_repo}/{image_name}:{image_tag}"
_run(["docker", "tag", local_image, full_image_name])
_run(["docker", "push", full_image_name])

return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{image_tag}"


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.")
parser.add_argument("--project", required=True, help="GCP project ID")
parser.add_argument("--region", required=True, help="Artifact Registry region (e.g., us-west4)")
parser.add_argument("--project", help="GCP project ID", required=True)
parser.add_argument("--region", help="Artifact Registry region (e.g., us-west4)", required=True)
parser.add_argument("--repository", default="levanter", help="Artifact Registry repository name")
parser.add_argument("--image", default="levanter", help="Docker image name.")
parser.add_argument("--image_tag", default="latest", help="Docker image tag.")

args = parser.parse_args()

push_to_gcp(args.project, args.region, args.repository, args.image)
configure_gcp_docker(args.project, args.region, args.repository)
push_to_gcp(
args.project,
args.region,
args.repository,
args.image,
args.image_tag,
docker_file="docker/tpu/Dockerfile.base",
)
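The `__main__` path now configures the registry and rebuilds the base image from `docker/tpu/Dockerfile.base`, while `launch.py` calls `push_to_gcp` with the default `docker/tpu/Dockerfile.incremental` for day-to-day launches. A hedged example of refreshing the base image by hand (the project and region values are placeholders):

```bash
python infra/push_docker.py --project my-gcp-project --region us-west4 \
    --repository levanter --image levanter --image_tag latest
```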
