From 3b3173c8f846d387ba85182a1ca67020b07906ff Mon Sep 17 00:00:00 2001
From: Russell Power <russell.power@gmail.com>
Date: Fri, 24 May 2024 22:11:37 -0700
Subject: [PATCH 1/7] Setup Docker for TPU execution and update infra scripts.

I tried to reduce the Docker image size a bit by using a multi-stage build,
since Ray currently requires a source build of Meson, which in turn requires a
Clang installation. Even with this, jax and libtpu are each >250MB installs on
their own, so there's no avoiding a large image size at the moment.

Still, with this configuration, a v5-32 (the most I could get given GCP's
stingy IP address allocation) takes about 50 seconds to run setup-vm.sh and
pull the initial image. After the initial pull, new deployments take a few
seconds to package up the current source directory.

It's still possible to use the `git clone` approach via a volume mount, but the
permissions are a bit finicky at that point, and I'm not sure how many options
we want to have.
---
 .dockerignore                            |   4 +-
 .github/workflows/tpu_unit_tests.yaml    |   6 +-
 .gitignore                               |   4 +
 docker/tpu/Dockerfile.base               |  17 ++
 docker/tpu/Dockerfile.incremental        |  17 ++
 docs/Getting-Started-TPU-VM.md           |  43 +++--
 docs/Training-On-Your-Data.md            |  17 +-
 docs/tutorials/Training-On-Audio-Data.md |   2 +-
 infra/__init__.py                        |   0
 infra/helpers/cli.py                     |  80 +++++++++
 infra/launch.py                          | 211 +++++++++++++++++++++++
 infra/push_docker.py                     | 210 ++++++++++++++++++++++
 pyproject.toml                           |  12 +-
 src/levanter/tracker/wandb.py            |   3 +
 14 files changed, 607 insertions(+), 19 deletions(-)
 create mode 100644 docker/tpu/Dockerfile.base
 create mode 100644 docker/tpu/Dockerfile.incremental
 create mode 100644 infra/__init__.py
 create mode 100644 infra/helpers/cli.py
 create mode 100755 infra/launch.py
 create mode 100644 infra/push_docker.py

diff --git a/.dockerignore b/.dockerignore
index f5ceb7397..17fbbcfe1 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,5 @@
+.git
+
 scratch
 cache
 wandb
@@ -44,6 +46,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/figures/
 
 # PyBuilder
 target/
@@ -105,7 +108,6 @@ dmypy.json
 # JetBrains
 .idea/
 
-
 # dataset cache files
 **/*.parquet
 **/ledger.json
diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml
index 3e27426eb..4f3eaccb3 100644
--- a/.github/workflows/tpu_unit_tests.yaml
+++ b/.github/workflows/tpu_unit_tests.yaml
@@ -31,14 +31,12 @@ jobs:
           export TPU_NAME=ci-run-${{ github.run_id }}
           eval "$(ssh-agent -s)"
           TRUE_SHA=${{ github.event.pull_request.head.sha }}
-          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1
-#          infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \
-#            PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry"
+          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1
 
       - name: Run most tests
         run: |
           export TPU_NAME=ci-run-${{ github.run_id }}
-          gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'"
+          python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry"
 # Something's wrong with these
 #
 #      - name: Run forked tests
diff --git a/.gitignore b/.gitignore
index c66f6f352..835da2048 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 /scratch
 
+# Configuration for TPU launches/secrets
+.config
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -140,6 +143,7 @@ dmypy.json
 /wandb
 
 # dataset cache files
+/cache
 *.parquet
 ledger.json
 
diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base
new file mode 100644
index 000000000..9a93736b1
--- /dev/null
+++ b/docker/tpu/Dockerfile.base
@@ -0,0 +1,17 @@
+FROM python:3.10 AS build
+RUN apt-get update && apt-get install -y clang
+RUN pip install virtualenv
+
+# venv binaries encode their directory, so we need to setup the venv in the final location
+RUN virtualenv -p python3.10 /opt/levanter/.venv
+RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+
+# Add only the requirements files to cache dependency build/installation
+WORKDIR /tmp
+ADD pyproject.toml README.md /tmp/
+RUN /opt/levanter/.venv/bin/pip install -e '.[test]'
+
+FROM python:3.10
+
+WORKDIR /opt/levanter
+COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
\ No newline at end of file
diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental
new file mode 100644
index 000000000..10afb1ca7
--- /dev/null
+++ b/docker/tpu/Dockerfile.incremental
@@ -0,0 +1,17 @@
+ARG IMAGE=ghcr.io/rjpower/levanter
+ARG TAG=latest
+
+FROM ${IMAGE}:${TAG}
+
+ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
+    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
+    RAY_USAGE_STATS_ENABLED=0\
+    PATH=/opt/levanter/.venv/bin:$PATH\
+    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests\
+    HOME=/home/levanter
+
+WORKDIR /opt/levanter
+
+ADD pyproject.toml README.md /opt/levanter/
+RUN pip install -e '.[test]'
+ADD . /opt/levanter
\ No newline at end of file
diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
index fe73eef70..1d2a50945 100644
--- a/docs/Getting-Started-TPU-VM.md
+++ b/docs/Getting-Started-TPU-VM.md
@@ -85,18 +85,40 @@ the VM. That's explained down below in the [Running Levanter GPT-2](#running-lev
 ## Running Levanter GPT-2
 Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts:
 
-### Launch a GPT-2 Small in unattended mode (using nohup)
+### Launch a GPT-2 Small in unattended mode
+
+You will need a [Docker installation](https://docs.docker.com/engine/install/)
+on your development machine to build and run images on TPUs.
+
+First create a configuration file for future launches in your Levanter directory:
+
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:  ...
+    WANDB_ENTITY: ...
+    WANDB_PROJECT: levanter
+    HF_TOKEN: ...
+
+docker_repository: levanter
+zone: us-west4-a
+tpu: test-tpu
+EOF
+```
+
+Everything after the `--` is run on each worker.
+
 ```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>'
+python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>'
 ```
 
-`launch.sh` will run the command in the background and redirect stdout and stderr to a log file in the home directory
-on each worker.
+`launch.py` will package your directory and create and deploy a Docker image  on each worker.
 
 ### Launch a GPT-2 Small in interactive mode
-This version writes to the terminal, you should use tmux or something for long running jobs for this version. It's mostly for debugging.
+
+To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long running jobs for this version. It's mostly for debugging.
 ```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>'
+python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>'
 ```
 
 ### Babysitting Script
@@ -113,11 +135,12 @@ You can run it like this:
 
 ```bash
 infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible]  -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml
+    python infra/launch.py -- levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml
 ```
 
-That `--` is important! It separates the spin up args from the running args. Also, you should never use `launch.sh`
-with `babysit`, because nohup exits immediately with exit code 0.
+That `--` is important! It separates the spin up args from the running args.
+Also you should always use `--foregrouund` with `babysit-tpu-vm`, as the
+background mode will always return immediately.
 
 ### Running your own config
 
@@ -132,7 +155,7 @@ Afterward, you can use the config directly from the TPU VM instance, e.g.:
 
 ```bash
 infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible] -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
+    python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
     --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
index c14b0ba66..cac96de31 100644
--- a/docs/Training-On-Your-Data.md
+++ b/docs/Training-On-Your-Data.md
@@ -395,8 +395,23 @@ bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
 
 This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
+
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:  ...
+    WANDB_ENTITY: ...
+    WANDB_PROJECT: levanter
+    HF_TOKEN: ...
+
+docker_repository: levanter
+zone: us-west4-a
+tpu: test-tpu
+EOF
+```
+
 ```bash
-gcloud compute tpus tpu-vm ssh my-tpu   --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml"
+python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml"
 ```
 
 ## Monitoring
diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
index 235b2e79b..bdab91c43 100644
--- a/docs/tutorials/Training-On-Audio-Data.md
+++ b/docs/tutorials/Training-On-Audio-Data.md
@@ -189,7 +189,7 @@ bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
 This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
 ```bash
-gcloud compute tpus tpu-vm ssh my-tpu   --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml"
+python infra/launch.py -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml"
 ```
 
 ### GPU
diff --git a/infra/__init__.py b/infra/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py
new file mode 100644
index 000000000..8b7cf9ed2
--- /dev/null
+++ b/infra/helpers/cli.py
@@ -0,0 +1,80 @@
+import argparse
+import os
+import subprocess
+import typing
+
+from google.cloud import storage
+import yaml
+
+
+def run_command(*args, **kwargs):
+    print("Running:", " ".join(list(args)))
+    return subprocess.check_call(args, **kwargs)
+
+
+def add_ssh_key(ssh_key_filename):
+    # format 3072 SHA256:... key-name (RSA)
+    key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1]
+    existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n")
+    for key in existing_keys:
+        if key_hash in key:
+            print('Found existing key in ssh-agent, skipping "ssh-add"')
+            return
+
+    subprocess.check_call(["ssh-add", ssh_key_filename])
+
+
+def tpu_ssh(tpu_name, zone, *args, ignore_failure=False):
+    add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine"))
+    try:
+        return run_command(
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "ssh",
+            tpu_name,
+            "--worker=all",
+            f"--zone={zone}",
+            "--command=%s" % " ".join(args),
+        )
+    except subprocess.CalledProcessError as e:
+        if ignore_failure:
+            print("Ignoring failure:", e)
+        else:
+            raise
+
+
+# Oddly enough, there's no API to simply fetch the current gcloud configuration...
+def gcloud_config():
+    client = storage.Client()
+    return {
+        "project": client.project,
+    }
+
+
+def add_arg(
+    parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
+):
+    """Add an argument to the parser, using `config` or the environment to resolve default values."""
+    key = flags[0].lstrip("-").replace("-", "_")
+    if key in config:
+        default = config[key]
+
+    if key.upper() in os.environ:
+        default = os.environ[key.upper()]
+
+    if default is not None:
+        kw["default"] = default
+    elif required:
+        kw["required"] = True
+
+    parser.add_argument(*flags, **kw)
+
+
+def load_config():
+    if os.path.exists(".config"):
+        return yaml.load(open(".config", "r"), Loader=yaml.SafeLoader)
+    else:
+        return {}
diff --git a/infra/launch.py b/infra/launch.py
new file mode 100755
index 000000000..6cd3c1e9e
--- /dev/null
+++ b/infra/launch.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python
+
+import argparse
+import getpass
+import subprocess
+import time
+
+from infra import push_docker
+
+from infra.helpers import cli
+
+
+def setup_vm_docker(tpu_name, zone, docker_base_image):
+    cli.tpu_ssh(
+        tpu_name,
+        zone,
+        "sudo",
+        "usermod",
+        "-aG",
+        "docker",
+        getpass.getuser(),
+    )
+
+    cli.tpu_ssh(
+        tpu_name,
+        zone,
+        "docker",
+        "pull",
+        docker_base_image,
+    )
+
+    cli.tpu_ssh(tpu_name, zone, "docker", "volume", "create", "--driver=local", "levanter")
+
+
+def list_tpus(zone):
+    tpus = subprocess.check_output(
+        [
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "list",
+        ]
+    )
+    rows = tpus.decode("utf-8").split("\n")
+    header = rows[0].split()
+    tpus = []
+    for row in rows[1:]:
+        if row:
+            tpus.append(dict(zip(header, row.split())))
+    return tpus
+
+
+def start_tpu_vm(
+    tpu_name, *, tpu_type, preemptible, version, zone, autodelete, project, docker_repository, docker_base_image
+):
+    tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)])
+    if tpu_exists:
+        if not autodelete:
+            print("TPU already exists and autodelete is false, leaving it as is.")
+            return
+
+        print("TPU already exists, deleting...")
+        cli.run_command(
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "delete",
+            "--quiet",
+            f"--zone={zone}",
+            tpu_name,
+        )
+
+    print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...")
+    cli.run_command(
+        "gcloud",
+        "alpha",
+        "compute",
+        "tpus",
+        "tpu-vm",
+        "create",
+        tpu_name,
+        f"--accelerator-type={tpu_type}",
+        f"--version={version}",
+        "--zone=" + zone,
+        "--preemptible" if preemptible else "",
+        "--quiet",
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    config = cli.load_config()
+
+    cli.add_arg(parser, config, ["--autodelete"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/rjpower/levanter:latest")
+    cli.add_arg(parser, config, ["--docker_repository"], default="levanter")
+    cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}")
+    cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
+    cli.add_arg(parser, config, ["--tpu"], required=True)
+    cli.add_arg(parser, config, ["--tpu_type"])
+    cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base")
+    cli.add_arg(parser, config, ["--zone"], required=True)
+    cli.add_arg(parser, config, ["--retries"], default=0, type=int)
+
+    parser.add_argument(
+        "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items()
+    )
+    parser.add_argument("command", nargs=argparse.REMAINDER)
+
+    args = parser.parse_args()
+
+    autodelete = args.autodelete
+    command = args.command
+    docker_base_image = args.docker_base_image
+    docker_repository = args.docker_repository
+    foreground = args.foreground
+    image_id = args.image_name
+    preemptible = args.preemptible
+    project = args.project
+    if args.retries < 0:
+        retries = 10000000
+    else:
+        retries = args.retries
+    tpu_name = args.tpu
+    tpu_type = args.tpu_type
+    version = args.version
+    zone = args.zone
+
+    region = "-".join(zone.split("-")[:-1])
+    env = {k: v for k, v in args.env}
+
+    if "WANDB_PROJECT" not in env:
+        env["WANDB_PROJECT"] = "levanter"
+
+    if command[0] == "--":
+        command = command[1:]
+
+    for i in range(retries + 1):
+        try:
+            start_tpu_vm(
+                tpu_name=tpu_name,
+                tpu_type=tpu_type,
+                preemptible=preemptible,
+                version=version,
+                zone=zone,
+                autodelete=autodelete,
+                project=project,
+                docker_repository=docker_repository,
+                docker_base_image=docker_base_image,
+            )
+
+            setup_vm_docker(
+                tpu_name=tpu_name,
+                zone=zone,
+                docker_base_image=docker_base_image,
+            )
+
+            # make an image tag based on the unix timestamp to ensure we always pull the latest image
+            tag = run_id = int(time.time())
+
+            full_image_id = push_docker.push_to_gcp(
+                project_id=project,
+                region=region,
+                repository=docker_repository,
+                image_name=image_id,
+                tag=tag,
+                docker_file="docker/tpu/Dockerfile.incremental",
+            )
+
+            git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+
+            cli.tpu_ssh(tpu_name, zone, "docker", "stop", "levanter", "-t", "1", ignore_failure=True)
+            cli.tpu_ssh(tpu_name, zone, "docker", "rm", "-f", "levanter", ignore_failure=True)
+
+            docker_command = [
+                "docker",
+                "run",
+                "-t" if foreground else "-d",
+                "--name=levanter",
+                "--privileged",
+                "--shm-size=32gb",
+                "--net=host",
+                "--init",
+                "--mount",
+                "type=volume,source=levanter,target=/home/levanter",
+                "-v",
+                "/tmp:/tmp",
+                "-e",
+                f"WANDB_DOCKER={image_id}",
+                "-e",
+                f"GIT_COMMIT={git_commit}",
+                "-e",
+                f"RUN_ID={run_id}",
+            ]
+
+            for k, v in env.items():
+                docker_command.extend(["-e", k + f"='{str(v)}'"])
+
+            docker_command.extend([full_image_id, " ".join(command)])
+
+            print(f"Running on tpu_name... {tpu_name}")
+            cli.tpu_ssh(tpu_name, zone, *docker_command)
+        except subprocess.CalledProcessError as e:
+            print("Error running command.")
+            if i < retries - 1:
+                print("Retrying... %d/%d" % (i + 1, retries))
diff --git a/infra/push_docker.py b/infra/push_docker.py
new file mode 100644
index 000000000..b16b23ac2
--- /dev/null
+++ b/infra/push_docker.py
@@ -0,0 +1,210 @@
+#!/usr/bin/python
+
+"""
+Build and deploy the Levanter base image to Artifact Registry or Docker Hub.
+
+It is not necessary to run this yourself unless you are deploying a new base image: the launch
+script will automatically build and deploy an image based on your current code.
+"""
+
+import argparse
+from calendar import c
+import json
+import os
+import subprocess
+
+from infra.helpers import cli
+
+GCP_CLEANUP_POLICY = [
+    {
+        "name": "delete-stale",
+        "action": {"type": "Delete"},
+        "condition": {
+            "olderThan": "86400s",
+            "tagState": "ANY",
+        },
+    },
+    {
+        "name": "keep-latest",
+        "action": {"type": "Keep"},
+        "mostRecentVersions": {
+            "keepCount": 5,
+        },
+    },
+]
+
+
+def _run(*args, **kw):
+    print("Running ", " ".join(args[0]))
+    return subprocess.check_output(*args, **kw)
+
+
+def configure_gcp_docker(project_id, region, repository):
+    """Setup Artifact registry repository and configure permissions to enable TPU access."""
+    # check if the repository already exists
+    try:
+        _run(["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository])
+        return
+    except subprocess.CalledProcessError as e:
+        if "NOT_FOUND" not in e.output:
+            raise
+
+    # Activate artifact registry and setup the repository.
+    _run(["gcloud", "services", "enable", "artifactregistry.googleapis.com"])
+
+    try:
+        _run(
+            [
+                "gcloud",
+                "artifacts",
+                "repositories",
+                "create",
+                repository,
+                f"--location={region}",
+                "--repository-format=docker",
+            ],
+            stderr=subprocess.STDOUT,
+        )
+    except subprocess.CalledProcessError as e:
+        # Ignore error if repository already exists.
+        if b"ALREADY_EXISTS" not in e.output:
+            print("Error creating repository: ", e.output)
+            raise
+
+    with open("/tmp/cleanup-policy.json", "w") as f:
+        json.dump(GCP_CLEANUP_POLICY, f, indent=2)
+
+    _run(
+        [
+            "gcloud",
+            "artifacts",
+            "repositories",
+            "set-cleanup-policies",
+            f"--location={region}",
+            "--policy=/tmp/cleanup-policy.json",
+            repository,
+        ]
+    )
+
+    # Grant public read access ('allUsers') for TPU VMs
+    _run(
+        [
+            "gcloud",
+            "artifacts",
+            "repositories",
+            "add-iam-policy-binding",
+            "--member=allUsers",
+            "--role=roles/artifactregistry.reader",
+            f"--location={region}",
+            repository,
+        ]
+    )
+
+    _run(
+        [
+            "gcloud",
+            "--project",
+            project_id,
+            "artifacts",
+            "repositories",
+            "add-iam-policy-binding",
+            repository,
+            "--location",
+            region,
+            "--member",
+            "allUsers",
+            "--role",
+            "roles/artifactregistry.reader",
+        ]
+    )
+
+    _run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"])
+
+
+def build_docker(docker_file, image_name, tag) -> str:
+    """Builds a Docker image, enables artifact access, and pushes to Artifact Registry."""
+
+    _run(
+        [
+            "docker",
+            "buildx",
+            "build",
+            "--platform=linux/amd64",
+            "-t",
+            f"{image_name}:{tag}",
+            "-f",
+            docker_file,
+            ".",
+        ]
+    )
+
+    return f"{image_name}:{tag}"
+
+
+# Disabled until we can figure out how Docker hub organizations work
+def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None):
+    """Pushes a local Docker image to Docker Hub."""
+
+    # Authenticate the docker service with Github if a token exists
+    if github_token:
+        login_process = subprocess.Popen(
+            ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE
+        )
+        print(login_process.communicate(input=github_token.encode(), timeout=10))
+
+    remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}"
+    local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag)
+
+    _run(["docker", "tag", local_name, remote_name])
+    _run(["docker", "push", remote_name])
+    return remote_name
+
+
+def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> str:
+    """Pushes a local Docker image to Artifact Registry."""
+    configure_gcp_docker(project_id, region, repository)
+    local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag)
+
+    artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"
+
+    full_image_name = f"{artifact_repo}/{image_name}:{tag}"
+    _run(["docker", "tag", local_image, full_image_name])
+    _run(["docker", "push", full_image_name])
+
+    return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{tag}"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.")
+    config = cli.load_config()
+    cli.add_arg(parser, config, ["--project"], help="GCP project ID")
+    cli.add_arg(parser, config, ["--region"], help="Artifact Registry region (e.g., us-west4)")
+    cli.add_arg(parser, config, ["--repository"], default="levanter", help="Artifact Registry repository name")
+    cli.add_arg(parser, config, ["--image"], default="levanter", help="Docker image name.")
+    cli.add_arg(parser, config, ["--tag"], default="latest", help="Docker image tag.")
+    cli.add_arg(parser, config, ["--github_user"], default=None, help="Github user name.")
+    cli.add_arg(parser, config, ["--github_token"], default=None, help="Github token.")
+    cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.")
+
+    # push to either github or GCP
+    cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True)
+
+    args = parser.parse_args()
+
+    if args.docker_target == "github":
+        assert args.github_user, "Must specify --github_user when pushing to Github"
+        assert args.github_token, "Must specify --github_token when pushing to Github"
+        push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file)
+    else:
+        assert args.region, "Must specify --region when pushing to GCP"
+        assert args.project, "Must specify --project when pushing to GCP"
+        assert args.repository, "Must specify --repository when pushing to GCP"
+
+        push_to_gcp(
+            args.project,
+            args.region,
+            args.repository,
+            args.image,
+            args.tag,
+            docker_file=args.docker_file,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index 60e44e15b..527f88f7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,8 +67,6 @@ dev-mode-dirs = [".", "src"]
 [tool.hatch.metadata]
 allow-direct-references = true
 
-
-
 [tool.hatch.build.targets.wheel]
 packages = ["levanter"]
 
@@ -109,3 +107,13 @@ markers = [
     "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')",
     "ray: marks tests that require Ray (deselect with '-m \"not ray\"')",
 ]
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+    "flake8",
+    "pytest",
+    "soundfile",
+    "librosa",
+    "pytest-forked"
+]
\ No newline at end of file
diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py
index 9d41e935a..c98c0727c 100644
--- a/src/levanter/tracker/wandb.py
+++ b/src/levanter/tracker/wandb.py
@@ -208,6 +208,9 @@ def _git_settings(self):
         return other_settings
 
     def _get_git_sha(self, code_dir) -> Optional[str]:
+        if "GIT_COMMIT" in os.environ:
+            return os.environ["GIT_COMMIT"]
+
         try:
             repo = Repo(code_dir)
             git_sha = repo.head.commit.hexsha

From d4a1d4fb9d9fb0b4cd7bd8004ea38955883e6e1b Mon Sep 17 00:00:00 2001
From: David Hall <dlwh@stanford.edu>
Date: Tue, 11 Jun 2024 00:02:38 -0700
Subject: [PATCH 2/7] Fix gcloud TPU list/delete/create args; drop unused imports

---
 infra/launch.py      | 13 ++++++++-----
 infra/push_docker.py |  5 ++---
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/infra/launch.py b/infra/launch.py
index 6cd3c1e9e..6b465f700 100755
--- a/infra/launch.py
+++ b/infra/launch.py
@@ -6,7 +6,6 @@
 import time
 
 from infra import push_docker
-
 from infra.helpers import cli
 
 
@@ -41,6 +40,7 @@ def list_tpus(zone):
             "tpus",
             "tpu-vm",
             "list",
+            "--zone=" + zone,
         ]
     )
     rows = tpus.decode("utf-8").split("\n")
@@ -67,6 +67,7 @@ def start_tpu_vm(
             "alpha",
             "compute",
             "tpus",
+            "tpu-vm",
             "delete",
             "--quiet",
             f"--zone={zone}",
@@ -74,7 +75,7 @@ def start_tpu_vm(
         )
 
     print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...")
-    cli.run_command(
+    command = [
         "gcloud",
         "alpha",
         "compute",
@@ -85,9 +86,11 @@ def start_tpu_vm(
         f"--accelerator-type={tpu_type}",
         f"--version={version}",
         "--zone=" + zone,
-        "--preemptible" if preemptible else "",
         "--quiet",
-    )
+    ]
+    if preemptible:
+        command.append("--preemptible")
+    cli.run_command(*command)
 
 
 if __name__ == "__main__":
@@ -205,7 +208,7 @@ def start_tpu_vm(
 
             print(f"Running on tpu_name... {tpu_name}")
             cli.tpu_ssh(tpu_name, zone, *docker_command)
-        except subprocess.CalledProcessError as e:
+        except subprocess.CalledProcessError as e:  # noqa: F841
             print("Error running command.")
             if i < retries - 1:
                 print("Retrying... %d/%d" % (i + 1, retries))
diff --git a/infra/push_docker.py b/infra/push_docker.py
index b16b23ac2..a85b64f18 100644
--- a/infra/push_docker.py
+++ b/infra/push_docker.py
@@ -8,13 +8,12 @@
 """
 
 import argparse
-from calendar import c
 import json
-import os
 import subprocess
 
 from infra.helpers import cli
 
+
 GCP_CLEANUP_POLICY = [
     {
         "name": "delete-stale",
@@ -46,7 +45,7 @@ def configure_gcp_docker(project_id, region, repository):
         _run(["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository])
         return
     except subprocess.CalledProcessError as e:
-        if "NOT_FOUND" not in e.output:
+        if b"NOT_FOUND" not in e.stderr:
             raise
 
     # Activate artifact registry and setup the repository.

From bc861c07e5efe69c9491382d8ef752409bfb0688 Mon Sep 17 00:00:00 2001
From: David Hall <dlwh@stanford.edu>
Date: Tue, 11 Jun 2024 00:12:05 -0700
Subject: [PATCH 3/7] Capture stderr when checking whether the repository exists

---
 infra/push_docker.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/infra/push_docker.py b/infra/push_docker.py
index a85b64f18..e3e416290 100644
--- a/infra/push_docker.py
+++ b/infra/push_docker.py
@@ -42,10 +42,13 @@ def configure_gcp_docker(project_id, region, repository):
     """Setup Artifact registry repository and configure permissions to enable TPU access."""
     # check if the repository already exists
     try:
-        _run(["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository])
+        _run(
+            ["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository],
+            stderr=subprocess.STDOUT,
+        )
         return
     except subprocess.CalledProcessError as e:
-        if b"NOT_FOUND" not in e.stderr:
+        if b"NOT_FOUND" not in e.output:
             raise
 
     # Activate artifact registry and setup the repository.

From 20999304c3975c61db7e68cfd910214f9825a499 Mon Sep 17 00:00:00 2001
From: Russell Power <russell.power@gmail.com>
Date: Tue, 11 Jun 2024 08:27:13 -0700
Subject: [PATCH 4/7] Adjust docs to reflect new config format and cleanup a
 few flags.

---
 .github/workflows/tpu_unit_tests.yaml    |  6 ++-
 docs/Getting-Started-TPU-VM.md           | 54 ++++++++++++------------
 docs/Training-On-Your-Data.md            | 20 ++++++---
 docs/tutorials/Training-On-Audio-Data.md | 28 +++++++++---
 infra/launch.py                          | 20 ++++-----
 5 files changed, 76 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml
index 4f3eaccb3..3e27426eb 100644
--- a/.github/workflows/tpu_unit_tests.yaml
+++ b/.github/workflows/tpu_unit_tests.yaml
@@ -31,12 +31,14 @@ jobs:
           export TPU_NAME=ci-run-${{ github.run_id }}
           eval "$(ssh-agent -s)"
           TRUE_SHA=${{ github.event.pull_request.head.sha }}
-          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1
+          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1
+#          infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \
+#            PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry"
 
       - name: Run most tests
         run: |
           export TPU_NAME=ci-run-${{ github.run_id }}
-          python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry"
+          gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'"
 # Something's wrong with these
 #
 #      - name: Run forked tests
diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
index 1d2a50945..8fc8f98ee 100644
--- a/docs/Getting-Started-TPU-VM.md
+++ b/docs/Getting-Started-TPU-VM.md
@@ -95,25 +95,32 @@ First create a configuration file for future launches in your Levanter directory
 ```
 cat > .config <<EOF
 env:
-    WANDB_API_KEY:  ...
-    WANDB_ENTITY: ...
-    WANDB_PROJECT: levanter
-    HF_TOKEN: ...
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
 
 docker_repository: levanter
 zone: us-west4-a
-tpu: test-tpu
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
 EOF
 ```
 
-Everything after the `--` is run on each worker.
+Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker.
 
 ```bash
 python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>'
 ```
 
-`launch.py` will package your directory and create and deploy a Docker image  on each worker.
-
 ### Launch a GPT-2 Small in interactive mode
 
 To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long running jobs for this version. It's mostly for debugging.
@@ -124,18 +131,11 @@ python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config
 ### Babysitting Script
 
 If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates
-the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. The babysitting
-script handles both the creation of the node and the running of a job, and also relaunches the TPU VM if it gets preempted.
-It keeps running the command (and relaunching) until the command exits successfully.
-
-Note that the babysitting-script will automatically set the `RUN_ID` environment variable if not set, and pass it to the
-training command. This ensures that restarted jobs have the same run id, which is important for resumes to work.
-
-You can run it like this:
+the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. You can run `launch.py` with the `--retries` and `--foreground` parameters to accomplish this. If `--retries` is greater than 1, `launch.py` will automatically attempt to re-create the VM and re-run the command if it fails. (`--foreground` is necessary to keep the script from returning immediately.)
 
 ```bash
-infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible]  -- \
-    python infra/launch.py -- levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml
+    python infra/launch.py --retries=100 --foreground --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
+    --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
 That `--` is important! It separates the spin up args from the running args.
@@ -144,28 +144,26 @@ background mode will always return immediately.
 
 ### Running your own config
 
-If you want to run your own config, we suggest you start from one of the existing configs. Then, if you're not using
-an NFS server or similar, you should upload your config to GCS:
+If you want to run your own config, we suggest you start from one of the existing configs. Just copy it to
+a new file:
+
+`cp config/gpt2_small.yaml config/my_config.yaml`
+
+If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you
+can just reference the local config path in your command line:
 
-```bash
-gsutil cp my_config.yaml gs://my_bucket//my_config.yaml
 ```
 
 Afterward, you can use the config directly from the TPU VM instance, e.g.:
 
 ```bash
-infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible] -- \
-    python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
+    python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
     --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
-The `--config_path` argument can be a local path, a GCS path, or any URL loadable by fsspec.
 With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically
 try to load the latest checkpoint if it exists.
 
-Tokenizers are also loaded via fsspec, so you can use the same trick to load them from GCS if you have a custom
-tokenizer, or you can use an HF tokenizer.
-
 ## Common Issues
 ### (CRFM) Permission denied on `/files`
 
diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
index cac96de31..11aba38b8 100644
--- a/docs/Training-On-Your-Data.md
+++ b/docs/Training-On-Your-Data.md
@@ -398,15 +398,25 @@ This will spin up a TPU VM instance and install Levanter on it. You can then run
 
 ```
 cat > .config <<EOF
+cat > .config <<EOF
 env:
-    WANDB_API_KEY:  ...
-    WANDB_ENTITY: ...
-    WANDB_PROJECT: levanter
-    HF_TOKEN: ...
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
 
 docker_repository: levanter
 zone: us-west4-a
-tpu: test-tpu
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
 EOF
 ```
 
diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
index bdab91c43..fab6e7d4f 100644
--- a/docs/tutorials/Training-On-Audio-Data.md
+++ b/docs/tutorials/Training-On-Audio-Data.md
@@ -179,16 +179,30 @@ infra/babysit-tpu-vm my-tpu -z us-east1-d -t v3-128 -- \
 
 #### Spin up and manual launch
 
-You should probably use the automated setup script, as described in the [relevant section of the TPU guide](../Getting-Started-TPU-VM.md#automatic-setup).
-Here's what that looks like:
+You can start up a TPU VM and launch your instance with `launch.py`. To simplify your command for multiple launches, you can put common parameters into `.config` in your `levanter` directory:
+
+cat > .config <<EOF
+env:
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+EOF
 
 ```bash
-bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
-```
-
-This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
-```bash
 python infra/launch.py -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml"
 ```
 
diff --git a/infra/launch.py b/infra/launch.py
index 6b465f700..3eae2b23a 100755
--- a/infra/launch.py
+++ b/infra/launch.py
@@ -10,6 +10,7 @@
 
 
 def setup_vm_docker(tpu_name, zone, docker_base_image):
+    """Change docker permissions on `tpu_name` and setup the cache volume."""
     cli.tpu_ssh(
         tpu_name,
         zone,
@@ -52,9 +53,7 @@ def list_tpus(zone):
     return tpus
 
 
-def start_tpu_vm(
-    tpu_name, *, tpu_type, preemptible, version, zone, autodelete, project, docker_repository, docker_base_image
-):
+def start_tpu_vm(tpu_name, *, tpu_type, preemptible, version, zone, autodelete):
     tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)])
     if tpu_exists:
         if not autodelete:
@@ -104,11 +103,12 @@ def start_tpu_vm(
     cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}")
     cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true")
     cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
-    cli.add_arg(parser, config, ["--tpu"], required=True)
-    cli.add_arg(parser, config, ["--tpu_type"])
+    cli.add_arg(parser, config, ["--tpu_name"], required=True)
+    cli.add_arg(parser, config, ["--tpu_type"], required=True)
     cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base")
     cli.add_arg(parser, config, ["--zone"], required=True)
     cli.add_arg(parser, config, ["--retries"], default=0, type=int)
+    cli.add_arg(parser, config, ["--run_id"], default=int(time.time()), type=int)
 
     parser.add_argument(
         "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items()
@@ -129,10 +129,11 @@ def start_tpu_vm(
         retries = 10000000
     else:
         retries = args.retries
-    tpu_name = args.tpu
+    tpu_name = args.tpu_name
     tpu_type = args.tpu_type
     version = args.version
     zone = args.zone
+    run_id = args.run_id
 
     region = "-".join(zone.split("-")[:-1])
     env = {k: v for k, v in args.env}
@@ -152,11 +153,10 @@ def start_tpu_vm(
                 version=version,
                 zone=zone,
                 autodelete=autodelete,
-                project=project,
-                docker_repository=docker_repository,
-                docker_base_image=docker_base_image,
             )
 
+            # We don't technically need to set up on every run, but if we are working on a
+            # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work.
             setup_vm_docker(
                 tpu_name=tpu_name,
                 zone=zone,
@@ -164,7 +164,7 @@ def start_tpu_vm(
             )
 
             # make an image tag based on the unix timestamp to ensure we always pull the latest image
-            tag = run_id = int(time.time())
+            tag = int(time.time())
 
             full_image_id = push_docker.push_to_gcp(
                 project_id=project,

From 1929ac274055563f6400e87ef4c09af9dfb8d2f7 Mon Sep 17 00:00:00 2001
From: Russell Power <russell.power@gmail.com>
Date: Tue, 11 Jun 2024 09:59:39 -0700
Subject: [PATCH 5/7] Tiny doc cleanups.

---
 docker/tpu/Dockerfile.incremental        | 1 +
 docs/Training-On-Your-Data.md            | 3 +--
 docs/tutorials/Training-On-Audio-Data.md | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental
index 10afb1ca7..8a6c8f5c8 100644
--- a/docker/tpu/Dockerfile.incremental
+++ b/docker/tpu/Dockerfile.incremental
@@ -14,4 +14,5 @@ WORKDIR /opt/levanter
 
 ADD pyproject.toml README.md /opt/levanter/
 RUN pip install -e '.[test]'
+RUN pip install librosa soundfile
 ADD . /opt/levanter
\ No newline at end of file
diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
index 11aba38b8..7361d1d7f 100644
--- a/docs/Training-On-Your-Data.md
+++ b/docs/Training-On-Your-Data.md
@@ -410,7 +410,6 @@ env:
 
 docker_repository: levanter
 zone: us-west4-a
-tpu_name: test-spin-up-32
 tpu_type: "v5litepod-16"
 vm_image: "tpu-ubuntu2204-base"
 preemptible: true
@@ -421,7 +420,7 @@ EOF
 ```
 
 ```bash
-python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml"
+python infra/launch.py --tpu_name=my_tpu -- python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml
 ```
 
 ## Monitoring
diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
index fab6e7d4f..f57b9a06f 100644
--- a/docs/tutorials/Training-On-Audio-Data.md
+++ b/docs/tutorials/Training-On-Audio-Data.md
@@ -193,7 +193,6 @@ env:
 
 docker_repository: levanter
 zone: us-west4-a
-tpu_name: test-spin-up-32
 tpu_type: "v5litepod-16"
 vm_image: "tpu-ubuntu2204-base"
 preemptible: true
@@ -203,7 +202,7 @@ EOF
 
 ```bash
 
-python infra/launch.py -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml"
+python infra/launch.py --tpu_name=my_tpu -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml
 ```
 
 ### GPU

From 22d1a64100dcea425fe6d9c242d19832622790ae Mon Sep 17 00:00:00 2001
From: Russell Power <russell.power@gmail.com>
Date: Tue, 11 Jun 2024 14:51:38 -0700
Subject: [PATCH 6/7] Add back tokenizer documentation.

---
 docs/Getting-Started-TPU-VM.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
index 8fc8f98ee..6f92d3f38 100644
--- a/docs/Getting-Started-TPU-VM.md
+++ b/docs/Getting-Started-TPU-VM.md
@@ -164,6 +164,10 @@ Afterward, you can use the config directly from the TPU VM instance, e.g.:
 With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically
 try to load the latest checkpoint if it exists.
 
+Tokenizers and configuration files are loaded via `fsspec`, which supports remote
+filesystems, so you can also copy your tokenizer or config file to GCS and use
+a `gs://` path to access it.
+
 ## Common Issues
 ### (CRFM) Permission denied on `/files`
 

From 911fbbc5796d4f1313bdf02a32f56e12c032714c Mon Sep 17 00:00:00 2001
From: Russell Power <russell.power@gmail.com>
Date: Tue, 11 Jun 2024 15:08:57 -0700
Subject: [PATCH 7/7] Fix doc typo, cleanup base dependency installation.

---
 docker/tpu/Dockerfile.base        | 9 +++++----
 docker/tpu/Dockerfile.incremental | 1 -
 docs/Training-On-Your-Data.md     | 1 -
 infra/helpers/cli.py              | 1 -
 infra/launch.py                   | 8 --------
 infra/push_docker.py              | 1 +
 6 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base
index 9a93736b1..b9b6106ab 100644
--- a/docker/tpu/Dockerfile.base
+++ b/docker/tpu/Dockerfile.base
@@ -4,12 +4,13 @@ RUN pip install virtualenv
 
 # venv binaries encode their directory, so we need to setup the venv in the final location
 RUN virtualenv -p python3.10 /opt/levanter/.venv
-RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+ENV PATH /opt/levanter/.venv/bin:$PATH
+RUN /opt/levanter/.venv/bin/pip install -U hatch "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
 
-# Add only the requirements files to cache dependency build/installation
-WORKDIR /tmp
+# Install package dependencies to make incremental builds faster.
+WORKDIR /tmp/
 ADD pyproject.toml README.md /tmp/
-RUN /opt/levanter/.venv/bin/pip install -e '.[test]'
+RUN pip install $(hatch dep show requirements --all)
 
 FROM python:3.10
 
diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental
index 8a6c8f5c8..10afb1ca7 100644
--- a/docker/tpu/Dockerfile.incremental
+++ b/docker/tpu/Dockerfile.incremental
@@ -14,5 +14,4 @@ WORKDIR /opt/levanter
 
 ADD pyproject.toml README.md /opt/levanter/
 RUN pip install -e '.[test]'
-RUN pip install librosa soundfile
 ADD . /opt/levanter
\ No newline at end of file
diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
index 7361d1d7f..9879306dc 100644
--- a/docs/Training-On-Your-Data.md
+++ b/docs/Training-On-Your-Data.md
@@ -398,7 +398,6 @@ This will spin up a TPU VM instance and install Levanter on it. You can then run
 
 ```
 cat > .config <<EOF
-cat > .config <<EOF
 env:
     WANDB_API_KEY: 
     WANDB_ENTITY: 
diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py
index 8b7cf9ed2..7a2f61574 100644
--- a/infra/helpers/cli.py
+++ b/infra/helpers/cli.py
@@ -18,7 +18,6 @@ def add_ssh_key(ssh_key_filename):
     existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n")
     for key in existing_keys:
         if key_hash in key:
-            print('Found existing key in ssh-agent, skipping "ssh-add"')
             return
 
     subprocess.check_call(["ssh-add", ssh_key_filename])
diff --git a/infra/launch.py b/infra/launch.py
index 3eae2b23a..a1fed90c1 100755
--- a/infra/launch.py
+++ b/infra/launch.py
@@ -21,14 +21,6 @@ def setup_vm_docker(tpu_name, zone, docker_base_image):
         getpass.getuser(),
     )
 
-    cli.tpu_ssh(
-        tpu_name,
-        zone,
-        "docker",
-        "pull",
-        docker_base_image,
-    )
-
     cli.tpu_ssh(tpu_name, zone, "docker", "volume", "create", "--driver=local", "levanter")
 
 
diff --git a/infra/push_docker.py b/infra/push_docker.py
index e3e416290..a0712ff85 100644
--- a/infra/push_docker.py
+++ b/infra/push_docker.py
@@ -46,6 +46,7 @@ def configure_gcp_docker(project_id, region, repository):
             ["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository],
             stderr=subprocess.STDOUT,
         )
+        print(f"Found existing artifact registry repository `{repository}`, skipping setup.")
         return
     except subprocess.CalledProcessError as e:
         if b"NOT_FOUND" not in e.output: