diff --git a/.dockerignore b/.dockerignore
index f5ceb7397..17fbbcfe1 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,5 @@
+.git
+
 scratch
 cache
 wandb
@@ -44,6 +46,7 @@ instance/
 # Sphinx documentation
 docs/_build/
+docs/figures/
 
 # PyBuilder
 target/
@@ -105,7 +108,6 @@ dmypy.json
 # JetBrains
 .idea/
-
 # dataset cache files
 **/*.parquet
 **/ledger.json
diff --git a/.gitignore b/.gitignore
index c66f6f352..835da2048 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 /scratch
 
+# Configuration for TPU launches/secrets
+.config
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -140,6 +143,7 @@ dmypy.json
 /wandb
 
 # dataset cache files
+/cache
 *.parquet
 ledger.json
diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base
new file mode 100644
index 000000000..b9b6106ab
--- /dev/null
+++ b/docker/tpu/Dockerfile.base
@@ -0,0 +1,18 @@
+FROM python:3.10 AS build
+RUN apt-get update && apt-get install -y clang
+RUN pip install virtualenv
+
+# venv binaries encode their directory, so we need to set up the venv in its final location
+RUN virtualenv -p python3.10 /opt/levanter/.venv
+ENV PATH /opt/levanter/.venv/bin:$PATH
+RUN /opt/levanter/.venv/bin/pip install -U hatch "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+
+# Install package dependencies to make incremental builds faster.
+WORKDIR /tmp/
+ADD pyproject.toml README.md /tmp/
+RUN pip install $(hatch dep show requirements --all)
+
+FROM python:3.10
+
+WORKDIR /opt/levanter
+COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
\ No newline at end of file
diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental
new file mode 100644
index 000000000..10afb1ca7
--- /dev/null
+++ b/docker/tpu/Dockerfile.incremental
@@ -0,0 +1,17 @@
+ARG IMAGE=ghcr.io/rjpower/levanter
+ARG TAG=latest
+
+FROM ${IMAGE}:${TAG}
+
+ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
+    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
+    RAY_USAGE_STATS_ENABLED=0\
+    PATH=/opt/levanter/.venv/bin:$PATH\
+    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests\
+    HOME=/home/levanter
+
+WORKDIR /opt/levanter
+
+ADD pyproject.toml README.md /opt/levanter/
+RUN pip install -e '.[test]'
+ADD . /opt/levanter
\ No newline at end of file
diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
index fe73eef70..6f92d3f38 100644
--- a/docs/Getting-Started-TPU-VM.md
+++ b/docs/Getting-Started-TPU-VM.md
@@ -85,63 +85,88 @@ the VM. That's explained down below in the [Running Levanter GPT-2](#running-lev
 
 ## Running Levanter GPT-2
 Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts:
 
-### Launch a GPT-2 Small in unattended mode (using nohup)
-```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://'
+### Launch a GPT-2 Small in unattended mode
+
+You will need a [Docker installation](https://docs.docker.com/engine/install/)
+on your development machine to build and run images on TPUs.
+
+First, create a configuration file for future launches in your Levanter directory:
+
 ```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:
+    WANDB_ENTITY:
+    WANDB_PROJECT:
+    HF_TOKEN:
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
+EOF
+```
+
+Values in `.config` are defaults: a matching environment variable or an explicit `launch.py` flag overrides them.
+
+Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker.
 
-`launch.sh` will run the command in the background and redirect stdout and stderr to a log file in the home directory
-on each worker.
+```bash
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://
+```
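+
+You can also pass extra environment variables to the training command with `-e KEY VALUE`;
+these are merged with the `env` section of `.config`, with the command line taking
+precedence. For example (a sketch; it assumes you've exported `WANDB_API_KEY` locally):
+
+```bash
+python infra/launch.py -e WANDB_API_KEY $WANDB_API_KEY -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml
+```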
 
 ### Launch a GPT-2 Small in interactive mode
-This version writes to the terminal, you should use tmux or something for long running jobs for this version. It's mostly for debugging.
+
+To run in the foreground, pass `--foreground` to the `launch.py` script. You should use tmux or a similar tool for long-running jobs in this mode; it's mostly for debugging.
 
 ```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://'
+python infra/launch.py --foreground -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://
 ```
 
 ### Babysitting Script
 
 If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates
-the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. The babysitting
-script handles both the creation of the node and the running of a job, and also relaunches the TPU VM if it gets preempted.
-It keeps running the command (and relaunching) until the command exits successfully.
-
-Note that the babysitting-script will automatically set the `RUN_ID` environment variable if not set, and pass it to the
-training command. This ensures that restarted jobs have the same run id, which is important for resumes to work.
-
-You can run it like this:
+the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. You can run
+`launch.py` with the `--retries` and `--foreground` parameters to accomplish this. If `--retries` is greater than 0,
+`launch.py` will automatically attempt to re-create the VM and re-run the command if it fails. (`--foreground` is
+necessary to keep the script from returning immediately.)
 
 ```bash
-infra/babysit-tpu-vm -z <zone> -t <type> [--preemptible] -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml
+python infra/launch.py --retries=100 --foreground --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
+    --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
-That `--` is important! It separates the spin up args from the running args. Also, you should never use `launch.sh`
-with `babysit`, because nohup exits immediately with exit code 0.
+That `--` is important! It separates the spin-up args from the training command.
+Also, you should always combine `--retries` with `--foreground`: without it,
+`launch.py` starts the container in the background and returns immediately, so
+there is nothing to monitor or retry.
 ### Running your own config
 
-If you want to run your own config, we suggest you start from one of the existing configs. Then, if you're not using
-an NFS server or similar, you should upload your config to GCS:
+If you want to run your own config, we suggest you start from one of the existing configs. Just copy it to
+a new file:
+
+`cp config/gpt2_small.yaml config/my_config.yaml`
+
+If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you
+can just reference the local config path on your command line.
 
-```bash
-gsutil cp my_config.yaml gs://my_bucket//my_config.yaml
-```
 
 Afterward, you can use the config directly from the TPU VM instance, e.g.:
 
 ```bash
-infra/babysit-tpu-vm -z <zone> -t <type> [--preemptible] -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
     --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
-The `--config_path` argument can be a local path, a GCS path, or any URL loadable by fsspec.
 With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically try to load the latest checkpoint if it exists.
 
-Tokenizers are also loaded via fsspec, so you can use the same trick to load them from GCS if you have a custom
-tokenizer, or you can use an HF tokenizer.
+Tokenizers and configuration files are loaded via `fsspec`, which supports remote
+filesystems, so you can also copy your tokenizer or config file to GCS and use
+a `gs://` path to access it.
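+
+For example, a config stored on GCS (a sketch; the bucket path is a placeholder) can be referenced directly:
+
+```bash
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml
+```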
 
 ## Common Issues
 ### (CRFM) Permission denied on `/files`
diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
index c14b0ba66..9879306dc 100644
--- a/docs/Training-On-Your-Data.md
+++ b/docs/Training-On-Your-Data.md
@@ -395,8 +395,31 @@
 bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
 ```
 
 This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:
+    WANDB_ENTITY:
+    WANDB_PROJECT:
+    HF_TOKEN:
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
+EOF
+```
+
 ```bash
-gcloud compute tpus tpu-vm ssh my-tpu --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml"
+python infra/launch.py --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml
 ```
 
 ## Monitoring
diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
index 235b2e79b..f57b9a06f 100644
--- a/docs/tutorials/Training-On-Audio-Data.md
+++ b/docs/tutorials/Training-On-Audio-Data.md
@@ -179,17 +179,30 @@ infra/babysit-tpu-vm my-tpu -z us-east1-d -t v3-128 -- \
 
 #### Spin up and manual launch
 
-You should probably use the automated setup script, as described in the [relevant section of the TPU guide](../Getting-Started-TPU-VM.md#automatic-setup).
-Here's what that looks like:
+You can start up a TPU VM and launch your instance with `launch.py`. To simplify
+your command for multiple launches, you can put common parameters into `.config`
+in your `levanter` directory:
+
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:
+    WANDB_ENTITY:
+    WANDB_PROJECT:
+    HF_TOKEN:
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+EOF
+```
 
 ```bash
-bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
-```
-
-This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
-```bash
-gcloud compute tpus tpu-vm ssh my-tpu --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml"
+python infra/launch.py --tpu_name=my_tpu -- python src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml
 ```
 
 ### GPU
diff --git a/infra/__init__.py b/infra/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py
new file mode 100644
index 000000000..7a2f61574
--- /dev/null
+++ b/infra/helpers/cli.py
@@ -0,0 +1,79 @@
+import argparse
+import os
+import subprocess
+import typing
+
+from google.cloud import storage
+import yaml
+
+
+def run_command(*args, **kwargs):
+    print("Running:", " ".join(list(args)))
+    return subprocess.check_call(args, **kwargs)
+
+
+def add_ssh_key(ssh_key_filename):
+    # key format: `3072 SHA256:... key-name (RSA)`
+    key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1]
+    existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n")
+    for key in existing_keys:
+        if key_hash in key:
+            return
+
+    subprocess.check_call(["ssh-add", ssh_key_filename])
+
+
+def tpu_ssh(tpu_name, zone, *args, ignore_failure=False):
+    add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine"))
+    try:
+        return run_command(
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "ssh",
+            tpu_name,
+            "--worker=all",
+            f"--zone={zone}",
+            "--command=%s" % " ".join(args),
+        )
+    except subprocess.CalledProcessError as e:
+        if ignore_failure:
+            print("Ignoring failure:", e)
+        else:
+            raise
+
+
+# Oddly enough, there's no API to simply fetch the current gcloud configuration...
+def gcloud_config():
+    client = storage.Client()
+    return {
+        "project": client.project,
+    }
+
+
+def add_arg(
+    parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
+):
+    """Add an argument to the parser, using `config` or the environment to resolve default values."""
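+    # An explicit command-line flag wins, then a matching environment variable
+    # (the flag name upper-cased, e.g. ZONE for --zone), then the `.config` entry,
+    # then the built-in default. For example:
+    #     add_arg(parser, {"zone": "us-west4-a"}, ["--zone"])
+    # makes --zone default to us-west4-a unless ZONE is set or --zone is passed.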
+    key = flags[0].lstrip("-").replace("-", "_")
+    if key in config:
+        default = config[key]
+
+    if key.upper() in os.environ:
+        default = os.environ[key.upper()]
+
+    if default is not None:
+        kw["default"] = default
+    elif required:
+        kw["required"] = True
+
+    parser.add_argument(*flags, **kw)
+
+
+def load_config():
+    if os.path.exists(".config"):
+        return yaml.load(open(".config", "r"), Loader=yaml.SafeLoader)
+    else:
+        return {}
diff --git a/infra/launch.py b/infra/launch.py
new file mode 100755
index 000000000..a1fed90c1
--- /dev/null
+++ b/infra/launch.py
@@ -0,0 +1,206 @@
+#!/usr/bin/python
+
+import argparse
+import getpass
+import subprocess
+import time
+
+from infra import push_docker
+from infra.helpers import cli
+
+
+def setup_vm_docker(tpu_name, zone, docker_base_image):
+    """Change docker permissions on `tpu_name` and set up the `levanter` cache volume."""
+    cli.tpu_ssh(
+        tpu_name,
+        zone,
+        "sudo",
+        "usermod",
+        "-aG",
+        "docker",
+        getpass.getuser(),
+    )
+
+    cli.tpu_ssh(tpu_name, zone, "docker", "volume", "create", "--driver=local", "levanter")
+
+
+def list_tpus(zone):
+    tpus = subprocess.check_output(
+        [
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "list",
+            "--zone=" + zone,
+        ]
+    )
+    rows = tpus.decode("utf-8").split("\n")
+    header = rows[0].split()
+    tpus = []
+    for row in rows[1:]:
+        if row:
+            tpus.append(dict(zip(header, row.split())))
+    return tpus
+
+
+def start_tpu_vm(tpu_name, *, tpu_type, preemptible, version, zone, autodelete):
+    tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)])
+    if tpu_exists:
+        if not autodelete:
+            print("TPU already exists and autodelete is false, leaving it as is.")
+            return
+
+        print("TPU already exists, deleting...")
+        cli.run_command(
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "delete",
+            "--quiet",
+            f"--zone={zone}",
+            tpu_name,
+        )
+
+    print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...")
+    command = [
+        "gcloud",
+        "alpha",
+        "compute",
+        "tpus",
+        "tpu-vm",
+        "create",
+        tpu_name,
+        f"--accelerator-type={tpu_type}",
+        f"--version={version}",
+        "--zone=" + zone,
+        "--quiet",
+    ]
+    if preemptible:
+        command.append("--preemptible")
+    cli.run_command(*command)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    config = cli.load_config()
+
+    cli.add_arg(parser, config, ["--autodelete"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/rjpower/levanter:latest")
+    cli.add_arg(parser, config, ["--docker_repository"], default="levanter")
+    cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}")
+    cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
+    cli.add_arg(parser, config, ["--tpu_name"], required=True)
+    cli.add_arg(parser, config, ["--tpu_type"], required=True)
+    cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base")
+    cli.add_arg(parser, config, ["--zone"], required=True)
+    cli.add_arg(parser, config, ["--retries"], default=0, type=int)
+    cli.add_arg(parser, config, ["--run_id"], default=int(time.time()), type=int)
+
+    parser.add_argument(
+        # wrap in list() so argparse's append action can extend the defaults from `.config`
+        "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items())
+    )
+    parser.add_argument("command", nargs=argparse.REMAINDER)
+
+    args = parser.parse_args()
+
+    autodelete = args.autodelete
+    command = args.command
+    docker_base_image = args.docker_base_image
+    docker_repository = args.docker_repository
+    foreground = args.foreground
+    image_id = args.image_name
+    preemptible = args.preemptible
+    project = args.project
+    if args.retries < 0:
+        retries = 10000000
+    else:
+        retries = args.retries
+    tpu_name = args.tpu_name
+    tpu_type = args.tpu_type
+    version = args.version
+    zone = args.zone
+    run_id = args.run_id
+
+    region = "-".join(zone.split("-")[:-1])
+    env = {k: v for k, v in args.env}
+
+    if "WANDB_PROJECT" not in env:
+        env["WANDB_PROJECT"] = "levanter"
+
+    if command[0] == "--":
+        command = command[1:]
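+
+    # Launch (and retry) loop: preemptible TPUs can be reclaimed at any time, so each
+    # attempt re-creates the VM if necessary, rebuilds and pushes the image, and
+    # restarts the container. We stop after the first attempt that launches cleanly.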
+    for i in range(retries + 1):
+        try:
+            start_tpu_vm(
+                tpu_name=tpu_name,
+                tpu_type=tpu_type,
+                preemptible=preemptible,
+                version=version,
+                zone=zone,
+                autodelete=autodelete,
+            )
+
+            # We don't technically need to set up on every run, but if we are working on a
+            # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work.
+            setup_vm_docker(
+                tpu_name=tpu_name,
+                zone=zone,
+                docker_base_image=docker_base_image,
+            )
+
+            # make an image tag based on the unix timestamp to ensure we always pull the latest image
+            tag = int(time.time())
+
+            full_image_id = push_docker.push_to_gcp(
+                project_id=project,
+                region=region,
+                repository=docker_repository,
+                image_name=image_id,
+                tag=tag,
+                docker_file="docker/tpu/Dockerfile.incremental",
+            )
+
+            git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+
+            cli.tpu_ssh(tpu_name, zone, "docker", "stop", "levanter", "-t", "1", ignore_failure=True)
+            cli.tpu_ssh(tpu_name, zone, "docker", "rm", "-f", "levanter", ignore_failure=True)
+
+            docker_command = [
+                "docker",
+                "run",
+                "-t" if foreground else "-d",
+                "--name=levanter",
+                "--privileged",
+                "--shm-size=32gb",
+                "--net=host",
+                "--init",
+                "--mount",
+                "type=volume,source=levanter,target=/home/levanter",
+                "-v",
+                "/tmp:/tmp",
+                "-e",
+                f"WANDB_DOCKER={image_id}",
+                "-e",
+                f"GIT_COMMIT={git_commit}",
+                "-e",
+                f"RUN_ID={run_id}",
+            ]
+
+            for k, v in env.items():
+                docker_command.extend(["-e", k + f"='{str(v)}'"])
+
+            docker_command.extend([full_image_id, " ".join(command)])
+
+            print(f"Running on TPU {tpu_name}...")
+            cli.tpu_ssh(tpu_name, zone, *docker_command)
+            break  # success; don't relaunch
+        except subprocess.CalledProcessError:
+            print("Error running command.")
+            if i < retries:
+                print("Retrying... %d/%d" % (i + 1, retries))
diff --git a/infra/push_docker.py b/infra/push_docker.py
new file mode 100644
index 000000000..a0712ff85
--- /dev/null
+++ b/infra/push_docker.py
@@ -0,0 +1,213 @@
+#!/usr/bin/python
+
+"""
+Build and deploy the Levanter base image to Artifact Registry or the GitHub Container Registry.
+
+It is not necessary to run this yourself unless you are deploying a new base image: the launch
+script will automatically build and deploy an image based on your current code.
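+
+Example (hypothetical project and region):
+
+    python infra/push_docker.py --docker_target gcp --project my-gcp-project --region us-west4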
+""" + +import argparse +import json +import subprocess + +from infra.helpers import cli + + +GCP_CLEANUP_POLICY = [ + { + "name": "delete-stale", + "action": {"type": "Delete"}, + "condition": { + "olderThan": "86400s", + "tagState": "ANY", + }, + }, + { + "name": "keep-latest", + "action": {"type": "Keep"}, + "mostRecentVersions": { + "keepCount": 5, + }, + }, +] + + +def _run(*args, **kw): + print("Running ", " ".join(args[0])) + return subprocess.check_output(*args, **kw) + + +def configure_gcp_docker(project_id, region, repository): + """Setup Artifact registry repository and configure permissions to enable TPU access.""" + # check if the repository already exists + try: + _run( + ["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository], + stderr=subprocess.STDOUT, + ) + print(f"Found existing artifact registry repository `{repository}`, skipping setup.") + return + except subprocess.CalledProcessError as e: + if b"NOT_FOUND" not in e.output: + raise + + # Activate artifact registry and setup the repository. + _run(["gcloud", "services", "enable", "artifactregistry.googleapis.com"]) + + try: + _run( + [ + "gcloud", + "artifacts", + "repositories", + "create", + repository, + f"--location={region}", + "--repository-format=docker", + ], + stderr=subprocess.STDOUT, + ) + except subprocess.CalledProcessError as e: + # Ignore error if repository already exists. + if b"ALREADY_EXISTS" not in e.output: + print("Error creating repository: ", e.output) + raise + + with open("/tmp/cleanup-policy.json", "w") as f: + json.dump(GCP_CLEANUP_POLICY, f, indent=2) + + _run( + [ + "gcloud", + "artifacts", + "repositories", + "set-cleanup-policies", + f"--location={region}", + "--policy=/tmp/cleanup-policy.json", + repository, + ] + ) + + # Grant public read access ('allUsers') for TPU VMs + _run( + [ + "gcloud", + "artifacts", + "repositories", + "add-iam-policy-binding", + "--member=allUsers", + "--role=roles/artifactregistry.reader", + f"--location={region}", + repository, + ] + ) + + _run( + [ + "gcloud", + "--project", + project_id, + "artifacts", + "repositories", + "add-iam-policy-binding", + repository, + "--location", + region, + "--member", + "allUsers", + "--role", + "roles/artifactregistry.reader", + ] + ) + + _run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"]) + + +def build_docker(docker_file, image_name, tag) -> str: + """Builds a Docker image, enables artifact access, and pushes to Artifact Registry.""" + + _run( + [ + "docker", + "buildx", + "build", + "--platform=linux/amd64", + "-t", + f"{image_name}:{tag}", + "-f", + docker_file, + ".", + ] + ) + + return f"{image_name}:{tag}" + + +# Disabled until we can figure out how Docker hub organizations work +def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None): + """Pushes a local Docker image to Docker Hub.""" + + # Authenticate the docker service with Github if a token exists + if github_token: + login_process = subprocess.Popen( + ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE + ) + print(login_process.communicate(input=github_token.encode(), timeout=10)) + + remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}" + local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag) + + _run(["docker", "tag", local_name, remote_name]) + _run(["docker", "push", remote_name]) + return remote_name + + +def push_to_gcp(project_id, region, repository, image_name, tag, 
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Build and push the Levanter Docker image.")
+    config = cli.load_config()
+    cli.add_arg(parser, config, ["--project"], help="GCP project ID")
+    cli.add_arg(parser, config, ["--region"], help="Artifact Registry region (e.g., us-west4)")
+    cli.add_arg(parser, config, ["--repository"], default="levanter", help="Artifact Registry repository name")
+    cli.add_arg(parser, config, ["--image"], default="levanter", help="Docker image name.")
+    cli.add_arg(parser, config, ["--tag"], default="latest", help="Docker image tag.")
+    cli.add_arg(parser, config, ["--github_user"], default=None, help="Github user name.")
+    cli.add_arg(parser, config, ["--github_token"], default=None, help="Github token.")
+    cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.")
+
+    # push to either github or GCP
+    cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True)
+
+    args = parser.parse_args()
+
+    if args.docker_target == "github":
+        assert args.github_user, "Must specify --github_user when pushing to Github"
+        assert args.github_token, "Must specify --github_token when pushing to Github"
+        push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file)
+    else:
+        assert args.region, "Must specify --region when pushing to GCP"
+        assert args.project, "Must specify --project when pushing to GCP"
+        assert args.repository, "Must specify --repository when pushing to GCP"
+
+        push_to_gcp(
+            args.project,
+            args.region,
+            args.repository,
+            args.image,
+            args.tag,
+            docker_file=args.docker_file,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index 60e44e15b..527f88f7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,8 +67,6 @@ dev-mode-dirs = [".", "src"]
 [tool.hatch.metadata]
 allow-direct-references = true
 
-
-
 [tool.hatch.build.targets.wheel]
 packages = ["levanter"]
@@ -109,3 +107,13 @@ markers = [
     "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')",
     "ray: marks tests that require Ray (deselect with '-m \"not ray\"')",
 ]
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+    "flake8",
+    "soundfile",
+    "librosa",
+    "pytest-forked"
+]
\ No newline at end of file
diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py
index 9d41e935a..c98c0727c 100644
--- a/src/levanter/tracker/wandb.py
+++ b/src/levanter/tracker/wandb.py
@@ -208,6 +208,9 @@ def _git_settings(self):
         return other_settings
 
     def _get_git_sha(self, code_dir) -> Optional[str]:
+        # launch.py builds images without .git (see .dockerignore), so it passes the
+        # commit through the GIT_COMMIT environment variable instead.
+        if "GIT_COMMIT" in os.environ:
+            return os.environ["GIT_COMMIT"]
+
         try:
             repo = Repo(code_dir)
             git_sha = repo.head.commit.hexsha