diff --git a/.dockerignore b/.dockerignore
index f5ceb7397..17fbbcfe1 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,5 @@
+.git
+.config
 scratch
 cache
 wandb
@@ -44,6 +46,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/figures/
 
 # PyBuilder
 target/
@@ -105,7 +108,6 @@ dmypy.json
 # JetBrains
 .idea/
 
-
 # dataset cache files
 **/*.parquet
 **/ledger.json
diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml
index 3e27426eb..4f3eaccb3 100644
--- a/.github/workflows/tpu_unit_tests.yaml
+++ b/.github/workflows/tpu_unit_tests.yaml
@@ -31,14 +31,12 @@ jobs:
           export TPU_NAME=ci-run-${{ github.run_id }}
           eval "$(ssh-agent -s)"
           TRUE_SHA=${{ github.event.pull_request.head.sha }}
-          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1
-#          infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \
-#            PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry"
+          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1
 
       - name: Run most tests
         run: |
           export TPU_NAME=ci-run-${{ github.run_id }}
-          gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'"
+          python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry"
 # Something's wrong with these
 #
 #      - name: Run forked tests
diff --git a/.gitignore b/.gitignore
index c66f6f352..835da2048 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 /scratch
 
+# Configuration for TPU launches/secrets
+.config
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -140,6 +143,7 @@ dmypy.json
 /wandb
 
 # dataset cache files
+/cache
 *.parquet
 ledger.json
 
diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base
new file mode 100644
index 000000000..9a93736b1
--- /dev/null
+++ b/docker/tpu/Dockerfile.base
@@ -0,0 +1,17 @@
+FROM python:3.10 AS build
+RUN apt-get update && apt-get install -y --no-install-recommends clang && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir virtualenv
+
+# venv binaries encode their directory, so we need to setup the venv in the final location
+RUN virtualenv -p python3.10 /opt/levanter/.venv
+RUN /opt/levanter/.venv/bin/pip install --no-cache-dir -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+
+# Copy only the requirements files so the dependency layer stays cached until they change
+WORKDIR /tmp
+COPY pyproject.toml README.md /tmp/
+RUN /opt/levanter/.venv/bin/pip install --no-cache-dir -e '.[test]'
+
+# Runtime stage: only the populated virtualenv is carried over from the build stage
+FROM python:3.10
+WORKDIR /opt/levanter
+COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
\ No newline at end of file
diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental
new file mode 100644
index 000000000..10afb1ca7
--- /dev/null
+++ b/docker/tpu/Dockerfile.incremental
@@ -0,0 +1,17 @@
+ARG IMAGE=ghcr.io/rjpower/levanter
+ARG TAG=latest
+
+FROM ${IMAGE}:${TAG}
+
+ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60 \
+    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024 \
+    RAY_USAGE_STATS_ENABLED=0 \
+    PATH=/opt/levanter/.venv/bin:$PATH \
+    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests \
+    HOME=/home/levanter
+
+WORKDIR /opt/levanter
+# Install dependencies from the manifest first so this layer is reused when only the sources change
+COPY pyproject.toml README.md /opt/levanter/
+RUN pip install --no-cache-dir -e '.[test]'
+COPY . /opt/levanter
\ No newline at end of file
diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
index fe73eef70..1d2a50945 100644
--- a/docs/Getting-Started-TPU-VM.md
+++ b/docs/Getting-Started-TPU-VM.md
@@ -85,18 +85,40 @@ the VM. That's explained down below in the [Running Levanter GPT-2](#running-lev
 ## Running Levanter GPT-2
 Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts:
 
-### Launch a GPT-2 Small in unattended mode (using nohup)
+### Launch a GPT-2 Small in unattended mode
+
+You will need a [Docker installation](https://docs.docker.com/engine/install/)
+on your development machine to build and run images on TPUs.
+
+First create a configuration file for future launches in your Levanter directory:
+
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:  ...
+    WANDB_ENTITY: ...
+    WANDB_PROJECT: levanter
+    HF_TOKEN: ...
+
+docker_repository: levanter
+zone: us-west4-a
+tpu: test-tpu
+EOF
+```
+
+Everything after the `--` is run on each worker.
+
 ```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>'
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>
 ```
 
-`launch.sh` will run the command in the background and redirect stdout and stderr to a log file in the home directory
-on each worker.
+`launch.py` will package your directory into a Docker image, push it, and run it on each worker.
 
 ### Launch a GPT-2 Small in interactive mode
-This version writes to the terminal, you should use tmux or something for long running jobs for this version. It's mostly for debugging.
+
+To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long running jobs for this version. It's mostly for debugging.
 ```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>'
+python infra/launch.py --foreground -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>
 ```
 
 ### Babysitting Script
@@ -113,11 +135,12 @@ You can run it like this:
 
 ```bash
 infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible]  -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml
+    python infra/launch.py --foreground -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml
 ```
 
-That `--` is important! It separates the spin up args from the running args. Also, you should never use `launch.sh`
-with `babysit`, because nohup exits immediately with exit code 0.
+That `--` is important! It separates the spin up args from the running args.
+Also, you should always use `--foreground` with `babysit-tpu-vm`, as the
+background mode will always return immediately.
 
 ### Running your own config
 
@@ -132,7 +155,7 @@ Afterward, you can use the config directly from the TPU VM instance, e.g.:
 
 ```bash
 infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible] -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
+    python infra/launch.py --foreground -- python src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
     --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
index c14b0ba66..cac96de31 100644
--- a/docs/Training-On-Your-Data.md
+++ b/docs/Training-On-Your-Data.md
@@ -395,8 +395,23 @@ bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
 
 This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
+
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:  ...
+    WANDB_ENTITY: ...
+    WANDB_PROJECT: levanter
+    HF_TOKEN: ...
+
+docker_repository: levanter
+zone: us-west4-a
+tpu: test-tpu
+EOF
+```
+
 ```bash
-gcloud compute tpus tpu-vm ssh my-tpu   --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml"
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml
 ```
 
 ## Monitoring
diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
index 235b2e79b..bdab91c43 100644
--- a/docs/tutorials/Training-On-Audio-Data.md
+++ b/docs/tutorials/Training-On-Audio-Data.md
@@ -189,7 +189,7 @@ bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
 This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
 ```bash
-gcloud compute tpus tpu-vm ssh my-tpu   --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml"
+python infra/launch.py -- python src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml
 ```
 
 ### GPU
diff --git a/infra/__init__.py b/infra/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py
new file mode 100644
index 000000000..2abc50de7
--- /dev/null
+++ b/infra/helpers/cli.py
@@ -0,0 +1,74 @@
+import argparse
+import os
+import subprocess
+import typing
+
+from google.cloud import storage
+import yaml
+
+
+def run_command(*args, **kwargs):  # echo the command line, then run it; a non-zero exit raises CalledProcessError
+    print("Running:", " ".join(args))
+    return subprocess.check_call(args, **kwargs)
+
+
+def add_ssh_key(ssh_key_filename):
+    # format 3072 SHA256:... key-name (RSA)
+    key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1]
+    existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n")
+    for key in existing_keys:
+        if key_hash in key:
+            print('Found existing key in ssh-agent, skipping "ssh-add"')
+            return
+
+    subprocess.check_call(["ssh-add", ssh_key_filename])
+
+
+def tpu_ssh(tpu_name, zone, *args):
+    add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine"))
+    return run_command(
+        "gcloud",
+        "alpha",
+        "compute",
+        "tpus",
+        "tpu-vm",
+        "ssh",
+        tpu_name,
+        "--worker=all",
+        f"--zone={zone}",
+        "--command=%s" % " ".join(args),
+    )
+
+
+# Oddly enough, there's no API to simply fetch the current gcloud configuration...
+def gcloud_config():
+    client = storage.Client()
+    return {
+        "project": client.project,
+    }
+
+
+def add_arg(
+    parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
+):
+    """Add an argument to the parser, using `config` or the environment to resolve default values."""
+    key = flags[0].lstrip("-").replace("-", "_")
+    if key in config:
+        default = config[key]
+
+    if key.upper() in os.environ:
+        default = os.environ[key.upper()]
+
+    if default is not None:
+        kw["default"] = default
+    elif required:
+        kw["required"] = True
+
+    parser.add_argument(*flags, **kw)
+
+
+def load_config():
+    if os.path.exists(".config"):
+        return yaml.load(open(".config", "r"), Loader=yaml.SafeLoader)
+    else:
+        return {}
diff --git a/infra/launch.py b/infra/launch.py
new file mode 100755
index 000000000..9fe544844
--- /dev/null
+++ b/infra/launch.py
@@ -0,0 +1,206 @@
+#!/usr/bin/python
+
+import argparse
+import getpass
+import subprocess
+import time
+
+from infra import push_docker
+
+from infra.helpers import cli
+
+
+def setup_vm_docker(tpu_name, zone, docker_base_image):
+    cli.tpu_ssh(
+        tpu_name,
+        zone,
+        "sudo",
+        "usermod",
+        "-aG",
+        "docker",
+        getpass.getuser(),
+    )
+
+    cli.tpu_ssh(
+        tpu_name,
+        zone,
+        "docker",
+        "pull",
+        docker_base_image,
+    )
+
+    cli.tpu_ssh(tpu_name, zone, "docker", "volume", "create", "--driver=local", "levanter")
+
+
+def list_tpus(zone):
+    tpus = subprocess.check_output(
+        [
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "list",
+        ]
+    )
+    rows = tpus.decode("utf-8").split("\n")
+    header = rows[0].split()
+    tpus = []
+    for row in rows[1:]:
+        if row:
+            tpus.append(dict(zip(header, row.split())))
+    return tpus
+
+
+def start_tpu_vm(
+    tpu_name, *, tpu_type, preemptible, version, zone, autodelete, project, docker_repository, docker_base_image
+):
+    tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)])
+    if tpu_exists:
+        if not autodelete:
+            print("TPU already exists and autodelete is false, leaving it as is.")
+            return
+
+        print("TPU already exists, deleting...")
+        cli.run_command(
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "delete",
+            "--quiet",
+            f"--zone={zone}",
+            tpu_name,
+        )
+
+    print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...")
+    cli.run_command(
+        "gcloud",
+        "alpha",
+        "compute",
+        "tpus",
+        "tpu-vm",
+        "create",
+        tpu_name,
+        f"--accelerator-type={tpu_type}",
+        f"--version={version}",
+        "--zone=" + zone,
+        "--preemptible" if preemptible else "",
+        "--quiet",
+    )
+
+    setup_vm_docker(
+        tpu_name=tpu_name,
+        zone=zone,
+        docker_base_image=docker_base_image,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    config = cli.load_config()
+
+    cli.add_arg(parser, config, ["--autodelete"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--docker-base-image"], default="ghcr.io/rjpower/levanter:latest")
+    cli.add_arg(parser, config, ["--docker-repository"], default="levanter")
+    cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--image-name"], default=f"levanter-{getpass.getuser()}")
+    cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
+    cli.add_arg(parser, config, ["--tpu"], required=True)
+    cli.add_arg(parser, config, ["--tpu-type"])
+    cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base")
+    cli.add_arg(parser, config, ["--zone"], required=True)
+    cli.add_arg(parser, config, ["--retries"], default=0, type=int)
+
+    parser.add_argument(
+        "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items()
+    )
+    parser.add_argument("command", nargs=argparse.REMAINDER)
+
+    args = parser.parse_args()
+
+    autodelete = args.autodelete
+    command = args.command
+    docker_base_image = args.docker_base_image
+    docker_repository = args.docker_repository
+    foreground = args.foreground
+    image_id = args.image_name
+    preemptible = args.preemptible
+    project = args.project
+    if args.retries < 0:
+        retries = 10000000
+    else:
+        retries = args.retries
+    tpu_name = args.tpu
+    tpu_type = args.tpu_type
+    version = args.version
+    zone = args.zone
+
+    region = "-".join(zone.split("-")[:-1])
+    env = {k: v for k, v in args.env}
+
+    if "WANDB_PROJECT" not in env:
+        env["WANDB_PROJECT"] = "levanter"
+
+    if command[0] == "--":
+        command = command[1:]
+
+    for i in range(retries + 1):
+        try:
+            start_tpu_vm(
+                tpu_name=tpu_name,
+                tpu_type=tpu_type,
+                preemptible=preemptible,
+                version=version,
+                zone=zone,
+                autodelete=autodelete,
+                project=project,
+                docker_repository=docker_repository,
+                docker_base_image=docker_base_image,
+            )
+
+            # make an image tag based on the unix timestamp to ensure we always pull the latest image
+            tag = run_id = int(time.time())
+
+            full_image_id = push_docker.push_to_gcp(
+                project_id=project,
+                region=region,
+                repository=docker_repository,
+                image_name=image_id,
+                tag=tag,
+                docker_file="docker/tpu/Dockerfile.incremental",
+            )
+
+            git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+            docker_command = [
+                "docker",
+                "run",
+                "-t" if foreground else "-d",
+                "--privileged",
+                "--shm-size=32gb",
+                "--net=host",
+                "--init",
+                "--mount",
+                "type=volume,source=levanter,target=/home/levanter",
+                "-v",
+                "/tmp:/tmp",
+                "-e",
+                f"WANDB_DOCKER={image_id}",
+                "-e",
+                f"GIT_COMMIT={git_commit}",
+                "-e",
+                f"RUN_ID={run_id}",
+            ]
+
+            for k, v in env.items():
+                docker_command.extend(["-e", k + f"='{str(v)}'"])
+
+            docker_command.extend([full_image_id, " ".join(command)])
+
+            print(f"Running on tpu_name... {tpu_name}")
+            cli.tpu_ssh(tpu_name, zone, *docker_command)
+        except subprocess.CalledProcessError as e:
+            print("Error running command.")
+            if i < retries - 1:
+                print("Retrying... %d/%d" % (i + 1, retries))
diff --git a/infra/push_docker.py b/infra/push_docker.py
new file mode 100644
index 000000000..80115d746
--- /dev/null
+++ b/infra/push_docker.py
@@ -0,0 +1,202 @@
+#!/usr/bin/python
+
+"""
+Build and deploy the Levanter base image to Artifact Registry or the GitHub container registry (ghcr.io).
+
+It is not necessary to run this yourself unless you are deploying a new base image: the launch
+script will automatically build and deploy an image based on your current code.
+"""
+
+import argparse
+from calendar import c
+import json
+import os
+import subprocess
+
+from infra.helpers import cli
+
+GCP_CLEANUP_POLICY = [  # cleanup policies written to JSON and passed to `set-cleanup-policies` below
+    {
+        "name": "delete-stale",
+        "action": {"type": "Delete"},
+        "condition": {
+            "olderThan": "86400s",  # 86400 seconds = 24 hours
+            "tagState": "ANY",
+        },
+    },
+    {
+        "name": "keep-latest",
+        "action": {"type": "Keep"},
+        "mostRecentVersions": {
+            "keepCount": 5,
+        },
+    },
+]
+
+
+def _run(*args, **kw):  # echo the command (args[0], a list of strings), then return its captured stdout
+    print("Running ", " ".join(args[0]))
+    return subprocess.check_output(*args, **kw)
+
+
+def configure_gcp_docker(project_id, region, repository):
+    """Setup Artifact registry repository and configure permissions to enable TPU access."""
+    # Activate artifact registry and setup the repository.
+    _run(["gcloud", "services", "enable", "artifactregistry.googleapis.com"])
+
+    try:
+        _run(
+            [
+                "gcloud",
+                "artifacts",
+                "repositories",
+                "create",
+                repository,
+                f"--location={region}",
+                "--repository-format=docker",
+            ],
+            stderr=subprocess.STDOUT,
+        )
+    except subprocess.CalledProcessError as e:
+        # Ignore error if repository already exists.
+        if b"ALREADY_EXISTS" not in e.output:
+            print("Error creating repository: ", e.output)
+            raise
+
+    with open("/tmp/cleanup-policy.json", "w") as f:
+        json.dump(GCP_CLEANUP_POLICY, f, indent=2)
+
+    _run(
+        [
+            "gcloud",
+            "artifacts",
+            "repositories",
+            "set-cleanup-policies",
+            f"--location={region}",
+            "--policy=/tmp/cleanup-policy.json",
+            repository,
+        ]
+    )
+
+    # Grant public read access ('allUsers') for TPU VMs
+    _run(
+        [
+            "gcloud",
+            "artifacts",
+            "repositories",
+            "add-iam-policy-binding",
+            "--member=allUsers",
+            "--role=roles/artifactregistry.reader",
+            f"--location={region}",
+            repository,
+        ]
+    )
+
+    _run(
+        [
+            "gcloud",
+            "--project",
+            project_id,
+            "artifacts",
+            "repositories",
+            "add-iam-policy-binding",
+            repository,
+            "--location",
+            region,
+            "--member",
+            "allUsers",
+            "--role",
+            "roles/artifactregistry.reader",
+        ]
+    )
+
+    _run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"])
+
+
+def build_docker(docker_file, image_name, tag) -> str:
+    """Builds a Docker image, enables artifact access, and pushes to Artifact Registry."""
+
+    _run(
+        [
+            "docker",
+            "buildx",
+            "build",
+            "--platform=linux/amd64",
+            "-t",
+            f"{image_name}:{tag}",
+            "-f",
+            docker_file,
+            ".",
+        ]
+    )
+
+    return f"{image_name}:{tag}"
+
+
+# Disabled until we can figure out how Docker hub organizations work
+def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None):
+    """Pushes a local Docker image to Docker Hub."""
+
+    # Authenticate the docker service with Github if a token exists
+    if github_token:
+        login_process = subprocess.Popen(
+            ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE
+        )
+        print(login_process.communicate(input=github_token.encode(), timeout=10))
+
+    remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}"
+    local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag)
+
+    _run(["docker", "tag", local_name, remote_name])
+    _run(["docker", "push", remote_name])
+    return remote_name
+
+
+def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> str:
+    """Pushes a local Docker image to Artifact Registry."""
+    configure_gcp_docker(project_id, region, repository)
+    local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag)
+
+    artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"
+
+    full_image_name = f"{artifact_repo}/{image_name}:{tag}"
+    _run(["docker", "tag", local_image, full_image_name])
+    _run(["docker", "push", full_image_name])
+
+    return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{tag}"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.")
+    config = cli.load_config()
+    cli.add_arg(parser, config, ["--project"], help="GCP project ID")
+    cli.add_arg(parser, config, ["--region"], help="Artifact Registry region (e.g., us-west4)")
+    cli.add_arg(parser, config, ["--repository"], default="levanter", help="Artifact Registry repository name")
+    cli.add_arg(parser, config, ["--image"], default="levanter", help="Docker image name.")
+    cli.add_arg(parser, config, ["--tag"], default="latest", help="Docker image tag.")
+    cli.add_arg(parser, config, ["--github_user"], default=None, help="Github user name.")
+    cli.add_arg(parser, config, ["--github_token"], default=None, help="Github token.")
+    cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.")
+
+    # push to either github or GCP
+    cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True)
+
+    args = parser.parse_args()
+
+    if args.docker_target == "github":
+        assert args.github_user, "Must specify --github_user when pushing to Github"
+        assert args.github_token, "Must specify --github_token when pushing to Github"
+        push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file)
+    else:
+        assert args.region, "Must specify --region when pushing to GCP"
+        assert args.project, "Must specify --project when pushing to GCP"
+        assert args.repository, "Must specify --repository when pushing to GCP"
+
+        push_to_gcp(
+            args.project,
+            args.region,
+            args.repository,
+            args.image,
+            args.tag,
+            docker_file=args.docker_file,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index f17a26791..76bdd8864 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,8 +67,6 @@ dev-mode-dirs = [".", "src"]
 [tool.hatch.metadata]
 allow-direct-references = true
 
-
-
 [tool.hatch.build.targets.wheel]
 packages = ["levanter"]
 
@@ -109,3 +107,12 @@ markers = [
     "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')",
     "ray: marks tests that require Ray (deselect with '-m \"not ray\"')",
 ]
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+    "flake8",
+    "soundfile",
+    "librosa",
+    "pytest-forked"
+]
\ No newline at end of file
diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py
index 9d41e935a..c98c0727c 100644
--- a/src/levanter/tracker/wandb.py
+++ b/src/levanter/tracker/wandb.py
@@ -208,6 +208,9 @@ def _git_settings(self):
         return other_settings
 
     def _get_git_sha(self, code_dir) -> Optional[str]:
+        if "GIT_COMMIT" in os.environ:
+            return os.environ["GIT_COMMIT"]
+
         try:
             repo = Repo(code_dir)
             git_sha = repo.head.commit.hexsha