Setup Docker for TPU execution and update infra scripts. #601

Merged · 8 commits · Jun 11, 2024
Changes from all commits
4 changes: 3 additions & 1 deletion .dockerignore
@@ -1,3 +1,5 @@
.git

scratch
cache
wandb
@@ -44,6 +46,7 @@ instance/

# Sphinx documentation
docs/_build/
docs/figures/

# PyBuilder
target/
@@ -105,7 +108,6 @@ dmypy.json
# JetBrains
.idea/


# dataset cache files
**/*.parquet
**/ledger.json
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,5 +1,8 @@
/scratch

# Configuration for TPU launches/secrets
.config

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -140,6 +143,7 @@ dmypy.json
/wandb

# dataset cache files
/cache
*.parquet
ledger.json

18 changes: 18 additions & 0 deletions docker/tpu/Dockerfile.base
@@ -0,0 +1,18 @@
FROM python:3.10 AS build
RUN apt-get update && apt-get install -y clang
RUN pip install virtualenv

# venv binaries encode their directory, so we need to setup the venv in the final location
RUN virtualenv -p python3.10 /opt/levanter/.venv
ENV PATH /opt/levanter/.venv/bin:$PATH
RUN /opt/levanter/.venv/bin/pip install -U hatch "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html

# Install package dependencies to make incremental builds faster.
WORKDIR /tmp/
ADD pyproject.toml README.md /tmp/
RUN pip install $(hatch dep show requirements --all)

FROM python:3.10

WORKDIR /opt/levanter
COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
17 changes: 17 additions & 0 deletions docker/tpu/Dockerfile.incremental
@@ -0,0 +1,17 @@
ARG IMAGE=ghcr.io/rjpower/levanter
ARG TAG=latest

FROM ${IMAGE}:${TAG}

ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60 \
    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024 \
    RAY_USAGE_STATS_ENABLED=0 \
    PATH=/opt/levanter/.venv/bin:$PATH \
    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests \
    HOME=/home/levanter

WORKDIR /opt/levanter

ADD pyproject.toml README.md /opt/levanter/
RUN pip install -e '.[test]'
ADD . /opt/levanter
81 changes: 53 additions & 28 deletions docs/Getting-Started-TPU-VM.md
@@ -85,63 +85,88 @@ the VM. That's explained down below in the [Running Levanter GPT-2](#running-lev
## Running Levanter GPT-2
Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts:

### Launch a GPT-2 Small in unattended mode

You will need a [Docker installation](https://docs.docker.com/engine/install/)
on your development machine to build and run images on TPUs.

First create a configuration file for future launches in your Levanter directory:

```bash
cat > .config <<EOF
env:
  WANDB_API_KEY:
  WANDB_ENTITY:
  WANDB_PROJECT:
  HF_TOKEN:
  TPU_STDERR_LOG_LEVEL: 0
  TPU_MIN_LOG_LEVEL: 0
  LIBTPU_INIT_ARGS: <extra args to libtpu>

docker_repository: levanter
zone: us-west4-a
tpu_name: test-spin-up-32
tpu_type: "v5litepod-16"
vm_image: "tpu-ubuntu2204-base"
preemptible: true
autodelete: false
subnetwork: "default"

EOF
```
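The `.config` file is plain YAML. As a quick sanity check of the shape shown above, here is a sketch of parsing a trimmed-down version of it (this assumes PyYAML is installed, which the infra helpers use via `yaml.SafeLoader`):

```python
import yaml  # PyYAML; the infra helpers read .config with yaml's safe loader

# A trimmed-down .config, matching the shape shown above.
config_text = """
env:
  WANDB_PROJECT: my-project
  TPU_STDERR_LOG_LEVEL: 0

docker_repository: levanter
zone: us-west4-a
tpu_type: "v5litepod-16"
preemptible: true
"""

config = yaml.safe_load(config_text)
assert config["tpu_type"] == "v5litepod-16"
assert config["preemptible"] is True
assert config["env"]["WANDB_PROJECT"] == "my-project"
```

Keys you leave blank (like `WANDB_API_KEY:` above) simply parse as `None`, so you can keep placeholders in the file and fill them in later.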

Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker.

By default, `launch.py` runs the command in the background and redirects stdout and stderr to a log file in the home directory on each worker.
```bash
python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>
```

### Launch a GPT-2 Small in interactive mode

To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or similar for long-running jobs in this mode; it's mostly for debugging.
```bash
python infra/launch.py --foreground -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>
```

### Babysitting Script

If you are using a preemptible TPU VM, you probably want `launch.py` to automatically re-create
the VM, because preemptible instances can be preempted and are always killed within 24 hours.
Run `launch.py` with the `--retries` and `--foreground` parameters to accomplish this. If `--retries` is
greater than 1, `launch.py` will automatically attempt to re-create the VM and re-run the command if it
fails. (`--foreground` is necessary to keep the script from returning immediately.)

```bash
python infra/launch.py --retries=100 --foreground --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
--trainer.checkpointer.base_path gs://path/to/checkpoints/
```
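The `--retries` behavior can be pictured as a simple retry loop. The sketch below is a hypothetical illustration, not `launch.py`'s actual code — the real script also re-creates the TPU VM between attempts:

```python
import subprocess
import sys

def run_with_retries(cmd, retries):
    # Re-run cmd until it exits 0 or attempts are exhausted,
    # mirroring the spirit of launch.py's --retries flag (sketch only).
    for attempt in range(retries + 1):
        if subprocess.run(cmd).returncode == 0:
            return attempt  # number of failed attempts before success
        print(f"attempt {attempt} failed; retrying")
    raise RuntimeError(f"command failed after {retries + 1} attempts")

ok = [sys.executable, "-c", "raise SystemExit(0)"]
assert run_with_retries(ok, retries=3) == 0  # succeeds on the first attempt
```

This is also why `--foreground` matters here: the loop only works if each attempt blocks until the command actually finishes.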

That `--` is important! It separates the spin-up args from the args to run.
Also, you should always use `--foreground` with `babysit-tpu-vm`, as the
background mode will always return immediately.

### Running your own config

If you want to run your own config, we suggest you start from one of the existing configs. Just copy it to
a new file:

`cp config/gpt2_small.yaml config/my_config.yaml`

If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you
can just reference the local config path in your command line:


```bash
python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
--trainer.checkpointer.base_path gs://path/to/checkpoints/
```

The `--config_path` argument can be a local path, a GCS path, or any URL loadable by fsspec.
With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically
try to load the latest checkpoint if it exists.
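The "latest checkpoint" lookup can be pictured as picking the highest step number under the checkpoint base path. This is an illustration only — the `step-N` directory naming and the helper below are assumptions for the sketch, not Levanter's actual checkpoint layout:

```python
import tempfile
from pathlib import Path

def latest_checkpoint(base_path: Path):
    # Pick the checkpoint directory with the highest step number, if any
    # (hypothetical layout: one "step-<N>" directory per checkpoint).
    steps = sorted(base_path.glob("step-*"), key=lambda p: int(p.name.split("-")[1]))
    return steps[-1] if steps else None

base = Path(tempfile.mkdtemp())
for step in (100, 2000, 500):
    (base / f"step-{step}").mkdir()

latest = latest_checkpoint(base)
assert latest is not None and latest.name == "step-2000"
```

Note the numeric sort: a plain lexicographic sort would put `step-500` after `step-2000`, which is why the key parses the step number.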

Tokenizers are also loaded via fsspec, so you can use the same trick to load them from GCS if you have a custom
tokenizer, or you can use an HF tokenizer.
Tokenizers and configuration files are loaded via `fsspec`, which supports remote
filesystems, so you can also copy your tokenizer or config file to GCS and use
a `gs://` path to access it.
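Conceptually, `fsspec` picks a filesystem backend from the path's URL scheme, so `gs://`, `s3://`, and plain local paths all go through one `open` call. A stdlib-only sketch of that dispatch (a hypothetical helper handling only local paths; real `gs://` support needs a backend like `gcsfs`):

```python
import tempfile
from urllib.parse import urlparse

def open_by_scheme(path, mode="r"):
    # fsspec-style dispatch: the URL scheme selects the filesystem backend.
    scheme = urlparse(path).scheme
    if scheme in ("", "file"):  # local filesystem
        local = path[len("file://"):] if scheme == "file" else path
        return open(local, mode)
    raise NotImplementedError(f"scheme {scheme!r} needs a backend (e.g. gcsfs for gs://)")

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    f.write("model: gpt2\n")
with open_by_scheme(f.name) as fh:
    assert fh.read() == "model: gpt2\n"
```

The point is that callers never branch on where the file lives — the same code path reads a local config during development and a `gs://` config on the TPU VM.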

## Common Issues
### (CRFM) Permission denied on `/files`
25 changes: 24 additions & 1 deletion docs/Training-On-Your-Data.md
@@ -395,8 +395,31 @@ bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128

This will spin up a TPU VM instance and install Levanter on it. Create a `.config` file with your launch settings, then run a command like so:


```bash
cat > .config <<EOF
env:
  WANDB_API_KEY:
  WANDB_ENTITY:
  WANDB_PROJECT:
  HF_TOKEN:
  TPU_STDERR_LOG_LEVEL: 0
  TPU_MIN_LOG_LEVEL: 0
  LIBTPU_INIT_ARGS: <extra args to libtpu>

docker_repository: levanter
zone: us-west4-a
tpu_type: "v5litepod-16"
vm_image: "tpu-ubuntu2204-base"
preemptible: true
autodelete: false
subnetwork: "default"

EOF
```

```bash
python infra/launch.py --tpu_name=my_tpu -- python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml
```

## Monitoring
29 changes: 21 additions & 8 deletions docs/tutorials/Training-On-Audio-Data.md
@@ -179,17 +179,30 @@ infra/babysit-tpu-vm my-tpu -z us-east1-d -t v3-128 -- \

#### Spin up and manual launch

You can start up a TPU VM and launch your instance with `launch.py`. To simplify your command for multiple launches, you can put common parameters into `.config` in your `levanter` directory:

```bash
cat > .config <<EOF
env:
  WANDB_API_KEY:
  WANDB_ENTITY:
  WANDB_PROJECT:
  HF_TOKEN:
  TPU_STDERR_LOG_LEVEL: 0
  TPU_MIN_LOG_LEVEL: 0
  LIBTPU_INIT_ARGS: <extra args to libtpu>

docker_repository: levanter
zone: us-west4-a
tpu_type: "v5litepod-16"
vm_image: "tpu-ubuntu2204-base"
preemptible: true
autodelete: false
subnetwork: "default"
EOF
```


```bash
python infra/launch.py --tpu_name=my_tpu -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml
```

### GPU
Empty file added infra/__init__.py
Empty file.
79 changes: 79 additions & 0 deletions infra/helpers/cli.py
@@ -0,0 +1,79 @@
import argparse
import os
import subprocess
import typing

from google.cloud import storage
import yaml


def run_command(*args, **kwargs):
print("Running:", " ".join(list(args)))
return subprocess.check_call(args, **kwargs)


def add_ssh_key(ssh_key_filename):
    # `ssh-keygen -lf` output format: 3072 SHA256:... key-name (RSA)
    key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1]
    try:
        existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n")
    except subprocess.CalledProcessError:
        # `ssh-add -l` exits non-zero when the agent has no keys loaded
        existing_keys = []
    for key in existing_keys:
        if key_hash in key:
            return

    subprocess.check_call(["ssh-add", ssh_key_filename])


def tpu_ssh(tpu_name, zone, *args, ignore_failure=False):
add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine"))
try:
return run_command(
"gcloud",
"alpha",
"compute",
"tpus",
"tpu-vm",
"ssh",
tpu_name,
"--worker=all",
f"--zone={zone}",
"--command=%s" % " ".join(args),
)
except subprocess.CalledProcessError as e:
if ignore_failure:
print("Ignoring failure:", e)
else:
raise


# Oddly enough, there's no API to simply fetch the current gcloud configuration...
def gcloud_config():
client = storage.Client()
return {
"project": client.project,
}


def add_arg(
parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
):
"""Add an argument to the parser, using `config` or the environment to resolve default values."""
key = flags[0].lstrip("-").replace("-", "_")
if key in config:
default = config[key]

if key.upper() in os.environ:
default = os.environ[key.upper()]

if default is not None:
kw["default"] = default
elif required:
kw["required"] = True

parser.add_argument(*flags, **kw)


def load_config():
    if os.path.exists(".config"):
        with open(".config", "r") as f:
            return yaml.load(f, Loader=yaml.SafeLoader)
    else:
        return {}
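As an illustration of the default-resolution order in `add_arg` above — an environment variable beats the `.config` entry, which beats the hard-coded default, and an explicit CLI flag beats everything — here is a self-contained restatement of the same logic:

```python
import argparse
import os

def add_arg(parser, config, flags, required=False, default=None, **kw):
    # Same precedence as infra/helpers/cli.py: .config value, then env override.
    key = flags[0].lstrip("-").replace("-", "_")
    if key in config:
        default = config[key]
    if key.upper() in os.environ:
        default = os.environ[key.upper()]
    if default is not None:
        kw["default"] = default
    elif required:
        kw["required"] = True
    parser.add_argument(*flags, **kw)

config = {"zone": "us-west4-a", "tpu_type": "v5litepod-16"}
os.environ["ZONE"] = "us-east1-d"  # environment overrides the .config value

parser = argparse.ArgumentParser()
add_arg(parser, config, ["--zone"])
add_arg(parser, config, ["--tpu_type"])
add_arg(parser, config, ["--tpu_name"], required=True)

args = parser.parse_args(["--tpu_name", "my_tpu"])
assert args.zone == "us-east1-d"        # env var wins over .config
assert args.tpu_type == "v5litepod-16"  # .config wins over the None default
```

A nice side effect of this scheme is that `required=True` only bites when no other source supplied a value, so `.config` can satisfy "required" arguments.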