Adjust docs to reflect new config format and clean up a few flags.

stanford-crfm · Jun 11, 2024 · 2099930 · 2099930
1 parent bd81df1
commit 2099930
Show file tree

Hide file tree

Showing 5 changed files with 76 additions and 52 deletions.
diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml
@@ -31,12 +31,14 @@ jobs:
           export TPU_NAME=ci-run-${{ github.run_id }}
           eval "$(ssh-agent -s)"
           TRUE_SHA=${{ github.event.pull_request.head.sha }}
-          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1
+          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1
+#          infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \
+#            PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry"
 
       - name: Run most tests
         run: |
           export TPU_NAME=ci-run-${{ github.run_id }}
-          python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry"
+          gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'"
 # Something's wrong with these
 #
 #      - name: Run forked tests

diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
@@ -95,25 +95,32 @@ First create a configuration file for future launches in your Levanter directory
 ```
 cat > .config <<EOF
 env:
-    WANDB_API_KEY:  ...
-    WANDB_ENTITY: ...
-    WANDB_PROJECT: levanter
-    HF_TOKEN: ...
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
 
 docker_repository: levanter
 zone: us-west4-a
-tpu: test-tpu
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
 EOF
 ```
 
-Everything after the `--` is run on each worker.
+Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker.
 
 ```bash
 python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>
 ```
 
-`launch.py` will package your directory and create and deploy a Docker image  on each worker.
-
 ### Launch a GPT-2 Small in interactive mode
 
 To run in the foreground, use `--foreground` with the `launch.py` script. For long-running jobs in this mode, you should use tmux or a similar tool. It's mostly for debugging.
@@ -124,18 +131,11 @@ python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config
 ### Babysitting Script
 
 If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates
-the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. The babysitting
-script handles both the creation of the node and the running of a job, and also relaunches the TPU VM if it gets preempted.
-It keeps running the command (and relaunching) until the command exits successfully.
-
-Note that the babysitting-script will automatically set the `RUN_ID` environment variable if not set, and pass it to the
-training command. This ensures that restarted jobs have the same run id, which is important for resumes to work.
-
-You can run it like this:
+the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. You can run `launch.py` with the `--retries` and `--foreground` parameters to accomplish this. If `--retries` is greater than 1, `launch.py` will automatically attempt to re-create the VM and re-run the command if it fails. (`--foreground` is necessary to keep the script from returning immediately.)
 
 ```bash
-infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible]  -- \
-    python infra/launch.py -- levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml
+    python infra/launch.py --retries=100 --foreground --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
+    --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
 That `--` is important! It separates the spin up args from the running args.
@@ -144,28 +144,26 @@ background mode will always return immediately.
 
 ### Running your own config
 
-If you want to run your own config, we suggest you start from one of the existing configs. Then, if you're not using
-an NFS server or similar, you should upload your config to GCS:
+If you want to run your own config, we suggest you start from one of the existing configs. Just copy it to
+a new file:
+
+`cp config/gpt2_small.yaml config/my_config.yaml`
+
+If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you
+can just reference the local config path in your command line:
 
-```bash
-gsutil cp my_config.yaml gs://my_bucket//my_config.yaml
 ```
 
 Afterward, you can use the config directly from the TPU VM instance, e.g.:
 
 ```bash
-infra/babysit-tpu-vm <name> -z <zone> -t <type> [--preemptible] -- \
-    python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
+    python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
     --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
-The `--config_path` argument can be a local path, a GCS path, or any URL loadable by fsspec.
 With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically
 try to load the latest checkpoint if it exists.
 
-Tokenizers are also loaded via fsspec, so you can use the same trick to load them from GCS if you have a custom
-tokenizer, or you can use an HF tokenizer.
-
 ## Common Issues
 ### (CRFM) Permission denied on `/files`
 

diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
@@ -398,15 +398,25 @@ This will spin up a TPU VM instance and install Levanter on it. You can then run
 
 ```
 cat > .config <<EOF
+cat > .config <<EOF
 env:
-    WANDB_API_KEY:  ...
-    WANDB_ENTITY: ...
-    WANDB_PROJECT: levanter
-    HF_TOKEN: ...
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
 
 docker_repository: levanter
 zone: us-west4-a
-tpu: test-tpu
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
 EOF
 ```
 

diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
@@ -179,16 +179,30 @@ infra/babysit-tpu-vm my-tpu -z us-east1-d -t v3-128 -- \
 
 #### Spin up and manual launch
 
-You should probably use the automated setup script, as described in the [relevant section of the TPU guide](../Getting-Started-TPU-VM.md#automatic-setup).
-Here's what that looks like:
+You can start up a TPU VM and launch your instance with `launch.py`. To simplify your command for multiple launches, you can put common parameters into `.config` in your `levanter` directory:
+
+cat > .config <<EOF
+env:
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+EOF
 
 ```bash
-bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
-```
-
-This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
-```bash
 python infra/launch.py -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml
 ```
 

diff --git a/infra/launch.py b/infra/launch.py
@@ -10,6 +10,7 @@
 
 
 def setup_vm_docker(tpu_name, zone, docker_base_image):
+    """Change docker permissions on `tpu_name` and set up the cache volume."""
     cli.tpu_ssh(
         tpu_name,
         zone,
@@ -52,9 +53,7 @@ def list_tpus(zone):
     return tpus
 
 
-def start_tpu_vm(
-    tpu_name, *, tpu_type, preemptible, version, zone, autodelete, project, docker_repository, docker_base_image
-):
+def start_tpu_vm(tpu_name, *, tpu_type, preemptible, version, zone, autodelete):
     tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)])
     if tpu_exists:
         if not autodelete:
@@ -104,11 +103,12 @@ def start_tpu_vm(
     cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}")
     cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true")
     cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
-    cli.add_arg(parser, config, ["--tpu"], required=True)
-    cli.add_arg(parser, config, ["--tpu_type"])
+    cli.add_arg(parser, config, ["--tpu_name"], required=True)
+    cli.add_arg(parser, config, ["--tpu_type"], required=True)
     cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base")
     cli.add_arg(parser, config, ["--zone"], required=True)
     cli.add_arg(parser, config, ["--retries"], default=0, type=int)
+    cli.add_arg(parser, config, ["--run_id"], default=int(time.time()), type=int)
 
     parser.add_argument(
         "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items()
@@ -129,10 +129,11 @@ def start_tpu_vm(
         retries = 10000000
     else:
         retries = args.retries
-    tpu_name = args.tpu
+    tpu_name = args.tpu_name
     tpu_type = args.tpu_type
     version = args.version
     zone = args.zone
+    run_id = args.run_id
 
     region = "-".join(zone.split("-")[:-1])
     env = {k: v for k, v in args.env}
@@ -152,19 +153,18 @@ def start_tpu_vm(
                 version=version,
                 zone=zone,
                 autodelete=autodelete,
-                project=project,
-                docker_repository=docker_repository,
-                docker_base_image=docker_base_image,
             )
 
+            # We don't technically need to set up on every run, but if we are working on a
+            # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work.
             setup_vm_docker(
                 tpu_name=tpu_name,
                 zone=zone,
                 docker_base_image=docker_base_image,
             )
 
             # make an image tag based on the unix timestamp to ensure we always pull the latest image
-            tag = run_id = int(time.time())
+            tag = int(time.time())
 
             full_image_id = push_docker.push_to_gcp(
                 project_id=project,