Adjust docs to reflect new config format and clean up a few flags.

stanford-crfm · Jun 11, 2024 · c64c38d · c64c38d
1 parent bd81df1
commit c64c38d
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 31 deletions.
diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml
@@ -31,12 +31,14 @@ jobs:
           export TPU_NAME=ci-run-${{ github.run_id }}
           eval "$(ssh-agent -s)"
           TRUE_SHA=${{ github.event.pull_request.head.sha }}
-          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1
+          bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1
+#          infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \
+#            PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry"
 
       - name: Run most tests
         run: |
           export TPU_NAME=ci-run-${{ github.run_id }}
-          python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry"
+          gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'"
 # Something's wrong with these
 #
 #      - name: Run forked tests

diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
@@ -95,25 +95,32 @@ First create a configuration file for future launches in your Levanter directory
 ```
 cat > .config <<EOF
 env:
-    WANDB_API_KEY:  ...
-    WANDB_ENTITY: ...
-    WANDB_PROJECT: levanter
-    HF_TOKEN: ...
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
 
 docker_repository: levanter
 zone: us-west4-a
-tpu: test-tpu
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
 EOF
 ```
 
-Everything after the `--` is run on each worker.
+Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker.
 
 ```bash
 python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>
 ```
 
-`launch.py` will package your directory and create and deploy a Docker image  on each worker.
-
 ### Launch a GPT-2 Small in interactive mode
 
 To run in the foreground, use `--foreground` with the `launch.py` script. For long-running jobs in this mode, run it inside tmux or a similar terminal multiplexer. It's mostly for debugging.

diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
@@ -398,15 +398,25 @@ This will spin up a TPU VM instance and install Levanter on it. You can then run
 
 ```
 cat > .config <<EOF
+cat > .config <<EOF
 env:
-    WANDB_API_KEY:  ...
-    WANDB_ENTITY: ...
-    WANDB_PROJECT: levanter
-    HF_TOKEN: ...
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
 
 docker_repository: levanter
 zone: us-west4-a
-tpu: test-tpu
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
 EOF
 ```
 

diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
@@ -179,16 +179,30 @@ infra/babysit-tpu-vm my-tpu -z us-east1-d -t v3-128 -- \
 
 #### Spin up and manual launch
 
-You should probably use the automated setup script, as described in the [relevant section of the TPU guide](../Getting-Started-TPU-VM.md#automatic-setup).
-Here's what that looks like:
+You can start up a TPU VM and launch your instance with `launch.py`. To simplify your command for multiple launches, you can put common parameters into `.config` in your `levanter` directory:
+
+cat > .config <<EOF
+env:
+    WANDB_API_KEY: 
+    WANDB_ENTITY: 
+    WANDB_PROJECT: 
+    HF_TOKEN: 
+    TPU_STDERR_LOG_LEVEL: 0
+    TPU_MIN_LOG_LEVEL: 0
+    LIBTPU_INIT_ARGS: <extra args to libtpu>
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+EOF
 
 ```bash
-bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
-```
-
-This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
-```bash
 python infra/launch.py -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml
 ```
 

diff --git a/infra/launch.py b/infra/launch.py
@@ -10,6 +10,7 @@
 
 
 def setup_vm_docker(tpu_name, zone, docker_base_image):
+    """Change docker permissions on `tpu_name` and set up the cache volume."""
     cli.tpu_ssh(
         tpu_name,
         zone,
@@ -52,9 +53,7 @@ def list_tpus(zone):
     return tpus
 
 
-def start_tpu_vm(
-    tpu_name, *, tpu_type, preemptible, version, zone, autodelete, project, docker_repository, docker_base_image
-):
+def start_tpu_vm(tpu_name, *, tpu_type, preemptible, version, zone, autodelete):
     tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)])
     if tpu_exists:
         if not autodelete:
@@ -104,8 +103,8 @@ def start_tpu_vm(
     cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}")
     cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true")
     cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
-    cli.add_arg(parser, config, ["--tpu"], required=True)
-    cli.add_arg(parser, config, ["--tpu_type"])
+    cli.add_arg(parser, config, ["--tpu_name"], required=True)
+    cli.add_arg(parser, config, ["--tpu_type"], required=True)
     cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base")
     cli.add_arg(parser, config, ["--zone"], required=True)
     cli.add_arg(parser, config, ["--retries"], default=0, type=int)
@@ -129,7 +128,7 @@ def start_tpu_vm(
         retries = 10000000
     else:
         retries = args.retries
-    tpu_name = args.tpu
+    tpu_name = args.tpu_name
     tpu_type = args.tpu_type
     version = args.version
     zone = args.zone
@@ -152,11 +151,10 @@ def start_tpu_vm(
                 version=version,
                 zone=zone,
                 autodelete=autodelete,
-                project=project,
-                docker_repository=docker_repository,
-                docker_base_image=docker_base_image,
             )
 
+            # We don't technically need to set up on every run, but if we are working on a
+            # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work.
             setup_vm_docker(
                 tpu_name=tpu_name,
                 zone=zone,