From 3b3173c8f846d387ba85182a1ca67020b07906ff Mon Sep 17 00:00:00 2001 From: Russell Power Date: Fri, 24 May 2024 22:11:37 -0700 Subject: [PATCH 1/7] Setup Docker for TPU execution and update infra scripts. I tried to optimize the Docker image size a bit using a staged build, as Ray currently requires a source build of Meson, which requires a Clang installation... even with this jax & libtpu are each themselves >250MB installs, so there's no avoiding a large image size at the moment. Still, with this configuration, a v5-32 (the most I could get given GCPs stingy IP address allocation) takes about 50 seconds to run setup-vm.sh and pull the initial image. After the initial pull, new deployments take a few seconds to package up the current source directory. It's still possible to use the `git clone` approach via a volume mount, but the permissions are a bit finicky at that point, and I'm not sure how many options we want to have. --- .dockerignore | 4 +- .github/workflows/tpu_unit_tests.yaml | 6 +- .gitignore | 4 + docker/tpu/Dockerfile.base | 17 ++ docker/tpu/Dockerfile.incremental | 17 ++ docs/Getting-Started-TPU-VM.md | 43 +++-- docs/Training-On-Your-Data.md | 17 +- docs/tutorials/Training-On-Audio-Data.md | 2 +- infra/__init__.py | 0 infra/helpers/cli.py | 80 +++++++++ infra/launch.py | 211 +++++++++++++++++++++++ infra/push_docker.py | 210 ++++++++++++++++++++++ pyproject.toml | 12 +- src/levanter/tracker/wandb.py | 3 + 14 files changed, 607 insertions(+), 19 deletions(-) create mode 100644 docker/tpu/Dockerfile.base create mode 100644 docker/tpu/Dockerfile.incremental create mode 100644 infra/__init__.py create mode 100644 infra/helpers/cli.py create mode 100755 infra/launch.py create mode 100644 infra/push_docker.py diff --git a/.dockerignore b/.dockerignore index f5ceb7397..17fbbcfe1 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,5 @@ +.git + scratch cache wandb @@ -44,6 +46,7 @@ instance/ # Sphinx documentation docs/_build/ +docs/figures/ # 
PyBuilder target/ @@ -105,7 +108,6 @@ dmypy.json # JetBrains .idea/ - # dataset cache files **/*.parquet **/ledger.json diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml index 3e27426eb..4f3eaccb3 100644 --- a/.github/workflows/tpu_unit_tests.yaml +++ b/.github/workflows/tpu_unit_tests.yaml @@ -31,14 +31,12 @@ jobs: export TPU_NAME=ci-run-${{ github.run_id }} eval "$(ssh-agent -s)" TRUE_SHA=${{ github.event.pull_request.head.sha }} - bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1 -# infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \ -# PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry" + bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1 - name: Run most tests run: | export TPU_NAME=ci-run-${{ github.run_id }} - gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'" + python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry" # Something's wrong with these # # - name: Run forked tests diff --git a/.gitignore b/.gitignore index c66f6f352..835da2048 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ /scratch +# Configuration for TPU launches/secrets +.config + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -140,6 +143,7 @@ dmypy.json /wandb # dataset cache files +/cache *.parquet ledger.json diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base new file mode 100644 index 000000000..9a93736b1 --- /dev/null +++ b/docker/tpu/Dockerfile.base @@ -0,0 +1,17 @@ +FROM python:3.10 AS build +RUN apt-get update && apt-get install -y clang +RUN pip 
install virtualenv + +# venv binaries encode their directory, so we need to setup the venv in the final location +RUN virtualenv -p python3.10 /opt/levanter/.venv +RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html + +# Add only the requirements files to cache dependency build/installation +WORKDIR /tmp +ADD pyproject.toml README.md /tmp/ +RUN /opt/levanter/.venv/bin/pip install -e '.[test]' + +FROM python:3.10 + +WORKDIR /opt/levanter +COPY --from=build /opt/levanter/.venv /opt/levanter/.venv \ No newline at end of file diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental new file mode 100644 index 000000000..10afb1ca7 --- /dev/null +++ b/docker/tpu/Dockerfile.incremental @@ -0,0 +1,17 @@ +ARG IMAGE=ghcr.io/rjpower/levanter +ARG TAG=latest + +FROM ${IMAGE}:${TAG} + +ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ + TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\ + RAY_USAGE_STATS_ENABLED=0\ + PATH=/opt/levanter/.venv/bin:$PATH\ + PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests\ + HOME=/home/levanter + +WORKDIR /opt/levanter + +ADD pyproject.toml README.md /opt/levanter/ +RUN pip install -e '.[test]' +ADD . /opt/levanter \ No newline at end of file diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index fe73eef70..1d2a50945 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -85,18 +85,40 @@ the VM. 
That's explained down below in the [Running Levanter GPT-2](#running-lev ## Running Levanter GPT-2 Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts: -### Launch a GPT-2 Small in unattended mode (using nohup) +### Launch a GPT-2 Small in unattended mode + +You will need a [Docker installation](https://docs.docker.com/engine/install/) +on your development machine to build and run images on TPUs. + +First create a configuration file for future launches in your Levanter directory: + +``` +cat > .config <' +python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` -`launch.sh` will run the command in the background and redirect stdout and stderr to a log file in the home directory -on each worker. +`launch.py` will package your directory and create and deploy a Docker image on each worker. ### Launch a GPT-2 Small in interactive mode -This version writes to the terminal, you should use tmux or something for long running jobs for this version. It's mostly for debugging. + +To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long running jobs for this version. It's mostly for debugging. ```bash -gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' +python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` ### Babysitting Script @@ -113,11 +135,12 @@ You can run it like this: ```bash infra/babysit-tpu-vm -z -t [--preemptible] -- \ - WANDB_API_KEY=... 
levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml + python infra/launch.py -- levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml ``` -That `--` is important! It separates the spin up args from the running args. Also, you should never use `launch.sh` -with `babysit`, because nohup exits immediately with exit code 0. +That `--` is important! It separates the spin up args from the running args. +Also you should always use `--foreground` with `babysit-tpu-vm`, as the +background mode will always return immediately. ### Running your own config @@ -132,7 +155,7 @@ Afterward, you can use the config directly from the TPU VM instance, e.g.: ```bash infra/babysit-tpu-vm -z -t [--preemptible] -- \ - WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \ + python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \ --trainer.checkpointer.base_path gs://path/to/checkpoints/ ``` diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md index c14b0ba66..cac96de31 100644 --- a/docs/Training-On-Your-Data.md +++ b/docs/Training-On-Your-Data.md @@ -395,8 +395,23 @@ bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128 This will spin up a TPU VM instance and install Levanter on it. 
You can then run a command like so: + +``` +cat > .config < str: + """Builds a Docker image, enables artifact access, and pushes to Artifact Registry.""" + + _run( + [ + "docker", + "buildx", + "build", + "--platform=linux/amd64", + "-t", + f"{image_name}:{tag}", + "-f", + docker_file, + ".", + ] + ) + + return f"{image_name}:{tag}" + + +# Disabled until we can figure out how Docker hub organizations work +def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None): + """Pushes a local Docker image to Docker Hub.""" + + # Authenticate the docker service with Github if a token exists + if github_token: + login_process = subprocess.Popen( + ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE + ) + print(login_process.communicate(input=github_token.encode(), timeout=10)) + + remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}" + local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag) + + _run(["docker", "tag", local_name, remote_name]) + _run(["docker", "push", remote_name]) + return remote_name + + +def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> str: + """Pushes a local Docker image to Artifact Registry.""" + configure_gcp_docker(project_id, region, repository) + local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag) + + artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" + + full_image_name = f"{artifact_repo}/{image_name}:{tag}" + _run(["docker", "tag", local_image, full_image_name]) + _run(["docker", "push", full_image_name]) + + return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{tag}" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.") + config = cli.load_config() + cli.add_arg(parser, config, ["--project"], help="GCP project ID") + cli.add_arg(parser, config, ["--region"], 
help="Artifact Registry region (e.g., us-west4)") + cli.add_arg(parser, config, ["--repository"], default="levanter", help="Artifact Registry repository name") + cli.add_arg(parser, config, ["--image"], default="levanter", help="Docker image name.") + cli.add_arg(parser, config, ["--tag"], default="latest", help="Docker image tag.") + cli.add_arg(parser, config, ["--github_user"], default=None, help="Github user name.") + cli.add_arg(parser, config, ["--github_token"], default=None, help="Github token.") + cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.") + + # push to either github or GCP + cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True) + + args = parser.parse_args() + + if args.docker_target == "github": + assert args.github_user, "Must specify --github_user when pushing to Github" + assert args.github_token, "Must specify --github_token when pushing to Github" + push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file) + else: + assert args.region, "Must specify --region when pushing to GCP" + assert args.project, "Must specify --project when pushing to GCP" + assert args.repository, "Must specify --repository when pushing to GCP" + + push_to_gcp( + args.project, + args.region, + args.repository, + args.image, + args.tag, + docker_file=args.docker_file, + ) diff --git a/pyproject.toml b/pyproject.toml index 60e44e15b..527f88f7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,8 +67,6 @@ dev-mode-dirs = [".", "src"] [tool.hatch.metadata] allow-direct-references = true - - [tool.hatch.build.targets.wheel] packages = ["levanter"] @@ -109,3 +107,13 @@ markers = [ "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')", "ray: marks tests that require Ray (deselect with '-m \"not ray\"')", ] + +[project.optional-dependencies] +test = [ + "pytest", + "flake8", + "pytest", + "soundfile", + 
"librosa", + "pytest-forked" +] \ No newline at end of file diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py index 9d41e935a..c98c0727c 100644 --- a/src/levanter/tracker/wandb.py +++ b/src/levanter/tracker/wandb.py @@ -208,6 +208,9 @@ def _git_settings(self): return other_settings def _get_git_sha(self, code_dir) -> Optional[str]: + if "GIT_COMMIT" in os.environ: + return os.environ["GIT_COMMIT"] + try: repo = Repo(code_dir) git_sha = repo.head.commit.hexsha From d4a1d4fb9d9fb0b4cd7bd8004ea38955883e6e1b Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 11 Jun 2024 00:02:38 -0700 Subject: [PATCH 2/7] misc dumb fixes --- infra/launch.py | 13 ++++++++----- infra/push_docker.py | 5 ++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/infra/launch.py b/infra/launch.py index 6cd3c1e9e..6b465f700 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -6,7 +6,6 @@ import time from infra import push_docker - from infra.helpers import cli @@ -41,6 +40,7 @@ def list_tpus(zone): "tpus", "tpu-vm", "list", + "--zone=" + zone, ] ) rows = tpus.decode("utf-8").split("\n") @@ -67,6 +67,7 @@ def start_tpu_vm( "alpha", "compute", "tpus", + "tpu-vm", "delete", "--quiet", f"--zone={zone}", @@ -74,7 +75,7 @@ def start_tpu_vm( ) print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...") - cli.run_command( + command = [ "gcloud", "alpha", "compute", @@ -85,9 +86,11 @@ def start_tpu_vm( f"--accelerator-type={tpu_type}", f"--version={version}", "--zone=" + zone, - "--preemptible" if preemptible else "", "--quiet", - ) + ] + if preemptible: + command.append("--preemptible") + cli.run_command(*command) if __name__ == "__main__": @@ -205,7 +208,7 @@ def start_tpu_vm( print(f"Running on tpu_name... {tpu_name}") cli.tpu_ssh(tpu_name, zone, *docker_command) - except subprocess.CalledProcessError as e: + except subprocess.CalledProcessError as e: # noqa: F841 print("Error running command.") if i < retries - 1: print("Retrying... 
%d/%d" % (i + 1, retries)) diff --git a/infra/push_docker.py b/infra/push_docker.py index b16b23ac2..a85b64f18 100644 --- a/infra/push_docker.py +++ b/infra/push_docker.py @@ -8,13 +8,12 @@ """ import argparse -from calendar import c import json -import os import subprocess from infra.helpers import cli + GCP_CLEANUP_POLICY = [ { "name": "delete-stale", @@ -46,7 +45,7 @@ def configure_gcp_docker(project_id, region, repository): _run(["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository]) return except subprocess.CalledProcessError as e: - if "NOT_FOUND" not in e.output: + if b"NOT_FOUND" not in e.stderr: raise # Activate artifact registry and setup the repository. From bc861c07e5efe69c9491382d8ef752409bfb0688 Mon Sep 17 00:00:00 2001 From: David Hall Date: Tue, 11 Jun 2024 00:12:05 -0700 Subject: [PATCH 3/7] there we go --- infra/push_docker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/infra/push_docker.py b/infra/push_docker.py index a85b64f18..e3e416290 100644 --- a/infra/push_docker.py +++ b/infra/push_docker.py @@ -42,10 +42,13 @@ def configure_gcp_docker(project_id, region, repository): """Setup Artifact registry repository and configure permissions to enable TPU access.""" # check if the repository already exists try: - _run(["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository]) + _run( + ["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository], + stderr=subprocess.STDOUT, + ) return except subprocess.CalledProcessError as e: - if b"NOT_FOUND" not in e.stderr: + if b"NOT_FOUND" not in e.output: raise # Activate artifact registry and setup the repository. From 20999304c3975c61db7e68cfd910214f9825a499 Mon Sep 17 00:00:00 2001 From: Russell Power Date: Tue, 11 Jun 2024 08:27:13 -0700 Subject: [PATCH 4/7] Adjust docs to reflect new config format and cleanup a few flags. 
--- .github/workflows/tpu_unit_tests.yaml | 6 ++- docs/Getting-Started-TPU-VM.md | 54 ++++++++++++------------ docs/Training-On-Your-Data.md | 20 ++++++--- docs/tutorials/Training-On-Audio-Data.md | 28 +++++++++--- infra/launch.py | 20 ++++----- 5 files changed, 76 insertions(+), 52 deletions(-) diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml index 4f3eaccb3..3e27426eb 100644 --- a/.github/workflows/tpu_unit_tests.yaml +++ b/.github/workflows/tpu_unit_tests.yaml @@ -31,12 +31,14 @@ jobs: export TPU_NAME=ci-run-${{ github.run_id }} eval "$(ssh-agent -s)" TRUE_SHA=${{ github.event.pull_request.head.sha }} - bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1 + bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1 +# infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \ +# PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry" - name: Run most tests run: | export TPU_NAME=ci-run-${{ github.run_id }} - python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry" + gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'" # Something's wrong with these # # - name: Run forked tests diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index 1d2a50945..8fc8f98ee 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -95,25 +95,32 @@ First create a configuration file for future launches in your Levanter directory ``` cat > .config < docker_repository: levanter zone: us-west4-a -tpu: test-tpu +tpu_name: test-spin-up-32 +tpu_type: "v5litepod-16" 
+vm_image: "tpu-ubuntu2204-base" +preemptible: true +autodelete: false +subnetwork: "default" + EOF ``` -Everything after the `--` is run on each worker. +Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker. ```bash python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` -`launch.py` will package your directory and create and deploy a Docker image on each worker. - ### Launch a GPT-2 Small in interactive mode To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long running jobs for this version. It's mostly for debugging. @@ -124,18 +131,11 @@ python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config ### Babysitting Script If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates -the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. The babysitting -script handles both the creation of the node and the running of a job, and also relaunches the TPU VM if it gets preempted. -It keeps running the command (and relaunching) until the command exits successfully. - -Note that the babysitting-script will automatically set the `RUN_ID` environment variable if not set, and pass it to the -training command. This ensures that restarted jobs have the same run id, which is important for resumes to work. - -You can run it like this: +the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. You can run `launch.py` with the `--retries` and `--foreground` parameter to accomplish this. If `--retries` is greater than 1, `launch.py` will automatically attempt to re-create the VM and re-run the command if it fails. 
(`--foreground` is necessary to keep the script from returning immediately.) ```bash -infra/babysit-tpu-vm -z -t [--preemptible] -- \ - python infra/launch.py -- levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml + python infra/launch.py --retries=100 --foreground --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \ + --trainer.checkpointer.base_path gs://path/to/checkpoints/ ``` That `--` is important! It separates the spin up args from the running args. @@ -144,28 +144,26 @@ background mode will always return immediately. ### Running your own config -If you want to run your own config, we suggest you start from one of the existing configs. Then, if you're not using -an NFS server or similar, you should upload your config to GCS: +If you want to run your own config, we suggest you start from one of the existing configs. Just copy it to +a new file: + +`cp config/gpt2_small.yaml config/my_config.yaml` + +If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you +can just reference the local config path in your command line: -```bash -gsutil cp my_config.yaml gs://my_bucket//my_config.yaml ``` Afterward, you can use the config directly from the TPU VM instance, e.g.: ```bash -infra/babysit-tpu-vm -z -t [--preemptible] -- \ - python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \ + python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \ --trainer.checkpointer.base_path gs://path/to/checkpoints/ ``` -The `--config_path` argument can be a local path, a GCS path, or any URL loadable by fsspec. With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically try to load the latest checkpoint if it exists. 
-Tokenizers are also loaded via fsspec, so you can use the same trick to load them from GCS if you have a custom -tokenizer, or you can use an HF tokenizer. - ## Common Issues ### (CRFM) Permission denied on `/files` diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md index cac96de31..11aba38b8 100644 --- a/docs/Training-On-Your-Data.md +++ b/docs/Training-On-Your-Data.md @@ -398,15 +398,25 @@ This will spin up a TPU VM instance and install Levanter on it. You can then run ``` cat > .config < .config < docker_repository: levanter zone: us-west4-a -tpu: test-tpu +tpu_name: test-spin-up-32 +tpu_type: "v5litepod-16" +vm_image: "tpu-ubuntu2204-base" +preemptible: true +autodelete: false +subnetwork: "default" + EOF ``` diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md index bdab91c43..fab6e7d4f 100644 --- a/docs/tutorials/Training-On-Audio-Data.md +++ b/docs/tutorials/Training-On-Audio-Data.md @@ -179,16 +179,30 @@ infra/babysit-tpu-vm my-tpu -z us-east1-d -t v3-128 -- \ #### Spin up and manual launch -You should probably use the automated setup script, as described in the [relevant section of the TPU guide](../Getting-Started-TPU-VM.md#automatic-setup). -Here's what that looks like: +You can start up a TPU VM and launch your instance with `launch.py`. To simplify your command for multiple launches, you can put common parameters into `.config` in your `levanter` directory: + +cat > .config < + +docker_repository: levanter +zone: us-west4-a +tpu_name: test-spin-up-32 +tpu_type: "v5litepod-16" +vm_image: "tpu-ubuntu2204-base" +preemptible: true +autodelete: false +subnetwork: "default" +EOF ```bash -bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128 -``` - -This will spin up a TPU VM instance and install Levanter on it. 
You can then run a command like so: -```bash python infra/launch.py -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml" ``` diff --git a/infra/launch.py b/infra/launch.py index 6b465f700..3eae2b23a 100755 --- a/infra/launch.py +++ b/infra/launch.py @@ -10,6 +10,7 @@ def setup_vm_docker(tpu_name, zone, docker_base_image): + """Change docker permissions on `tpu_name` and setup the cache volume.""" cli.tpu_ssh( tpu_name, zone, @@ -52,9 +53,7 @@ def list_tpus(zone): return tpus -def start_tpu_vm( - tpu_name, *, tpu_type, preemptible, version, zone, autodelete, project, docker_repository, docker_base_image -): +def start_tpu_vm(tpu_name, *, tpu_type, preemptible, version, zone, autodelete): tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)]) if tpu_exists: if not autodelete: @@ -104,11 +103,12 @@ def start_tpu_vm( cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}") cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true") cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"]) - cli.add_arg(parser, config, ["--tpu"], required=True) - cli.add_arg(parser, config, ["--tpu_type"]) + cli.add_arg(parser, config, ["--tpu_name"], required=True) + cli.add_arg(parser, config, ["--tpu_type"], required=True) cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base") cli.add_arg(parser, config, ["--zone"], required=True) cli.add_arg(parser, config, ["--retries"], default=0, type=int) + cli.add_arg(parser, config, ["--run_id"], default=int(time.time()), type=int) parser.add_argument( "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=config.get("env", {}).items() @@ -129,10 +129,11 @@ def start_tpu_vm( retries = 10000000 else: retries = args.retries - tpu_name = args.tpu + tpu_name = args.tpu_name tpu_type = args.tpu_type version = args.version zone = args.zone + run_id = args.run_id region = 
"-".join(zone.split("-")[:-1]) env = {k: v for k, v in args.env} @@ -152,11 +153,10 @@ def start_tpu_vm( version=version, zone=zone, autodelete=autodelete, - project=project, - docker_repository=docker_repository, - docker_base_image=docker_base_image, ) + # We don't technically need to setup on every run, but if we are working on a + # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work. setup_vm_docker( tpu_name=tpu_name, zone=zone, @@ -164,7 +164,7 @@ def start_tpu_vm( ) # make an image tag based on the unix timestamp to ensure we always pull the latest image - tag = run_id = int(time.time()) + tag = int(time.time()) full_image_id = push_docker.push_to_gcp( project_id=project, From 1929ac274055563f6400e87ef4c09af9dfb8d2f7 Mon Sep 17 00:00:00 2001 From: Russell Power Date: Tue, 11 Jun 2024 09:59:39 -0700 Subject: [PATCH 5/7] Tiny doc cleanups. --- docker/tpu/Dockerfile.incremental | 1 + docs/Training-On-Your-Data.md | 3 +-- docs/tutorials/Training-On-Audio-Data.md | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index 10afb1ca7..8a6c8f5c8 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -14,4 +14,5 @@ WORKDIR /opt/levanter ADD pyproject.toml README.md /opt/levanter/ RUN pip install -e '.[test]' +RUN pip install librosa soundfile ADD . 
/opt/levanter \ No newline at end of file diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md index 11aba38b8..7361d1d7f 100644 --- a/docs/Training-On-Your-Data.md +++ b/docs/Training-On-Your-Data.md @@ -410,7 +410,6 @@ env: docker_repository: levanter zone: us-west4-a -tpu_name: test-spin-up-32 tpu_type: "v5litepod-16" vm_image: "tpu-ubuntu2204-base" preemptible: true @@ -421,7 +420,7 @@ EOF ``` ```bash -python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml" +python infra/launch.py --tpu_name=my_tpu -- python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml" ``` ## Monitoring diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md index fab6e7d4f..f57b9a06f 100644 --- a/docs/tutorials/Training-On-Audio-Data.md +++ b/docs/tutorials/Training-On-Audio-Data.md @@ -193,7 +193,6 @@ env: docker_repository: levanter zone: us-west4-a -tpu_name: test-spin-up-32 tpu_type: "v5litepod-16" vm_image: "tpu-ubuntu2204-base" preemptible: true @@ -203,7 +202,7 @@ EOF ```bash -python infra/launch.py -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml" +python infra/launch.py --tpu_name=my_tpu -- python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml" ``` ### GPU From 22d1a64100dcea425fe6d9c242d19832622790ae Mon Sep 17 00:00:00 2001 From: Russell Power Date: Tue, 11 Jun 2024 14:51:38 -0700 Subject: [PATCH 6/7] Add back tokenizer documentation. 
--- docs/Getting-Started-TPU-VM.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index 8fc8f98ee..6f92d3f38 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -164,6 +164,10 @@ Afterward, you can use the config directly from the TPU VM instance, e.g.: With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically try to load the latest checkpoint if it exists. +Tokenizers and configuration files are loaded via `fsspec` which supports remote +filesystems, so you can also copy your tokenizer or config file to GCS and use +a `gs://` path to access it. + ## Common Issues ### (CRFM) Permission denied on `/files` From 911fbbc5796d4f1313bdf02a32f56e12c032714c Mon Sep 17 00:00:00 2001 From: Russell Power Date: Tue, 11 Jun 2024 15:08:57 -0700 Subject: [PATCH 7/7] Fix doc typo, cleanup base dependency installation. --- docker/tpu/Dockerfile.base | 9 +++++---- docker/tpu/Dockerfile.incremental | 1 - docs/Training-On-Your-Data.md | 1 - infra/helpers/cli.py | 1 - infra/launch.py | 8 -------- infra/push_docker.py | 1 + 6 files changed, 6 insertions(+), 15 deletions(-) diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base index 9a93736b1..b9b6106ab 100644 --- a/docker/tpu/Dockerfile.base +++ b/docker/tpu/Dockerfile.base @@ -4,12 +4,13 @@ RUN pip install virtualenv # venv binaries encode their directory, so we need to setup the venv in the final location RUN virtualenv -p python3.10 /opt/levanter/.venv -RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html +ENV PATH /opt/levanter/.venv/bin:$PATH +RUN /opt/levanter/.venv/bin/pip install -U hatch "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html -# Add only the requirements files to cache dependency build/installation -WORKDIR /tmp +# Install package dependencies to 
make incremental builds faster. +WORKDIR /tmp/ ADD pyproject.toml README.md /tmp/ -RUN /opt/levanter/.venv/bin/pip install -e '.[test]' +RUN pip install $(hatch dep show requirements --all) FROM python:3.10 diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental index 8a6c8f5c8..10afb1ca7 100644 --- a/docker/tpu/Dockerfile.incremental +++ b/docker/tpu/Dockerfile.incremental @@ -14,5 +14,4 @@ WORKDIR /opt/levanter ADD pyproject.toml README.md /opt/levanter/ RUN pip install -e '.[test]' -RUN pip install librosa soundfile ADD . /opt/levanter \ No newline at end of file diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md index 7361d1d7f..9879306dc 100644 --- a/docs/Training-On-Your-Data.md +++ b/docs/Training-On-Your-Data.md @@ -398,7 +398,6 @@ This will spin up a TPU VM instance and install Levanter on it. You can then run ``` cat > .config < .config <