diff --git a/.dockerignore b/.dockerignore index f5ceb7397..17fbbcfe1 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,5 @@ +.git + scratch cache wandb @@ -44,6 +46,7 @@ instance/ # Sphinx documentation docs/_build/ +docs/figures/ # PyBuilder target/ @@ -105,7 +108,6 @@ dmypy.json # JetBrains .idea/ - # dataset cache files **/*.parquet **/ledger.json diff --git a/.github/workflows/tpu_unit_tests.yaml b/.github/workflows/tpu_unit_tests.yaml index 3e27426eb..4f3eaccb3 100644 --- a/.github/workflows/tpu_unit_tests.yaml +++ b/.github/workflows/tpu_unit_tests.yaml @@ -31,14 +31,12 @@ jobs: export TPU_NAME=ci-run-${{ github.run_id }} eval "$(ssh-agent -s)" TRUE_SHA=${{ github.event.pull_request.head.sha }} - bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${TRUE_SHA} --retries 1 -# infra/babysit-tpu-vm.sh $TPU_NAME -z ${{ TPU_ZONE }} -t v4-8 --preemptible -s infra/helpers/setup-tpu-vm-tests.sh -b ${{ github.sha }} --retries 1 -- \ -# PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m "not entry" + bash infra/spin-up-vm.sh $TPU_NAME -z ${TPU_ZONE} -t v4-8 --preemptible --retries 1 - name: Run most tests run: | export TPU_NAME=ci-run-${{ github.run_id }} - gcloud compute tpus tpu-vm ssh $TPU_NAME --zone ${TPU_ZONE} --command "PYTHONPATH=$PYTHONPATH:levanter/tests bash levanter/infra/run.sh pytest levanter/tests -m 'not entry'" + python infra/launch.py --foreground --tpu=$TPU_NAME --zone=$TPU_ZONE -- /opt/levanter/.venv/bin/pytest tests -m "not entry" # Something's wrong with these # # - name: Run forked tests diff --git a/.gitignore b/.gitignore index c66f6f352..835da2048 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ /scratch +# Configuration for TPU launches/secrets +.config + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -140,6 +143,7 @@ dmypy.json /wandb # dataset cache files +/cache *.parquet ledger.json diff --git 
a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base new file mode 100644 index 000000000..9a93736b1 --- /dev/null +++ b/docker/tpu/Dockerfile.base @@ -0,0 +1,17 @@ +FROM python:3.10 AS build +RUN apt-get update && apt-get install -y clang +RUN pip install virtualenv + +# venv binaries encode their directory, so we need to setup the venv in the final location +RUN virtualenv -p python3.10 /opt/levanter/.venv +RUN /opt/levanter/.venv/bin/pip install -U "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html + +# Add only the requirements files to cache dependency build/installation +WORKDIR /tmp +ADD pyproject.toml README.md /tmp/ +RUN /opt/levanter/.venv/bin/pip install -e '.[test]' + +FROM python:3.10 + +WORKDIR /opt/levanter +COPY --from=build /opt/levanter/.venv /opt/levanter/.venv \ No newline at end of file diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental new file mode 100644 index 000000000..10afb1ca7 --- /dev/null +++ b/docker/tpu/Dockerfile.incremental @@ -0,0 +1,17 @@ +ARG IMAGE=ghcr.io/rjpower/levanter +ARG TAG=latest + +FROM ${IMAGE}:${TAG} + +ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\ + TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\ + RAY_USAGE_STATS_ENABLED=0\ + PATH=/opt/levanter/.venv/bin:$PATH\ + PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests\ + HOME=/home/levanter + +WORKDIR /opt/levanter + +ADD pyproject.toml README.md /opt/levanter/ +RUN pip install -e '.[test]' +ADD . /opt/levanter \ No newline at end of file diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md index fe73eef70..1d2a50945 100644 --- a/docs/Getting-Started-TPU-VM.md +++ b/docs/Getting-Started-TPU-VM.md @@ -85,18 +85,40 @@ the VM. 
That's explained down below in the [Running Levanter GPT-2](#running-lev ## Running Levanter GPT-2 Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts: -### Launch a GPT-2 Small in unattended mode (using nohup) +### Launch a GPT-2 Small in unattended mode + +You will need a [Docker installation](https://docs.docker.com/engine/install/) +on your development machine to build and run images on TPUs. + +First, create a configuration file for future launches in your Levanter directory: + +``` +cat > .config <' +python infra/launch.py -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` -`launch.sh` will run the command in the background and redirect stdout and stderr to a log file in the home directory -on each worker. +`launch.py` will package your directory and create and deploy a Docker image on each worker. ### Launch a GPT-2 Small in interactive mode -This version writes to the terminal, you should use tmux or something for long running jobs for this version. It's mostly for debugging. + +To run in the foreground, use `--foreground` with the `launch.py` script. You should use tmux or something for long-running jobs for this version. It's mostly for debugging. ```bash -gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' +python infra/launch.py --foreground -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://' ``` ### Babysitting Script @@ -113,11 +135,12 @@ You can run it like this: ```bash infra/babysit-tpu-vm -z -t [--preemptible] -- \ - WANDB_API_KEY=... 
levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml + python infra/launch.py --foreground -- python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml ``` -That `--` is important! It separates the spin up args from the running args. Also, you should never use `launch.sh` -with `babysit`, because nohup exits immediately with exit code 0. +That `--` is important! It separates the spin up args from the running args. +Also, you should always use `--foreground` with `babysit-tpu-vm`, as the +background mode will always return immediately. ### Running your own config @@ -132,7 +155,7 @@ Afterward, you can use the config directly from the TPU VM instance, e.g.: ```bash infra/babysit-tpu-vm -z -t [--preemptible] -- \ - WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \ + python infra/launch.py --foreground -- python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \ --trainer.checkpointer.base_path gs://path/to/checkpoints/ ``` diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md index c14b0ba66..cac96de31 100644 --- a/docs/Training-On-Your-Data.md +++ b/docs/Training-On-Your-Data.md @@ -395,8 +395,23 @@ bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128 This will spin up a TPU VM instance and install Levanter on it. 
You can then run a command like so: + +``` +cat > .config < str: + """Builds a linux/amd64 Docker image from `docker_file` via `docker buildx build`. + + The image is tagged `{image_name}:{tag}` and that local reference is returned; + nothing is pushed here. + """ + + _run( + [ + "docker", + "buildx", + "build", + "--platform=linux/amd64", + "-t", + f"{image_name}:{tag}", + "-f", + docker_file, + ".", + ] + ) + + return f"{image_name}:{tag}" + + +# Disabled until we can figure out how Docker hub organizations work +# NOTE(review): the __main__ block below still calls this function for --docker_target=github, +# and the registry it logs in and pushes to is ghcr.io, not Docker Hub -- confirm whether +# "disabled" still applies. +def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None): + """Builds a Docker image and pushes it to the GitHub Container Registry (ghcr.io). + + If `github_token` is given, first runs `docker login ghcr.io` as `github_user`. The image is + built from `docker_file`, tagged `ghcr.io/{github_user}/{local_image}:{tag}`, and pushed; + that remote name is returned. + """ + + # Authenticate the docker service with Github if a token exists + if github_token: + # The token is fed on stdin (--password-stdin); the login's exit status is not checked. + login_process = subprocess.Popen( + ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE + ) + print(login_process.communicate(input=github_token.encode(), timeout=10)) + + remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}" + local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag) + + _run(["docker", "tag", local_name, remote_name]) + _run(["docker", "push", remote_name]) + return remote_name + + +def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> str: + """Builds a Docker image and pushes it to Artifact Registry. + + Calls `configure_gcp_docker(project_id, region, repository)` first, then builds the image from + `docker_file`, tags it `{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{tag}`, + pushes it, and returns that full name. + """ + configure_gcp_docker(project_id, region, repository) + local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag) + + artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}" + + full_image_name = f"{artifact_repo}/{image_name}:{tag}" + _run(["docker", "tag", local_image, full_image_name]) + _run(["docker", "push", full_image_name]) + + # Same string as full_image_name above. + return f"{region}-docker.pkg.dev/{project_id}/{repository}/{image_name}:{tag}" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Build and push Docker image to Artifact Registry.") + config = cli.load_config() + cli.add_arg(parser, config, ["--project"], help="GCP project ID") + cli.add_arg(parser, config, ["--region"], 
help="Artifact Registry region (e.g., us-west4)") + cli.add_arg(parser, config, ["--repository"], default="levanter", help="Artifact Registry repository name") + cli.add_arg(parser, config, ["--image"], default="levanter", help="Docker image name.") + cli.add_arg(parser, config, ["--tag"], default="latest", help="Docker image tag.") + cli.add_arg(parser, config, ["--github_user"], default=None, help="Github user name.") + cli.add_arg(parser, config, ["--github_token"], default=None, help="Github token.") + cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.") + + # push to either github or GCP + cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True) + + args = parser.parse_args() + + # NOTE(review): the asserts in both branches below are stripped when Python runs with -O; + # parser.error() would validate unconditionally. + if args.docker_target == "github": + assert args.github_user, "Must specify --github_user when pushing to Github" + assert args.github_token, "Must specify --github_token when pushing to Github" + push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file) + else: + assert args.region, "Must specify --region when pushing to GCP" + assert args.project, "Must specify --project when pushing to GCP" + assert args.repository, "Must specify --repository when pushing to GCP" + + push_to_gcp( + args.project, + args.region, + args.repository, + args.image, + args.tag, + docker_file=args.docker_file, + ) diff --git a/pyproject.toml b/pyproject.toml index f17a26791..76bdd8864 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,8 +67,6 @@ dev-mode-dirs = [".", "src"] [tool.hatch.metadata] allow-direct-references = true - - [tool.hatch.build.targets.wheel] packages = ["levanter"] @@ -109,3 +107,13 @@ markers = [ "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')", "ray: marks tests that require Ray (deselect with '-m \"not ray\"')", ] + +[project.optional-dependencies] +test = [ + "pytest", + "flake8", + "pytest", + "soundfile", + 
"librosa", + "pytest-forked" +] \ No newline at end of file diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py index 9d41e935a..c98c0727c 100644 --- a/src/levanter/tracker/wandb.py +++ b/src/levanter/tracker/wandb.py @@ -208,6 +208,9 @@ def _git_settings(self): return other_settings def _get_git_sha(self, code_dir) -> Optional[str]: + if "GIT_COMMIT" in os.environ: + return os.environ["GIT_COMMIT"] + try: repo = Repo(code_dir) git_sha = repo.head.commit.hexsha