diff --git a/.dockerignore b/.dockerignore
index f5ceb7397..17fbbcfe1 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,3 +1,5 @@
+.git
+
 scratch
 cache
 wandb
@@ -44,6 +46,7 @@ instance/
 # Sphinx documentation
 docs/_build/
+docs/figures/
 
 # PyBuilder
 target/
@@ -105,7 +108,6 @@ dmypy.json
 # JetBrains
 .idea/
-
 # dataset cache files
 **/*.parquet
 **/ledger.json
diff --git a/.gitignore b/.gitignore
index c66f6f352..835da2048 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 /scratch
 
+# Configuration for TPU launches/secrets
+.config
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -140,6 +143,7 @@ dmypy.json
 /wandb
 
 # dataset cache files
+/cache
 *.parquet
 ledger.json
diff --git a/docker/tpu/Dockerfile.base b/docker/tpu/Dockerfile.base
new file mode 100644
index 000000000..b9b6106ab
--- /dev/null
+++ b/docker/tpu/Dockerfile.base
@@ -0,0 +1,18 @@
+FROM python:3.10 AS build
+RUN apt-get update && apt-get install -y clang
+RUN pip install virtualenv
+
+# venv binaries encode their directory, so we need to set up the venv in its final location
+RUN virtualenv -p python3.10 /opt/levanter/.venv
+ENV PATH /opt/levanter/.venv/bin:$PATH
+RUN /opt/levanter/.venv/bin/pip install -U hatch "jax[tpu]==0.4.26" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+
+# Install package dependencies to make incremental builds faster.
+WORKDIR /tmp/
+ADD pyproject.toml README.md /tmp/
+RUN pip install $(hatch dep show requirements --all)
+
+FROM python:3.10
+
+WORKDIR /opt/levanter
+COPY --from=build /opt/levanter/.venv /opt/levanter/.venv
\ No newline at end of file
diff --git a/docker/tpu/Dockerfile.incremental b/docker/tpu/Dockerfile.incremental
new file mode 100644
index 000000000..10afb1ca7
--- /dev/null
+++ b/docker/tpu/Dockerfile.incremental
@@ -0,0 +1,17 @@
+ARG IMAGE=ghcr.io/rjpower/levanter
+ARG TAG=latest
+
+FROM ${IMAGE}:${TAG}
+
+ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
+    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
+    RAY_USAGE_STATS_ENABLED=0\
+    PATH=/opt/levanter/.venv/bin:$PATH\
+    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests\
+    HOME=/home/levanter
+
+WORKDIR /opt/levanter
+
+ADD pyproject.toml README.md /opt/levanter/
+RUN pip install -e '.[test]'
+ADD . /opt/levanter
\ No newline at end of file
diff --git a/docs/Getting-Started-TPU-VM.md b/docs/Getting-Started-TPU-VM.md
index fe73eef70..6f92d3f38 100644
--- a/docs/Getting-Started-TPU-VM.md
+++ b/docs/Getting-Started-TPU-VM.md
@@ -85,63 +85,88 @@ the VM. That's explained down below in the [Running Levanter GPT-2](#running-lev
 
 ## Running Levanter GPT-2
 Now that you have a TPU VM instance, you can follow the [Getting Started](Getting-Started-Training.md) steps, but here are a few shortcuts:
 
-### Launch a GPT-2 Small in unattended mode (using nohup)
-```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://'
+### Launch a GPT-2 Small in unattended mode
+
+You will need a [Docker installation](https://docs.docker.com/engine/install/)
+on your development machine to build and run images on TPUs.
+
+First, create a configuration file for future launches in your Levanter directory:
+
 ```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:
+    WANDB_ENTITY:
+    WANDB_PROJECT:
+    HF_TOKEN:
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_name: test-spin-up-32
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
+EOF
+```
+
+Values in `.config` are defaults: a matching environment variable or an explicit `launch.py` flag overrides them.
+
+Now run `launch.py`. This will package your current directory into a Docker image and run it on your workers. Everything after the `--` is run on each worker.
 
-`launch.sh` will run the command in the background and redirect stdout and stderr to a log file in the home directory
-on each worker.
+```bash
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://
+```
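+
+You can also pass extra environment variables to the training command with `-e KEY VALUE`;
+these are merged with the `env` section of `.config`, with the command line taking
+precedence. For example (a sketch; it assumes you've exported `WANDB_API_KEY` locally):
+
+```bash
+python infra/launch.py -e WANDB_API_KEY $WANDB_API_KEY -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml
+```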
 
 ### Launch a GPT-2 Small in interactive mode
-This version writes to the terminal, you should use tmux or something for long running jobs for this version. It's mostly for debugging.
+
+To run in the foreground, pass `--foreground` to the `launch.py` script. You should use tmux or a similar tool for long-running jobs in this mode; it's mostly for debugging.
 
 ```bash
-gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml --trainer.checkpointer.base_path gs://'
+python infra/launch.py --foreground -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://
 ```
 
 ### Babysitting Script
 
 If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates
-the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. The babysitting
-script handles both the creation of the node and the running of a job, and also relaunches the TPU VM if it gets preempted.
-It keeps running the command (and relaunching) until the command exits successfully.
-
-Note that the babysitting-script will automatically set the `RUN_ID` environment variable if not set, and pass it to the
-training command. This ensures that restarted jobs have the same run id, which is important for resumes to work.
-
-You can run it like this:
+the VM. This is because preemptible instances can be preempted and will always be killed every 24 hours. You can run
+`launch.py` with the `--retries` and `--foreground` parameters to accomplish this. If `--retries` is greater than 0,
+`launch.py` will automatically attempt to re-create the VM and re-run the command if it fails. (`--foreground` is
+necessary to keep the script from returning immediately.)
 
 ```bash
-infra/babysit-tpu-vm -z <zone> -t <type> [--preemptible] -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path levanter/config/gpt2_small.yaml
+python infra/launch.py --retries=100 --foreground --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
+    --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
-That `--` is important! It separates the spin up args from the running args. Also, you should never use `launch.sh`
-with `babysit`, because nohup exits immediately with exit code 0.
+That `--` is important! It separates the spin-up args from the training command.
+Also, you should always combine `--retries` with `--foreground`: without it,
+`launch.py` starts the container in the background and returns immediately, so
+there is nothing to monitor or retry.
 ### Running your own config
 
-If you want to run your own config, we suggest you start from one of the existing configs. Then, if you're not using
-an NFS server or similar, you should upload your config to GCS:
+If you want to run your own config, we suggest you start from one of the existing configs. Just copy it to
+a new file:
+
+`cp config/gpt2_small.yaml config/my_config.yaml`
+
+If you're using `launch.py`, the config will be automatically uploaded as part of your Docker image, so you
+can just reference the local config path on your command line.
 
-```bash
-gsutil cp my_config.yaml gs://my_bucket//my_config.yaml
-```
 
 Afterward, you can use the config directly from the TPU VM instance, e.g.:
 
 ```bash
-infra/babysit-tpu-vm -z <zone> -t <type> [--preemptible] -- \
-    WANDB_API_KEY=... levanter/infra/run.sh python levanter/src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml \
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/my_config.yaml \
     --trainer.checkpointer.base_path gs://path/to/checkpoints/
 ```
 
-The `--config_path` argument can be a local path, a GCS path, or any URL loadable by fsspec.
 With this configuration (unless `trainer.load_checkpoint` is false), Levanter will automatically try to load the latest checkpoint if it exists.
 
-Tokenizers are also loaded via fsspec, so you can use the same trick to load them from GCS if you have a custom
-tokenizer, or you can use an HF tokenizer.
+Tokenizers and configuration files are loaded via `fsspec`, which supports remote
+filesystems, so you can also copy your tokenizer or config file to GCS and use
+a `gs://` path to access it.
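+
+For example, a config stored on GCS (a sketch; the bucket path is a placeholder) can be referenced directly:
+
+```bash
+python infra/launch.py -- python src/levanter/main/train_lm.py --config_path gs://my_bucket/my_config.yaml
+```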
 
 ## Common Issues
 ### (CRFM) Permission denied on `/files`
diff --git a/docs/Training-On-Your-Data.md b/docs/Training-On-Your-Data.md
index c14b0ba66..9879306dc 100644
--- a/docs/Training-On-Your-Data.md
+++ b/docs/Training-On-Your-Data.md
@@ -395,8 +395,31 @@
 bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
 ```
 
 This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
 
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:
+    WANDB_ENTITY:
+    WANDB_PROJECT:
+    HF_TOKEN:
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+
+EOF
+```
+
 ```bash
-gcloud compute tpus tpu-vm ssh my-tpu --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml"
+python infra/launch.py --tpu_name=my_tpu -- python src/levanter/main/train_lm.py --config_path gs://path/to/config.yaml
 ```
 
 ## Monitoring
diff --git a/docs/tutorials/Training-On-Audio-Data.md b/docs/tutorials/Training-On-Audio-Data.md
index 235b2e79b..f57b9a06f 100644
--- a/docs/tutorials/Training-On-Audio-Data.md
+++ b/docs/tutorials/Training-On-Audio-Data.md
@@ -179,17 +179,30 @@ infra/babysit-tpu-vm my-tpu -z us-east1-d -t v3-128 -- \
 
 #### Spin up and manual launch
 
-You should probably use the automated setup script, as described in the [relevant section of the TPU guide](../Getting-Started-TPU-VM.md#automatic-setup).
-Here's what that looks like:
+You can start up a TPU VM and launch your instance with `launch.py`. To simplify
+your command for multiple launches, you can put common parameters into `.config`
+in your `levanter` directory:
+
+```
+cat > .config <<EOF
+env:
+    WANDB_API_KEY:
+    WANDB_ENTITY:
+    WANDB_PROJECT:
+    HF_TOKEN:
+
+docker_repository: levanter
+zone: us-west4-a
+tpu_type: "v5litepod-16"
+vm_image: "tpu-ubuntu2204-base"
+preemptible: true
+autodelete: false
+subnetwork: "default"
+EOF
+```
 
 ```bash
-bash infra/spin-up-tpu-vm.sh my-tpu -z us-east1-d -t v3-128
-```
-
-This will spin up a TPU VM instance and install Levanter on it. You can then run a command like so:
-```bash
-gcloud compute tpus tpu-vm ssh my-tpu --zone us-east1-d --worker=all --command="WANDB_API_KEY=... levanter/infra/launch.sh python levanter/src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml"
+python infra/launch.py --tpu_name=my_tpu -- python src/levanter/main/train_asr.py --config_path gs://path/to/config.yaml
 ```
 
 ### GPU
diff --git a/infra/__init__.py b/infra/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/infra/helpers/cli.py b/infra/helpers/cli.py
new file mode 100644
index 000000000..7a2f61574
--- /dev/null
+++ b/infra/helpers/cli.py
@@ -0,0 +1,79 @@
+import argparse
+import os
+import subprocess
+import typing
+
+from google.cloud import storage
+import yaml
+
+
+def run_command(*args, **kwargs):
+    print("Running:", " ".join(list(args)))
+    return subprocess.check_call(args, **kwargs)
+
+
+def add_ssh_key(ssh_key_filename):
+    # key format: `3072 SHA256:... key-name (RSA)`
+    key_hash = subprocess.check_output(["ssh-keygen", "-lf", ssh_key_filename]).decode("utf-8").split()[1]
+    existing_keys = subprocess.check_output(["ssh-add", "-l"]).decode("utf-8").split("\n")
+    for key in existing_keys:
+        if key_hash in key:
+            return
+
+    subprocess.check_call(["ssh-add", ssh_key_filename])
+
+
+def tpu_ssh(tpu_name, zone, *args, ignore_failure=False):
+    add_ssh_key(os.path.expanduser("~/.ssh/google_compute_engine"))
+    try:
+        return run_command(
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "ssh",
+            tpu_name,
+            "--worker=all",
+            f"--zone={zone}",
+            "--command=%s" % " ".join(args),
+        )
+    except subprocess.CalledProcessError as e:
+        if ignore_failure:
+            print("Ignoring failure:", e)
+        else:
+            raise
+
+
+# Oddly enough, there's no API to simply fetch the current gcloud configuration...
+def gcloud_config():
+    client = storage.Client()
+    return {
+        "project": client.project,
+    }
+
+
+def add_arg(
+    parser: argparse.ArgumentParser, config: typing.Dict, flags: typing.List[str], required=False, default=None, **kw
+):
+    """Add an argument to the parser, using `config` or the environment to resolve default values."""
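+    # An explicit command-line flag wins, then a matching environment variable
+    # (the flag name upper-cased, e.g. ZONE for --zone), then the `.config` entry,
+    # then the built-in default. For example:
+    #     add_arg(parser, {"zone": "us-west4-a"}, ["--zone"])
+    # makes --zone default to us-west4-a unless ZONE is set or --zone is passed.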
+    key = flags[0].lstrip("-").replace("-", "_")
+    if key in config:
+        default = config[key]
+
+    if key.upper() in os.environ:
+        default = os.environ[key.upper()]
+
+    if default is not None:
+        kw["default"] = default
+    elif required:
+        kw["required"] = True
+
+    parser.add_argument(*flags, **kw)
+
+
+def load_config():
+    if os.path.exists(".config"):
+        return yaml.load(open(".config", "r"), Loader=yaml.SafeLoader)
+    else:
+        return {}
diff --git a/infra/launch.py b/infra/launch.py
new file mode 100755
index 000000000..a1fed90c1
--- /dev/null
+++ b/infra/launch.py
@@ -0,0 +1,206 @@
+#!/usr/bin/python
+
+import argparse
+import getpass
+import subprocess
+import time
+
+from infra import push_docker
+from infra.helpers import cli
+
+
+def setup_vm_docker(tpu_name, zone, docker_base_image):
+    """Change docker permissions on `tpu_name` and set up the `levanter` cache volume."""
+    cli.tpu_ssh(
+        tpu_name,
+        zone,
+        "sudo",
+        "usermod",
+        "-aG",
+        "docker",
+        getpass.getuser(),
+    )
+
+    cli.tpu_ssh(tpu_name, zone, "docker", "volume", "create", "--driver=local", "levanter")
+
+
+def list_tpus(zone):
+    tpus = subprocess.check_output(
+        [
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "list",
+            "--zone=" + zone,
+        ]
+    )
+    rows = tpus.decode("utf-8").split("\n")
+    header = rows[0].split()
+    tpus = []
+    for row in rows[1:]:
+        if row:
+            tpus.append(dict(zip(header, row.split())))
+    return tpus
+
+
+def start_tpu_vm(tpu_name, *, tpu_type, preemptible, version, zone, autodelete):
+    tpu_exists = any([tpu["NAME"] == tpu_name for tpu in list_tpus(zone)])
+    if tpu_exists:
+        if not autodelete:
+            print("TPU already exists and autodelete is false, leaving it as is.")
+            return
+
+        print("TPU already exists, deleting...")
+        cli.run_command(
+            "gcloud",
+            "alpha",
+            "compute",
+            "tpus",
+            "tpu-vm",
+            "delete",
+            "--quiet",
+            f"--zone={zone}",
+            tpu_name,
+        )
+
+    print(f"Creating new TPU {tpu_name} in {zone} of type {tpu_type}...")
+    command = [
+        "gcloud",
+        "alpha",
+        "compute",
+        "tpus",
+        "tpu-vm",
+        "create",
+        tpu_name,
+        f"--accelerator-type={tpu_type}",
+        f"--version={version}",
+        "--zone=" + zone,
+        "--quiet",
+    ]
+    if preemptible:
+        command.append("--preemptible")
+    cli.run_command(*command)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    config = cli.load_config()
+
+    cli.add_arg(parser, config, ["--autodelete"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--docker_base_image"], default="ghcr.io/rjpower/levanter:latest")
+    cli.add_arg(parser, config, ["--docker_repository"], default="levanter")
+    cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--image_name"], default=f"levanter-{getpass.getuser()}")
+    cli.add_arg(parser, config, ["--preemptible"], default=False, action="store_true")
+    cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
+    cli.add_arg(parser, config, ["--tpu_name"], required=True)
+    cli.add_arg(parser, config, ["--tpu_type"], required=True)
+    cli.add_arg(parser, config, ["--version"], default="tpu-ubuntu2204-base")
+    cli.add_arg(parser, config, ["--zone"], required=True)
+    cli.add_arg(parser, config, ["--retries"], default=0, type=int)
+    cli.add_arg(parser, config, ["--run_id"], default=int(time.time()), type=int)
+
+    parser.add_argument(
+        # wrap in list() so argparse's append action can extend the defaults from `.config`
+        "-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items())
+    )
+    parser.add_argument("command", nargs=argparse.REMAINDER)
+
+    args = parser.parse_args()
+
+    autodelete = args.autodelete
+    command = args.command
+    docker_base_image = args.docker_base_image
+    docker_repository = args.docker_repository
+    foreground = args.foreground
+    image_id = args.image_name
+    preemptible = args.preemptible
+    project = args.project
+    if args.retries < 0:
+        retries = 10000000
+    else:
+        retries = args.retries
+    tpu_name = args.tpu_name
+    tpu_type = args.tpu_type
+    version = args.version
+    zone = args.zone
+    run_id = args.run_id
+
+    region = "-".join(zone.split("-")[:-1])
+    env = {k: v for k, v in args.env}
+
+    if "WANDB_PROJECT" not in env:
+        env["WANDB_PROJECT"] = "levanter"
+
+    if command[0] == "--":
+        command = command[1:]
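+
+    # Launch (and retry) loop: preemptible TPUs can be reclaimed at any time, so each
+    # attempt re-creates the VM if necessary, rebuilds and pushes the image, and
+    # restarts the container. We stop after the first attempt that launches cleanly.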
+    for i in range(retries + 1):
+        try:
+            start_tpu_vm(
+                tpu_name=tpu_name,
+                tpu_type=tpu_type,
+                preemptible=preemptible,
+                version=version,
+                zone=zone,
+                autodelete=autodelete,
+            )
+
+            # We don't technically need to set up on every run, but if we are working on a
+            # stale VM or a VM from e.g. spin-up-vm.sh, this ensures things always work.
+            setup_vm_docker(
+                tpu_name=tpu_name,
+                zone=zone,
+                docker_base_image=docker_base_image,
+            )
+
+            # make an image tag based on the unix timestamp to ensure we always pull the latest image
+            tag = int(time.time())
+
+            full_image_id = push_docker.push_to_gcp(
+                project_id=project,
+                region=region,
+                repository=docker_repository,
+                image_name=image_id,
+                tag=tag,
+                docker_file="docker/tpu/Dockerfile.incremental",
+            )
+
+            git_commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
+
+            cli.tpu_ssh(tpu_name, zone, "docker", "stop", "levanter", "-t", "1", ignore_failure=True)
+            cli.tpu_ssh(tpu_name, zone, "docker", "rm", "-f", "levanter", ignore_failure=True)
+
+            docker_command = [
+                "docker",
+                "run",
+                "-t" if foreground else "-d",
+                "--name=levanter",
+                "--privileged",
+                "--shm-size=32gb",
+                "--net=host",
+                "--init",
+                "--mount",
+                "type=volume,source=levanter,target=/home/levanter",
+                "-v",
+                "/tmp:/tmp",
+                "-e",
+                f"WANDB_DOCKER={image_id}",
+                "-e",
+                f"GIT_COMMIT={git_commit}",
+                "-e",
+                f"RUN_ID={run_id}",
+            ]
+
+            for k, v in env.items():
+                docker_command.extend(["-e", k + f"='{str(v)}'"])
+
+            docker_command.extend([full_image_id, " ".join(command)])
+
+            print(f"Running on TPU {tpu_name}...")
+            cli.tpu_ssh(tpu_name, zone, *docker_command)
+            break  # success; don't relaunch
+        except subprocess.CalledProcessError:
+            print("Error running command.")
+            if i < retries:
+                print("Retrying... %d/%d" % (i + 1, retries))
diff --git a/infra/push_docker.py b/infra/push_docker.py
new file mode 100644
index 000000000..a0712ff85
--- /dev/null
+++ b/infra/push_docker.py
@@ -0,0 +1,213 @@
+#!/usr/bin/python
+
+"""
+Build and deploy the Levanter base image to Artifact Registry or the GitHub Container Registry.
+
+It is not necessary to run this yourself unless you are deploying a new base image: the launch
+script will automatically build and deploy an image based on your current code.
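+
+Example (hypothetical project and region):
+
+    python infra/push_docker.py --docker_target gcp --project my-gcp-project --region us-west4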
+""" + +import argparse +import json +import subprocess + +from infra.helpers import cli + + +GCP_CLEANUP_POLICY = [ + { + "name": "delete-stale", + "action": {"type": "Delete"}, + "condition": { + "olderThan": "86400s", + "tagState": "ANY", + }, + }, + { + "name": "keep-latest", + "action": {"type": "Keep"}, + "mostRecentVersions": { + "keepCount": 5, + }, + }, +] + + +def _run(*args, **kw): + print("Running ", " ".join(args[0])) + return subprocess.check_output(*args, **kw) + + +def configure_gcp_docker(project_id, region, repository): + """Setup Artifact registry repository and configure permissions to enable TPU access.""" + # check if the repository already exists + try: + _run( + ["gcloud", "artifacts", "repositories", "describe", f"--location={region}", repository], + stderr=subprocess.STDOUT, + ) + print(f"Found existing artifact registry repository `{repository}`, skipping setup.") + return + except subprocess.CalledProcessError as e: + if b"NOT_FOUND" not in e.output: + raise + + # Activate artifact registry and setup the repository. + _run(["gcloud", "services", "enable", "artifactregistry.googleapis.com"]) + + try: + _run( + [ + "gcloud", + "artifacts", + "repositories", + "create", + repository, + f"--location={region}", + "--repository-format=docker", + ], + stderr=subprocess.STDOUT, + ) + except subprocess.CalledProcessError as e: + # Ignore error if repository already exists. + if b"ALREADY_EXISTS" not in e.output: + print("Error creating repository: ", e.output) + raise + + with open("/tmp/cleanup-policy.json", "w") as f: + json.dump(GCP_CLEANUP_POLICY, f, indent=2) + + _run( + [ + "gcloud", + "artifacts", + "repositories", + "set-cleanup-policies", + f"--location={region}", + "--policy=/tmp/cleanup-policy.json", + repository, + ] + ) + + # Grant public read access ('allUsers') for TPU VMs + _run( + [ + "gcloud", + "artifacts", + "repositories", + "add-iam-policy-binding", + "--member=allUsers", + "--role=roles/artifactregistry.reader", + f"--location={region}", + repository, + ] + ) + + _run( + [ + "gcloud", + "--project", + project_id, + "artifacts", + "repositories", + "add-iam-policy-binding", + repository, + "--location", + region, + "--member", + "allUsers", + "--role", + "roles/artifactregistry.reader", + ] + ) + + _run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"]) + + +def build_docker(docker_file, image_name, tag) -> str: + """Builds a Docker image, enables artifact access, and pushes to Artifact Registry.""" + + _run( + [ + "docker", + "buildx", + "build", + "--platform=linux/amd64", + "-t", + f"{image_name}:{tag}", + "-f", + docker_file, + ".", + ] + ) + + return f"{image_name}:{tag}" + + +# Disabled until we can figure out how Docker hub organizations work +def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None): + """Pushes a local Docker image to Docker Hub.""" + + # Authenticate the docker service with Github if a token exists + if github_token: + login_process = subprocess.Popen( + ["docker", "login", "ghcr.io", "-u", github_user, "--password-stdin"], stdin=subprocess.PIPE + ) + print(login_process.communicate(input=github_token.encode(), timeout=10)) + + remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}" + local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag) + + _run(["docker", "tag", local_name, remote_name]) + _run(["docker", "push", remote_name]) + return remote_name + + +def push_to_gcp(project_id, region, repository, image_name, tag, 
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Build and push the Levanter Docker image.")
+    config = cli.load_config()
+    cli.add_arg(parser, config, ["--project"], help="GCP project ID")
+    cli.add_arg(parser, config, ["--region"], help="Artifact Registry region (e.g., us-west4)")
+    cli.add_arg(parser, config, ["--repository"], default="levanter", help="Artifact Registry repository name")
+    cli.add_arg(parser, config, ["--image"], default="levanter", help="Docker image name.")
+    cli.add_arg(parser, config, ["--tag"], default="latest", help="Docker image tag.")
+    cli.add_arg(parser, config, ["--github_user"], default=None, help="Github user name.")
+    cli.add_arg(parser, config, ["--github_token"], default=None, help="Github token.")
+    cli.add_arg(parser, config, ["--docker_file"], default="docker/tpu/Dockerfile.base", help="Dockerfile to use.")
+
+    # push to either github or GCP
+    cli.add_arg(parser, config, ["--docker_target"], choices=["github", "gcp"], required=True)
+
+    args = parser.parse_args()
+
+    if args.docker_target == "github":
+        assert args.github_user, "Must specify --github_user when pushing to Github"
+        assert args.github_token, "Must specify --github_token when pushing to Github"
+        push_to_github(args.image, args.tag, args.github_user, args.github_token, docker_file=args.docker_file)
+    else:
+        assert args.region, "Must specify --region when pushing to GCP"
+        assert args.project, "Must specify --project when pushing to GCP"
+        assert args.repository, "Must specify --repository when pushing to GCP"
+
+        push_to_gcp(
+            args.project,
+            args.region,
+            args.repository,
+            args.image,
+            args.tag,
+            docker_file=args.docker_file,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index 60e44e15b..527f88f7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,8 +67,6 @@ dev-mode-dirs = [".", "src"]
 [tool.hatch.metadata]
 allow-direct-references = true
 
-
-
 [tool.hatch.build.targets.wheel]
 packages = ["levanter"]
@@ -109,3 +107,13 @@ markers = [
     "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')",
     "ray: marks tests that require Ray (deselect with '-m \"not ray\"')",
 ]
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+    "flake8",
+    "soundfile",
+    "librosa",
+    "pytest-forked"
+]
\ No newline at end of file
diff --git a/src/levanter/tracker/wandb.py b/src/levanter/tracker/wandb.py
index 9d41e935a..c98c0727c 100644
--- a/src/levanter/tracker/wandb.py
+++ b/src/levanter/tracker/wandb.py
@@ -208,6 +208,9 @@ def _git_settings(self):
         return other_settings
 
     def _get_git_sha(self, code_dir) -> Optional[str]:
+        # launch.py builds images without .git (see .dockerignore), so it passes the
+        # commit through the GIT_COMMIT environment variable instead.
+        if "GIT_COMMIT" in os.environ:
+            return os.environ["GIT_COMMIT"]
+
         try:
             repo = Repo(code_dir)
             git_sha = repo.head.commit.hexsha