Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into blocked_cross_entropy
Browse files — browse the repository at this point in the history
  • Loading branch information
dlwh committed Nov 6, 2024
2 parents 795fd08 + d081b5b commit 05afef0
Show file tree
Hide file tree
Showing 16 changed files with 950 additions and 633 deletions.
2 changes: 1 addition & 1 deletion config/gpt2_small_fast_pile.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data: !include data/pile_source_old.yaml
data: !include data/pile_mixture.yaml
model:
type: gpt2
hidden_dim: 768
Expand Down
1 change: 1 addition & 0 deletions config/gpt2_small_fast_supervised.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ data:
supervised_data:
validation_urls:
- "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz"
- "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-validation-evaluation.jsonl.gz"
cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/"
input_field: "input"
output_field: "output"
Expand Down
45 changes: 42 additions & 3 deletions infra/cluster/job-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ cluster_name: levanter-cluster
# Configure GCP
provider:
type: gcp
region: us-central2
availability_zone: us-central2-b
region: us-west4
availability_zone: us-west4-a
project_id: hai-gcp-models

# Maximum Workers (excluding Head Node)
Expand Down Expand Up @@ -126,6 +126,45 @@ available_node_types:
schedulingConfig:
preemptible: true

tpu_slice_v5e_16:
min_workers: 0
max_workers: 1024
resources: { "CPU": 120, "TPU": 4 }

node_config:
acceleratorType: v5litepod-16
runtimeVersion: tpu-ubuntu2204-base

# [IMPORTANT] Configure all TPU Workers to be Preemptible!
schedulingConfig:
preemptible: true

tpu_slice_v5e_64:
min_workers: 0
max_workers: 1024
resources: { "CPU": 120, "TPU": 4 }

node_config:
acceleratorType: v5litepod-64
runtimeVersion: tpu-ubuntu2204-base

# [IMPORTANT] Configure all TPU Workers to be Preemptible!
schedulingConfig:
preemptible: true

tpu_slice_v5e_256:
min_workers: 0
max_workers: 1024
resources: { "CPU": 120, "TPU": 4 }

node_config:
acceleratorType: v5litepod-256
runtimeVersion: tpu-ubuntu2204-base

# [IMPORTANT] Configure all TPU Workers to be Preemptible!
schedulingConfig:
preemptible: true

docker:
image: "ghcr.io/stanford-crfm/levanter-cluster:latest"
container_name: "ray_docker"
Expand All @@ -140,7 +179,7 @@ docker:
- -v "/var/run/docker.sock:/var/run/docker.sock"

initialization_commands:
- yes | gcloud auth configure-docker us-central2-docker.pkg.dev
- yes | gcloud auth configure-docker us-west4-docker.pkg.dev
- "export TPU_WORKER_ID=$(curl -H 'Metadata-Flavor: Google' http://metadata.google.internal/computeMetadata/v1/instance/attributes/agent-worker-number) || true"
- which docker || (curl -fsSL https://get.docker.com -o get-docker.sh; sudo sh get-docker.sh; sudo usermod -aG docker $USER; sudo systemctl restart docker -f)
# always run this because ray doesn't run with sudo
Expand Down
3 changes: 2 additions & 1 deletion infra/launch_on_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def main():
cli.add_arg(parser, config, ["--project"], default=cli.gcloud_config()["project"])
cli.add_arg(parser, config, ["--tpu_type"], required=True)
# TODO: bring node_count to Ray
# cli.add_arg(parser, config, ["--node_count"], default=1, type=int)
cli.add_arg(parser, config, ["--node_count"], default=1, type=int)
cli.add_arg(parser, config, ["--foreground"], default=False, action="store_true")
cli.add_arg(parser, config, ["--retries"], default=10, type=int)
cli.add_arg(parser, config, ["--run_id"], default=cli.default_run_id(), type=str)
Expand Down Expand Up @@ -122,6 +122,7 @@ def main():
env=env,
name="levanter",
retries=retries,
node_count=args.node_count,
)

address = args.address or os.getenv("RAY_ADDRESS")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ dependencies = [
"draccus>=0.8.0",
"pyarrow>=11.0.0",
"zstandard>=0.20.0",
"datasets>=2.18,<4.0",
"datasets>=3.1.0,<4.0",
"gcsfs>=2024.2,<2024.10",
"braceexpand>=0.1.7",
"jmp>=0.0.3",
Expand Down
5 changes: 4 additions & 1 deletion src/levanter/data/sharded_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,10 @@ def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]:
dataset = self._load_dataset()
if isinstance(dataset, datasets.IterableDataset) and shard_name != "data":
# ex_iterable has a key that gets discarded typically
shard = map(lambda t: t[1], dataset._ex_iterable.shard_data_sources(int(shard_name), dataset.n_shards))
shard = map(
lambda t: t[1],
dataset._ex_iterable.shard_data_sources(index=int(shard_name), num_shards=dataset.n_shards),
)
else:
shard = dataset

Expand Down
8 changes: 4 additions & 4 deletions src/levanter/data/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from levanter.store.cache import CacheOptions, TreeCache
from levanter.store.jagged_array import JaggedArrayStore
from levanter.store.tree_store import TreeStore
from levanter.utils.fsspec_utils import fsspec_expand_glob
from levanter.utils.fsspec_utils import expand_glob
from levanter.utils.hf_utils import num_cpus_used_by_tokenizer


Expand Down Expand Up @@ -508,7 +508,7 @@ def urls_for_split(self, split):
else:
raise ValueError(f"Unknown split {split}")

urls = [globbed for url in urls for globbed in fsspec_expand_glob(url)]
urls = [globbed for url in urls for globbed in expand_glob(url)]
return urls


Expand Down Expand Up @@ -625,13 +625,13 @@ def _prepare_supervised_example(ex: dict, tokenizer: PreTrainedTokenizerBase) ->
def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrainedTokenizerBase):
import levanter.data

validation_urls = [url for url_pat in config.validation_urls for url in fsspec_expand_glob(url_pat)]
validation_urls = [url for url_pat in config.validation_urls for url in expand_glob(url_pat)]
dataset = levanter.data.datasource_from_jsonl(validation_urls)

input_field = config.input_field
output_field = config.output_field

output_exemplar = {"input_ids": np.zeros((0,), dtype=np.int32), "sources_len": np.zeros((), dtype=np.int32)}
output_exemplar = {"input_ids": np.zeros((0,), dtype=np.int32), "sources_len": np.zeros((0,), dtype=np.int32)}

dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer, input_field, output_field), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar) # type: ignore
dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True) # type: ignore
Expand Down
5 changes: 5 additions & 0 deletions src/levanter/infra/cli_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ def make_docker_run_command(image_id, command, *, foreground, env, name="levante
"/tmp:/tmp",
]

# optionally add multislice env vars (if set by ray runtime env vars)
for v in ["MEGASCALE_COORDINATOR_ADDRESS", "MEGASCALE_NUM_SLICES", "MEGASCALE_PORT", "MEGASCALE_SLICE_ID"]:
v = shlex.quote(str(v))
docker_command.extend(["-e", v])

for k, v in env.items():
v = shlex.quote(str(v))
k = shlex.quote(str(k))
Expand Down
Loading

0 comments on commit 05afef0

Please sign in to comment.