Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ksalahi/supervised-data
Browse files · Browse the repository at this point in the history
  • Loading branch information
ahmeda14960 committed Oct 9, 2024
2 parents 5370c72 + 36b29fd commit 1063fd8
Show file tree
Hide file tree
Showing 55 changed files with 3,586 additions and 2,052 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,4 @@ dmypy.json

# local execution commands
local_*.sh
.aider*
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ ledger.json
/checkpoints
*.jaxpr

# local execution commands
local_*.sh

# aider
.aider*

.benchmarks
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ Please see the [CUDA Getting Started](docs/Getting-Started-GPU.md) guide for more

## Contributing

[![GitHub repo Good Issues for newbies](https://img.shields.io/github/issues/stanford-crfm/levanter/good%20first%20issue?style=flat&logo=github&logoColor=green&label=Good%20First%20issues)](https://github.com/stanford-crfm/levanter/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) [![GitHub Help Wanted issues](https://img.shields.io/github/issues/stanford-crfm/levanter/help%20wanted?style=flat&logo=github&logoColor=b545d1&label=%22Help%20Wanted%22%20issues)](https://github.com/stanford-crfm/levanter/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) [![GitHub Help Wanted PRs](https://img.shields.io/github/issues-pr/stanford-crfm/levanter/help%20wanted?style=flat&logo=github&logoColor=b545d1&label=%22Help%20Wanted%22%20PRs)](https://github.com/stanford-crfm/levanter/pulls?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) [![GitHub repo Issues](https://img.shields.io/github/issues/stanford-crfm/levanter?style=flat&logo=github&logoColor=red&label=Issues)](https://github.com/stanford-crfm/levanter/issues?q=is%3Aopen)

We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more information.

## License
Expand Down
78 changes: 78 additions & 0 deletions config/data/dclm_gpt_neo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Data mixture: DCLM baseline (training) + Paloma suites (eval-only),
# tokenized with the GPT-NeoX-20B tokenizer.
cache_dir: "gs://marin-us-central2/tokenized/gpt_neox/"
tokenizer: "EleutherAI/gpt-neox-20b"
cache_options:
  batch_size: 256
  num_shard_groups: 1024
# restart exhausted datasets rather than stopping the run when one runs dry
stop_strategy: restart
# size of the shuffle buffer, in documents
shuffle: 100000
configs:
  "dclm":
    train_urls:
      - gs://marin-us-central2/raw/dclm/v2024-07-09-baseline-dedup/**/*.zstd
  # these are just for eval
  "paloma/4chan":
    validation_urls:
      - gs://levanter-data/paloma/4chan_meta_sep/val/val*.jsonl.gz
  "paloma/c4_100_domains":
    validation_urls:
      - gs://levanter-data/paloma/c4_100_domains/val/val*.jsonl.gz
  "paloma/c4_en":
    validation_urls:
      - gs://levanter-data/paloma/c4_en/val/val*.jsonl.gz
  "paloma/dolma-v1_5":
    validation_urls:
      - gs://levanter-data/paloma/dolma-v1_5/val/val*.jsonl.gz
  "paloma/dolma_100_programing_languages":
    validation_urls:
      - gs://levanter-data/paloma/dolma_100_programing_languages/val/val*.jsonl.gz
  "paloma/dolma_100_subreddits":
    validation_urls:
      - gs://levanter-data/paloma/dolma_100_subreddits/val/val*.jsonl.gz
  "paloma/falcon-refinedweb":
    validation_urls:
      - gs://levanter-data/paloma/falcon-refinedweb/val/val*.jsonl.gz
  "paloma/gab":
    validation_urls:
      - gs://levanter-data/paloma/gab/val/val*.jsonl.gz
  "paloma/m2d2_s2orc_unsplit":
    validation_urls:
      - gs://levanter-data/paloma/m2d2_s2orc_unsplit/val/val*.jsonl.gz
  "paloma/m2d2_wikipedia_unsplit":
    validation_urls:
      - gs://levanter-data/paloma/m2d2_wikipedia_unsplit/val/val*.jsonl.gz
  "paloma/manosphere_meta_sep":
    validation_urls:
      - gs://levanter-data/paloma/manosphere_meta_sep/val/val*.jsonl.gz
  "paloma/mc4":
    validation_urls:
      - gs://levanter-data/paloma/mc4/val/val*.jsonl.gz
  "paloma/ptb":
    validation_urls:
      - gs://levanter-data/paloma/ptb/val/val*.jsonl.gz
  "paloma/redpajama":
    validation_urls:
      - gs://levanter-data/paloma/redpajama/val/val*.jsonl.gz
  "paloma/twitterAAE_HELM_fixed":
    validation_urls:
      - gs://levanter-data/paloma/twitterAAE_HELM_fixed/val/val*.jsonl.gz
  "paloma/wikitext_103":
    validation_urls:
      - gs://levanter-data/paloma/wikitext_103/val/val*.jsonl.gz
# Training samples only from dclm; the paloma sets are weight-0 (eval only).
train_weights:
  dclm: 1.0
  paloma/4chan: 0.0
  paloma/c4_100_domains: 0.0
  paloma/c4_en: 0.0
  paloma/dolma-v1_5: 0.0
  paloma/dolma_100_programing_languages: 0.0
  paloma/dolma_100_subreddits: 0.0
  paloma/falcon-refinedweb: 0.0
  paloma/gab: 0.0
  paloma/m2d2_s2orc_unsplit: 0.0
  paloma/m2d2_wikipedia_unsplit: 0.0
  paloma/manosphere_meta_sep: 0.0
  paloma/mc4: 0.0
  paloma/ptb: 0.0
  paloma/redpajama: 0.0
  paloma/twitterAAE_HELM_fixed: 0.0
  paloma/wikitext_103: 0.0
44 changes: 22 additions & 22 deletions config/data/dolma_olmo_paloma.yaml
Original file line number Diff line number Diff line change
@@ -1,59 +1,59 @@
cache_dir: "gs://marin-data/tokenized/OLMo-1B/dolma-v1.7"
cache_dir: "gs://marin-us-central2/tokenized/OLMo-1B/dolma/v1.7"
tokenizer: "allenai/OLMo-1B" # requires `pip install ai2-olmo`
# tokenizer: "meta-llama/Llama-2-7b-hf"
stop_strategy: restart
configs:
dolma-algebraic-stack:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/algebraic-stack-train-{0000..0015}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/algebraic-stack-train-{0000..0015}.json.gz
dolma-arxiv:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/arxiv-{0000..0099}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/arxiv-{0000..0099}.json.gz
dolma-gutenberg:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/books-{0000..0002}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/books-{0000..0002}.json.gz
dolma-c4:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/c4-{0000..0170}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/c4-{0000..0170}.json.gz
dolma-cc:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/cc_en_head-{0000..0274}.json.gz
- gs://marin-data/raw/dolma/dolma-v1.7/cc_en_middle-{0000..0238}.json.gz # 239 is missing
- gs://marin-data/raw/dolma/dolma-v1.7/cc_en_middle-{0240..0379}.json.gz
- gs://marin-data/raw/dolma/dolma-v1.7/cc_en_tail-{0000..0152}.json.gz # 153 is missing
- gs://marin-data/raw/dolma/dolma-v1.7/cc_en_tail-{0154..0444}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_head-{0000..0274}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0000..0238}.json.gz # 239 is missing
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_middle-{0240..0379}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0000..0152}.json.gz # 153 is missing
- gs://marin-us-central2/raw/dolma/v1.7/cc_en_tail-{0154..0444}.json.gz
dolma-cc-news:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/cc_news_head-{0000..0004}.json.gz
- gs://marin-data/raw/dolma/dolma-v1.7/cc_news_middle-{0000..0002}.json.gz
- gs://marin-data/raw/dolma/dolma-v1.7/cc_news_tail-0000.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_news_head-{0000..0004}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_news_middle-{0000..0002}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/cc_news_tail-0000.json.gz
dolma-falcon:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/falcon-{0000..0499}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/falcon-{0000..0499}.json.gz
dolma-megawika:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/megawika-{0000..0261}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/megawika-{0000..0261}.json.gz
dolma-owmath:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/open-web-math-train-{0000..0012}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/open-web-math-train-{0000..0012}.json.gz
dolma-pes2o:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/pes2o-{0000..0025}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/pes2o-{0000..0025}.json.gz
dolma-reddit:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/reddit-{0000..0077}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/reddit-{0000..0077}.json.gz
dolma-stackexchange:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/stackexchange-{0000..0025}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/stackexchange-{0000..0025}.json.gz
dolma-starcoder:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/starcoder-{0000..0048}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/starcoder-{0000..0048}.json.gz
dolma-flan:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/tulu_flan-{0000..0065}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/tulu_flan-{0000..0065}.json.gz
dolma-wiki:
train_urls:
- gs://marin-data/raw/dolma/dolma-v1.7/wiki-{0000..0001}.json.gz
- gs://marin-us-central2/raw/dolma/v1.7/wiki-{0000..0001}.json.gz
# these are just for eval
"paloma/4chan":
validation_urls:
Expand Down
1 change: 1 addition & 0 deletions config/gpt2_nano_mixture.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ data:
id: dlwh/wikitext_103_detokenized
w2:
id: dlwh/wikitext_103_detokenized
cache_dir: wikitext2_cache
train_weights:
wikitext: 1.0
w2: 1.0
Expand Down
3 changes: 3 additions & 0 deletions config/gpt2_small_fast.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ trainer:

train_batch_size: 256
num_train_steps: 20000

# tensor_parallel_axes: ["position", "key_position"]
# tensor_parallel_axes: ["heads", "mlp"]
optimizer:
learning_rate: 1E-3
weight_decay: 0.1
Expand Down
33 changes: 33 additions & 0 deletions config/llama_7b_with_dclm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Llama-architecture 7B model trained on the DCLM mixture defined in
# data/dclm_gpt_neo.yaml (included via the !include tag).
data: !include data/dclm_gpt_neo.yaml
model: # 7B class model
  type: llama
  seq_len: 2048
  hidden_dim: 4096
  intermediate_dim: 11008
  num_layers: 32
  num_heads: 32
  num_kv_heads: 32  # == num_heads, i.e. full multi-head attention (no GQA)
  use_flash_attention: true
trainer:
  tracker:
    type: wandb
    entity: "stanford-mercury"
    project: "marin"
    tags: ["dclm", "7B", "llama"]

  # params in f32, compute in bfloat16
  mp: p=f32,c=bfloat16
  train_batch_size: 2048
  num_train_steps: 70000 # 280B / 4M
  steps_per_eval: 1000
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
optimizer:
  learning_rate: 4e-4
  weight_decay: 0.1
  min_lr_ratio: 0.1
  beta1: 0.9
  beta2: 0.95
  warmup: 5000

z_loss_weight: 5e-6
14 changes: 10 additions & 4 deletions config/whisper_tiny_librispeech.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
data:
id: WillHeld/librispeech_parquet
cache_dir: "gs://public_data_lev/processed/librispeech"
train_split: "train.360"
validation_split: "validation"
cache_dir: "gs://diva-flash/processed/mixture"
# The Whisper Tokenizer is way too large for Librispeech
tokenizer: "facebook/wav2vec2-base-960h"
configs:
librispeech:
id: WillHeld/librispeech_parquet
cache_dir: "gs://diva-flash/processed/librispeech"
train_split: "train.360"
validation_split: "validation"
train_weights:
librispeech: 1.0
model:
type: whisper
vocab_size: 32
Expand All @@ -24,3 +29,4 @@ optimizer:
learning_rate: 3E-3
weight_decay: 0.1
warmup: 0.01
hf_save_steps: 16000
74 changes: 74 additions & 0 deletions docker/tpu/Dockerfile.cluster
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# This dockerfile is used to build the docker image for using Ray to manage TPU slices.
ARG IMAGE=ghcr.io/stanford-crfm/levanter-base
ARG TAG=latest

FROM ${IMAGE}:${TAG}

# install docker in docker, but don't start it
RUN apt-get update && apt-get install -y docker.io

ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
    TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
    RAY_USAGE_STATS_ENABLED=0\
    PATH=/opt/levanter/.venv/bin:$PATH\
    PYTHONPATH=/opt/levanter:/opt/levanter/src:/opt/levanter/examples:/opt/levanter/tests:src:.\
    HOME=/home/levanter
# Install dependencies
# (relies on the apt index fetched by the `apt-get update` in the layer above)
RUN apt-get install -y \
    sudo \
    git \
    libjemalloc-dev \
    wget \
    cmake \
    g++ \
    zlib1g-dev \
    tmux \
    screen \
    rsync \
    netbase \
    openssh-client \
    gnupg

RUN pip install --no-cache-dir \
    flatbuffers \
    cython==0.29.37 \
    # Necessary for Dataset to work properly.
    numpy\>=1.20 \
    psutil \
    # Required a recent version of setuptools to be compatible with python 3.12+.
    setuptools==71.1.0 \
    "google-api-python-client==1.7.8" \
    "google-oauth"


# Install gcloud so we can get secrets (maybe we should just curl?)
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz

RUN mkdir -p /usr/local/gcloud \
    && tar -C /usr/local/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \
    && /usr/local/gcloud/google-cloud-sdk/install.sh \
    && rm -f /tmp/google-cloud-sdk.tar.gz

# Adding the package path to local
ENV PATH=$PATH:/usr/local/gcloud/google-cloud-sdk/bin

# GCP doesn't like it when root ssh's into a machine
RUN useradd -m -s /bin/bash levanter
RUN echo "levanter ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
# add levanter to the docker group so it can talk to the docker-in-docker daemon
RUN usermod -aG docker levanter
RUN mkdir -p $HOME && touch $HOME/.bashrc && chown -R levanter $HOME
RUN echo "export PATH=$PATH" >> $HOME/.bashrc

RUN chown -R levanter /opt/levanter

USER levanter

# HACK until https://github.com/ray-project/ray/issues/47769 is resolved
RUN pip install 'ray[default,gcp]==2.34.0'
RUN git clone https://github.com/dlwh/ray.git ~/ray --branch tpu_docker_2.34 --depth 1
RUN cp ~/ray/python/ray/autoscaler/_private/gcp/tpu_command_runner.py /opt/levanter/.venv/lib/python3.10/site-packages/ray/autoscaler/_private/gcp/tpu_command_runner.py


WORKDIR /opt/levanter
1 change: 0 additions & 1 deletion docs/Configuration-Guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ The following table lists some of the parameters that you might want to change.
| Parameter | Description | Default |
|----------------|-------------------------------------------------------------------------------|---------|
| `log_dir` | Where to save logs (python logger). `$run_id` will be appended | `logs/` |
| `run_base_dir` | where to save run artifacts. not really used much. `$run_id` will be appended | `runs/` |



Expand Down
Loading

0 comments on commit 1063fd8

Please sign in to comment.