Commit

Merge branch 'main' into dependabot/pip/gcsfs-gte-2024.2-and-lt-2024.11
dlwh authored Nov 14, 2024
2 parents e190340 + f8ab21a commit dc1f74c
Showing 58 changed files with 3,084 additions and 817 deletions.
@@ -13,11 +13,17 @@ data:
  tokenizer: gpt2
  cache_dir: "gs://levanter-data/tokenized/data_mix"
supervised_data:
-  validation_urls:
-    - "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz"
-  cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/"
-  input_field: "input"
-  output_field: "output"
+  mmlu:
+    validation_urls:
+      - "gs://marin-us-central2/evaluation/mmlu-eval-subject-2eb39e/cais/*-validation-evaluation.jsonl.gz"
+    cache_dir: "gs://levanter-data/tokenized-gpt2/mmlu/"
+    tags: [ "e"]
+  arc_easy:
+    validation_urls:
+      - "gs://marin-us-central2/evaluation/arc-easy-b39e70/allenai/ai2_arc-ARC-Easy-validation-evaluation.jsonl.gz"
+    cache_dir: "gs://levanter-data/tokenized-gpt2/arc_easy/"
+    tags: [ "arc", "e"]

model:
  type: gpt2
  hidden_dim: 768
2 changes: 1 addition & 1 deletion config/gpt2_small_fast_pile.yaml
@@ -1,4 +1,4 @@
-data: !include data/pile_source_old.yaml
+data: !include data/pile_mixture.yaml
model:
  type: gpt2
  hidden_dim: 768
32 changes: 32 additions & 0 deletions config/llama3_small_fast.yaml
@@ -0,0 +1,32 @@
data:
  train_urls:
    - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_train.{1..128}-of-128.jsonl.gz"
  validation_urls:
    - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_val.{1..8}-of-8.jsonl.gz"
  cache_dir: "gs://levanter-data/tokenized/openwebtext_llama3/"
  tokenizer: "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
model:
  type: llama
  hidden_dim: 768
  intermediate_dim: 2048
  num_heads: 12
  num_kv_heads: 12
  num_layers: 12
  seq_len: 1024
  gradient_checkpointing: true
trainer:
  tracker:
    - type: wandb
      project: "levanter"
      tags: [ "openwebtext", "llama", "itest"]

  mp: p=f32,c=bfloat16
  model_axis_size: 1
  per_device_parallelism: -1

  train_batch_size: 256
  num_train_steps: 20000
optimizer:
  learning_rate: 1E-3
  weight_decay: 0.1
  warmup: 0.01
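Assuming the usual Levanter entry point, a config like this would be launched with something like python -m levanter.main.train_lm --config_path config/llama3_small_fast.yaml; the small dimensions and the "itest" tag suggest it is sized as a fast integration-test run rather than a full pretraining job.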
39 changes: 39 additions & 0 deletions config/llama_7b_tulu.yaml
@@ -0,0 +1,39 @@
data:
  train_urls:
    - "gs://marin-us-central2/documents/instruct/tulu_v2_mix/text/tulu-v2-sft-mixture-000.jsonl.gz"
    - "gs://marin-us-central2/documents/instruct/tulu_v2_mix/text/tulu-v2-sft-mixture-001.jsonl.gz"
    - "gs://marin-us-central2/documents/instruct/tulu_v2_mix/text/tulu-v2-sft-mixture-002.jsonl.gz"
  cache_dir: "gs://marin-us-central2/tokenized/OLMo-1B/tuluv2_sft/"
  tokenizer: "allenai/OLMo-1B"
model: # 7B class model
  type: llama
  seq_len: 2048
  hidden_dim: 4096
  intermediate_dim: 11008
  num_layers: 32
  num_heads: 32
  num_kv_heads: 32
  use_flash_attention: True
  flash_attention_block_size: 512
  use_bias: false
  use_layer_norm_weight: false
trainer:
  tracker:
    type: wandb
    project: "marin"
    tags: ["dolma", "olmo", "llama"]

  mp: p=f32,c=bfloat16
  train_batch_size: 256
  num_train_steps: 750000 # 3,000,000,000,000 / 4,000,000 = 750,000
  steps_per_eval: 1000
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
optimizer:
  learning_rate: 4E-4
  weight_decay: 0.1
  min_lr_ratio: 0.1
  warmup: 5000

epoch: 3
2 changes: 1 addition & 1 deletion config/llama_7b_with_dclm.yaml
@@ -17,7 +17,7 @@ trainer:

  mp: p=f32,c=bfloat16
  train_batch_size: 2048
-  num_train_steps: 70000 # 280B / 4M
+  num_train_steps: 480000 # 2T / 4M
  steps_per_eval: 1000
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
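The revised step count matches the token-budget comment: assuming the model section of this config (not shown in this hunk) keeps seq_len: 2048, each step covers 2048 × 2048 = 4,194,304 tokens (the "4M"), and 2T / 4,194,304 ≈ 477k steps, rounded here to 480,000.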
13 changes: 13 additions & 0 deletions config/llama_sft_hf_ckpt.yaml
@@ -0,0 +1,13 @@
# Model configuration
model:
  type: llama
  seq_len: 2048
  hidden_dim: 4096
  intermediate_dim: 11008
  num_layers: 32
  num_heads: 32
  num_kv_heads: 32
  use_flash_attention: true
  flash_attention_block_size: 512
  use_bias: false
  use_layer_norm_weight: false
4 changes: 3 additions & 1 deletion examples/alpaca/alpaca.py
@@ -162,11 +162,13 @@ def _prepare_example(ex: dict) -> LmExample:
        # mask out padding and anything before the start of the target
        Pos = input_ids.resolve_axis("position")
        if config.mask_inputs:
-            loss_mask = hax.arange(Pos) >= ex["source_lens"]
+            loss_mask = hax.arange(Pos) >= ex["source_lens"] - 1  # should be minus 1?

            # don't predict the padding
            targets = hax.roll(input_ids, -1, Pos)
            loss_mask = loss_mask & (targets != tokenizer.pad_token_id)
+            # to not predict EOS token since we don't have target!
+            loss_mask = loss_mask & (1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.bool_))
        else:
            loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32)
        lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask)
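To make the masking change easier to follow, here is a minimal self-contained sketch of the same logic in plain jax.numpy (names such as source_len and pad_id are illustrative, not the actual alpaca.py variables): the mask now starts one position earlier so that the last prompt token predicts the first completion token, padding targets are excluded, and the final position is dropped because its rolled-around "target" is not a real next token.

import jax.numpy as jnp

def make_loss_mask(input_ids, source_len, pad_id):
    # input_ids: [seq_len] token ids; source_len: number of prompt tokens
    seq_len = input_ids.shape[0]
    pos = jnp.arange(seq_len)
    mask = pos >= source_len - 1        # start the loss one token before the completion
    targets = jnp.roll(input_ids, -1)   # next-token targets for each position
    mask = mask & (targets != pad_id)   # don't train on padding targets
    mask = mask & (pos != seq_len - 1)  # the last position has no real next token
    return mask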
52 changes: 52 additions & 0 deletions examples/sft/alpaca-llama-sft.yaml
@@ -0,0 +1,52 @@
# Model configuration
model:
  type: llama
  seq_len: 2048
  hidden_dim: 4096
  intermediate_dim: 11008
  num_layers: 32
  num_heads: 32
  num_kv_heads: 32
  use_flash_attention: true
  flash_attention_block_size: 512
  use_bias: false
  use_layer_norm_weight: false

# Training configuration
trainer:
  mp: p=f32,c=bfloat16
  tracker:
    type: wandb
    project: "levanter-sft"
    tags: ["llama", "sft"]
  num_train_steps: 750000
  train_batch_size: 64
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
  steps_per_eval: 1000

# Optimizer settings
optimizer:
  learning_rate: 2e-5
  weight_decay: 0.0
  min_lr_ratio: 0.1
  warmup: 100

# Supervised data configuration
supervised_data:
  cache_dir: "gs://levanter-checkpoints/marin/sft_cache/alpaca-olmo"
  input_field: "instruction"
  output_field: "output"
  hf_dataset_name: "tatsu-lab/alpaca" # Changed from id
  hf_dataset_split: "train"
  name: "alpaca" # Optional metadata
  tags: ["instruction-tuning"] # Optional metadata
  validation_urls: [] # Empty list for no validation files

# Additional settings
tokenizer: "allenai/OLMo-1B"
max_tune_length: 2048
epoch: 0

initialize_from_hf: false
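Note that this config defines the Llama architecture inline and sets initialize_from_hf: false, whereas alpaca-llama.yaml and dolly-llama.yaml below instead point at a Hugging Face checkpoint via model_name_or_path; the two styles appear to correspond to training a locally defined model versus fine-tuning an existing HF model.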
32 changes: 32 additions & 0 deletions examples/sft/alpaca-llama.yaml
@@ -0,0 +1,32 @@
model_name_or_path: meta-llama/Llama-2-7b-hf

# Training configuration
trainer:
  mp: p=f32,c=bfloat16
  wandb:
    project: "levanter-sft"
    tags: ["llama2", "alpaca"]
  num_train_steps: 1218
  train_batch_size: 64
  # If using model parallelism
  tensor_parallel_axes: ["mlp", "heads"]

# Optimizer settings
optimizer:
  learning_rate: 2e-5
  weight_decay: 0.0

supervised_data:
  hf_dataset_name: "tatsu-lab/alpaca"
  hf_dataset_split: "train"
  input_field: "instruction" # change from prompt
  output_field: "output" # this is correct
  cache_dir: "gs://levanter-checkpoints/marin/sft_cache/alpaca-new"

max_tune_length: 2048
trust_remote_code: false
model_cache_dir: null

hf_save_path: "sft_hf_ckpts"
hf_upload: false
hf_save_steps: 1000
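For reference, the field mapping above lines up with the columns of the tatsu-lab/alpaca dataset, whose rows look roughly like the literal below (values are illustrative; the optional input column is simply not referenced by this config):

alpaca_row = {
    "instruction": "Summarize the paragraph below.",  # consumed via input_field
    "input": "",                                      # often empty; unused here
    "output": "A one-sentence summary.",              # consumed via output_field
}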
32 changes: 32 additions & 0 deletions examples/sft/dolly-llama.yaml
@@ -0,0 +1,32 @@
model_name_or_path: meta-llama/Llama-2-7b-hf

# Training configuration
trainer:
  mp: p=f32,c=bfloat16
  wandb:
    project: "levanter-sft"
    tags: ["llama2", "oasst"]
  num_train_steps: 1218
  train_batch_size: 128
  # If using model parallelism
  tensor_parallel_axes: ["mlp", "heads"]

# Optimizer settings
optimizer:
  learning_rate: 2e-5
  weight_decay: 0.0

supervised_data:
  hf_dataset_name: "databricks/databricks-dolly-15k"
  hf_dataset_split: "train"
  input_field: "instruction" # change from prompt
  output_field: "response" # this is correct
  cache_dir: "cache/dolly"

max_tune_length: 2048
trust_remote_code: false
model_cache_dir: null

hf_save_path: "sft_hf_ckpts"
hf_upload: false
hf_save_steps: 1000
38 changes: 38 additions & 0 deletions examples/sft/oasst-llama.yaml
@@ -0,0 +1,38 @@
model_name_or_path: meta-llama/Llama-2-7b-hf

# Training configuration
trainer:
  mp: p=f32,c=bfloat16
  wandb:
    project: "levanter-sft"
    tags: ["llama2", "oasst"]
  num_train_steps: 1218
  train_batch_size: 128

  # If using model parallelism
  tensor_parallel_axes: ["mlp", "heads"]

# Optimizer settings
optimizer:
  learning_rate: 2e-5
  weight_decay: 0.0

# Supervised data configuration
supervised_data:
  # For HF dataset
  id: "databricks/databricks-dolly-15k"
  input_field: "instruction" # adjust based on dataset
  output_field: "response" # adjust based on dataset
  cache_dir: "cache/dolly"

# Model configuration
max_tune_length: 2048
trust_remote_code: false
model_cache_dir: null

# Checkpoint saving configuration
hf_save_path: "sft_hf_ckpts"
hf_upload: false
hf_save_steps: 1000

# python examples/sft/sft.py --config_path examples/sft/oasst-llama2.yaml
51 changes: 51 additions & 0 deletions examples/sft/tulu-llama-sft.yaml
@@ -0,0 +1,51 @@
# Model configuration
model:
  type: llama
  seq_len: 2048
  hidden_dim: 4096
  intermediate_dim: 11008
  num_layers: 32
  num_heads: 32
  num_kv_heads: 32
  use_flash_attention: true
  flash_attention_block_size: 512
  use_bias: false
  use_layer_norm_weight: false

# Training configuration
trainer:
  mp: p=f32,c=bfloat16
  tracker:
    type: wandb
    project: "levanter-sft"
    tags: ["llama", "sft"]
  num_train_steps: 750000
  train_batch_size: 64
  tensor_parallel_axes: ["mlp", "heads"]
  fsdp_axis: "embed"
  batch_axis: "batch"
  steps_per_eval: 1000

# Optimizer settings
optimizer:
  learning_rate: 2e-5
  weight_decay: 0.0
  min_lr_ratio: 0.1
  warmup: 100

# Supervised data configuration
dataset_type: chat_jsonl
chat_train_urls:
  - "gs://marin-us-central2/documents/allenai--tulu-v2-sft-mixture-0ba27c/data/**/*.jsonl.gz"
supervised_data:
  cache_dir: "gs://levanter-checkpoints/marin/sft_cache/chat-data"
  messages_field: "messages"
  input_role: "user"
  output_role: "assistant"

# Additional settings
tokenizer: "EleutherAI/gpt-neox-20b"
max_tune_length: 2048
epoch: 0

initialize_from_hf: false
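For context, one record in the chat_jsonl shards referenced above would presumably look like the literal below (extra keys in the actual tulu export may differ); messages_field, input_role, and output_role select the "messages" list and the user/assistant turns within it:

chat_record = {
    "messages": [
        {"role": "user", "content": "Explain what a loss mask does."},                              # input_role
        {"role": "assistant", "content": "It marks which token positions contribute to the loss."}, # output_role
    ]
}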
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -28,8 +28,7 @@ dependencies = [
"transformers>=4.41.2",
"optax>=0.1.9",
"wandb>=0.17.8",
"scipy<=1.12.0",
"draccus>=0.8.0",
"draccus>=0.9.3",
"pyarrow>=11.0.0",
"zstandard>=0.20.0",
"datasets>=3.1.0,<4.0",
16 changes: 11 additions & 5 deletions scripts/clean_old_checkpoints.py
@@ -19,7 +19,7 @@ def is_dir_of_checkpoints(path):
    return any("step-" in child for child in children)


-def list_deletable_directories(base_dir):
+def list_deletable_directories(base_dir, age):
    fs = fsspec.filesystem("gcs")
    run_ids = fs.ls(base_dir)

@@ -58,8 +58,8 @@ def list_deletable_directories(base_dir):
            details = fs.ls(f"{path}/{file}", detail=True)
            if details:
                mtime = details[0]["mtime"]
-                age = (datetime.now(timezone.utc) - mtime).days
-                if age < AGE:
+                this_age = (datetime.now(timezone.utc) - mtime).days
+                if this_age < age:
                    new = True
                    break

@@ -74,9 +74,15 @@

# Usage example:
if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="List directories that can be deleted.")
+    parser.add_argument("base_dir", help="The base directory to clean up.", type=str, nargs="+")
+    parser.add_argument("--age", help="The age in days of the checkpoints to delete.", type=int, default=30)
+    args = parser.parse_args()
    if len(sys.argv) < 2:
        print("Usage: python clean_old_checkpoints.py <base_dir>")
        sys.exit(1)
-    for base_dir in sys.argv[1:]:
-        for path in list_deletable_directories(base_dir):
+    for base_dir in args.base_dir:
+        for path in list_deletable_directories(base_dir, args.age):
            print(f"gs://{path}")
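With the switch to argparse, the script accepts one or more base directories plus an optional --age threshold in days (default 30), e.g. python scripts/clean_old_checkpoints.py <base_dir> --age 60; as before it only prints candidate gs:// paths and does not delete anything itself.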