Merge branch 'main' into dependabot/pip/gcsfs-gte-2024.2-and-lt-2024.11

stanford-crfm · Nov 14, 2024 · dc1f74c · dc1f74c
2 parents e190340 + f8ab21a
commit dc1f74c
Show file tree

Hide file tree

Showing 58 changed files with 3,084 additions and 817 deletions.
diff --git a/config/gpt2_small_fast_supervised.yaml → config/gpt2_small_fast_eval.yaml b/config/gpt2_small_fast_supervised.yaml → config/gpt2_small_fast_eval.yaml
@@ -13,11 +13,17 @@ data:
   tokenizer: gpt2
   cache_dir: "gs://levanter-data/tokenized/data_mix"
 supervised_data:
-  validation_urls:
-    - "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz"
-  cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/"
-  input_field: "input"
-  output_field: "output"
+  mmlu:
+    validation_urls:
+      - "gs://marin-us-central2/evaluation/mmlu-eval-subject-2eb39e/cais/*-validation-evaluation.jsonl.gz"
+    cache_dir: "gs://levanter-data/tokenized-gpt2/mmlu/"
+    tags: [ "e"]
+  arc_easy:
+    validation_urls:
+      - "gs://marin-us-central2/evaluation/arc-easy-b39e70/allenai/ai2_arc-ARC-Easy-validation-evaluation.jsonl.gz"
+    cache_dir: "gs://levanter-data/tokenized-gpt2/arc_easy/"
+    tags: [ "arc", "e"]
+
 model:
   type: gpt2
   hidden_dim: 768

diff --git a/config/gpt2_small_fast_pile.yaml b/config/gpt2_small_fast_pile.yaml
@@ -1,4 +1,4 @@
-data: !include data/pile_source_old.yaml
+data: !include data/pile_mixture.yaml
 model:
   type: gpt2
   hidden_dim: 768

diff --git a/config/llama3_small_fast.yaml b/config/llama3_small_fast.yaml
@@ -0,0 +1,32 @@
+data:
+  train_urls:
+    - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_train.{1..128}-of-128.jsonl.gz"
+  validation_urls:
+    - "gs://pubmed-mosaic/openwebtext-sharded/openwebtext_val.{1..8}-of-8.jsonl.gz"
+  cache_dir: "gs://levanter-data/tokenized/openwebtext_llama3/"
+  tokenizer: "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
+model:
+  type: llama
+  hidden_dim: 768
+  intermediate_dim: 2048
+  num_heads: 12
+  num_kv_heads: 12
+  num_layers: 12
+  seq_len: 1024
+  gradient_checkpointing: true
+trainer:
+  tracker:
+    - type: wandb
+      project: "levanter"
+      tags: [ "openwebtext", "llama", "itest"]
+
+  mp: p=f32,c=bfloat16
+  model_axis_size: 1
+  per_device_parallelism: -1
+
+  train_batch_size: 256
+  num_train_steps: 20000
+optimizer:
+  learning_rate: 1E-3
+  weight_decay: 0.1
+  warmup: 0.01
diff --git a/config/llama_7b_tulu.yaml b/config/llama_7b_tulu.yaml
@@ -0,0 +1,39 @@
+data:
+  train_urls:
+    - "gs://marin-us-central2/documents/instruct/tulu_v2_mix/text/tulu-v2-sft-mixture-000.jsonl.gz"
+    - "gs://marin-us-central2/documents/instruct/tulu_v2_mix/text/tulu-v2-sft-mixture-001.jsonl.gz"
+    - "gs://marin-us-central2/documents/instruct/tulu_v2_mix/text/tulu-v2-sft-mixture-002.jsonl.gz"
+  cache_dir: "gs://marin-us-central2/tokenized/OLMo-1B/tuluv2_sft/"
+  tokenizer: "allenai/OLMo-1B"
+model:  # 7B class model
+  type: llama
+  seq_len: 2048
+  hidden_dim: 4096
+  intermediate_dim: 11008
+  num_layers: 32
+  num_heads: 32
+  num_kv_heads: 32
+  use_flash_attention: True
+  flash_attention_block_size: 512
+  use_bias: false
+  use_layer_norm_weight: false
+trainer:
+  tracker:
+    type: wandb
+    project: "marin"
+    tags: ["dolma", "olmo", "llama"]
+
+  mp: p=f32,c=bfloat16
+  train_batch_size: 256
+  num_train_steps: 750000  # 3,000,000,000,000 / 4,000,000 = 750,000
+  steps_per_eval: 1000
+  tensor_parallel_axes: ["mlp", "heads"]
+  fsdp_axis: "embed"
+  batch_axis: "batch"
+optimizer:
+  learning_rate: 4E-4
+  weight_decay: 0.1
+  min_lr_ratio: 0.1
+  warmup: 5000
+
+epoch: 3
diff --git a/config/llama_7b_with_dclm.yaml b/config/llama_7b_with_dclm.yaml
@@ -17,7 +17,7 @@ trainer:
 
   mp: p=f32,c=bfloat16
   train_batch_size: 2048
-  num_train_steps: 70000  # 280B / 4M
+  num_train_steps: 480000  # ~2T / 4M (480,000 steps x 4M tokens/step = 1.92T tokens)
   steps_per_eval: 1000
   tensor_parallel_axes: ["mlp", "heads"]
   fsdp_axis: "embed"

diff --git a/config/llama_sft_hf_ckpt.yaml b/config/llama_sft_hf_ckpt.yaml
@@ -0,0 +1,13 @@
+# Model configuration
+model:
+  type: llama
+  seq_len: 2048
+  hidden_dim: 4096
+  intermediate_dim: 11008
+  num_layers: 32
+  num_heads: 32
+  num_kv_heads: 32
+  use_flash_attention: true
+  flash_attention_block_size: 512
+  use_bias: false
+  use_layer_norm_weight: false
diff --git a/examples/alpaca/alpaca.py b/examples/alpaca/alpaca.py
@@ -162,11 +162,13 @@ def _prepare_example(ex: dict) -> LmExample:
         # mask out padding and anything before the start of the target
         Pos = input_ids.resolve_axis("position")
         if config.mask_inputs:
-            loss_mask = hax.arange(Pos) >= ex["source_lens"]
+            loss_mask = hax.arange(Pos) >= ex["source_lens"] - 1  # TODO: confirm the -1 offset (targets below are shifted left by one position)
 
             # don't predict the padding
             targets = hax.roll(input_ids, -1, Pos)
             loss_mask = loss_mask & (targets != tokenizer.pad_token_id)
+            # also mask out the last position (the EOS token): it has no next-token target
+            loss_mask = loss_mask & (1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.bool_))
         else:
             loss_mask = 1 - hax.nn.one_hot(-1, Pos, dtype=jax.numpy.float32)
         lm_ex = LmExample.causal(input_ids, loss_mask=loss_mask)

diff --git a/examples/sft/alpaca-llama-sft.yaml b/examples/sft/alpaca-llama-sft.yaml
@@ -0,0 +1,52 @@
+# Model configuration
+model:
+  type: llama
+  seq_len: 2048
+  hidden_dim: 4096
+  intermediate_dim: 11008
+  num_layers: 32
+  num_heads: 32
+  num_kv_heads: 32
+  use_flash_attention: true
+  flash_attention_block_size: 512
+  use_bias: false
+  use_layer_norm_weight: false
+
+# Training configuration
+trainer:
+  mp: p=f32,c=bfloat16
+  tracker:
+    type: wandb
+    project: "levanter-sft"
+    tags: ["llama", "sft"]
+  num_train_steps: 750000
+  train_batch_size: 64
+  tensor_parallel_axes: ["mlp", "heads"]
+  fsdp_axis: "embed"
+  batch_axis: "batch"
+  steps_per_eval: 1000
+
+# Optimizer settings
+optimizer:
+  learning_rate: 2e-5
+  weight_decay: 0.0
+  min_lr_ratio: 0.1
+  warmup: 100
+
+# Supervised data configuration
+supervised_data:
+  cache_dir: "gs://levanter-checkpoints/marin/sft_cache/alpaca-olmo"
+  input_field: "instruction"
+  output_field: "output"
+  hf_dataset_name: "tatsu-lab/alpaca"  # Changed from id
+  hf_dataset_split: "train"
+  name: "alpaca"  # Optional metadata
+  tags: ["instruction-tuning"]  # Optional metadata
+  validation_urls: []  # Empty list for no validation files
+
+# Additional settings
+tokenizer: "allenai/OLMo-1B"
+max_tune_length: 2048
+epoch: 0
+
+initialize_from_hf: false
diff --git a/examples/sft/alpaca-llama.yaml b/examples/sft/alpaca-llama.yaml
@@ -0,0 +1,32 @@
+model_name_or_path: meta-llama/Llama-2-7b-hf
+
+# Training configuration
+trainer:
+  mp: p=f32,c=bfloat16
+  wandb:
+    project: "levanter-sft"
+    tags: ["llama2", "alpaca"]
+  num_train_steps: 1218
+  train_batch_size: 64
+  # If using model parallelism
+  tensor_parallel_axes: ["mlp", "heads"]
+
+# Optimizer settings
+optimizer:
+  learning_rate: 2e-5
+  weight_decay: 0.0
+
+supervised_data:
+  hf_dataset_name: "tatsu-lab/alpaca"
+  hf_dataset_split: "train"
+  input_field: "instruction"   # change from prompt
+  output_field: "output"    # this is correct
+  cache_dir: "gs://levanter-checkpoints/marin/sft_cache/alpaca-new"
+
+max_tune_length: 2048
+trust_remote_code: false
+model_cache_dir: null
+
+hf_save_path: "sft_hf_ckpts"
+hf_upload: false
+hf_save_steps: 1000
diff --git a/examples/sft/dolly-llama.yaml b/examples/sft/dolly-llama.yaml
@@ -0,0 +1,32 @@
+model_name_or_path: meta-llama/Llama-2-7b-hf
+
+# Training configuration
+trainer:
+  mp: p=f32,c=bfloat16
+  wandb:
+    project: "levanter-sft"
+    tags: ["llama2", "oasst"]
+  num_train_steps: 1218
+  train_batch_size: 128
+  # If using model parallelism
+  tensor_parallel_axes: ["mlp", "heads"]
+
+# Optimizer settings
+optimizer:
+  learning_rate: 2e-5
+  weight_decay: 0.0
+
+supervised_data:
+  hf_dataset_name: "databricks/databricks-dolly-15k"
+  hf_dataset_split: "train"
+  input_field: "instruction"   # change from prompt
+  output_field: "response"    # this is correct
+  cache_dir: "cache/dolly"
+
+max_tune_length: 2048
+trust_remote_code: false
+model_cache_dir: null
+
+hf_save_path: "sft_hf_ckpts"
+hf_upload: false
+hf_save_steps: 1000
diff --git a/examples/sft/oasst-llama.yaml b/examples/sft/oasst-llama.yaml
@@ -0,0 +1,38 @@
+model_name_or_path: meta-llama/Llama-2-7b-hf
+
+# Training configuration
+trainer:
+  mp: p=f32,c=bfloat16
+  wandb:
+    project: "levanter-sft"
+    tags: ["llama2", "oasst"]
+  num_train_steps: 1218
+  train_batch_size: 128
+
+  # If using model parallelism
+  tensor_parallel_axes: ["mlp", "heads"]
+
+# Optimizer settings
+optimizer:
+  learning_rate: 2e-5
+  weight_decay: 0.0
+
+# Supervised data configuration
+supervised_data:
+  # For HF dataset
+  id: "databricks/databricks-dolly-15k"
+  input_field: "instruction"  # adjust based on dataset
+  output_field: "response"  # adjust based on dataset
+  cache_dir: "cache/dolly"
+
+# Model configuration
+max_tune_length: 2048
+trust_remote_code: false
+model_cache_dir: null
+
+# Checkpoint saving configuration
+hf_save_path: "sft_hf_ckpts"
+hf_upload: false
+hf_save_steps: 1000
+
+# python examples/sft/sft.py --config_path examples/sft/oasst-llama.yaml
diff --git a/examples/sft/tulu-llama-sft.yaml b/examples/sft/tulu-llama-sft.yaml
@@ -0,0 +1,51 @@
+# Model configuration
+model:
+  type: llama
+  seq_len: 2048
+  hidden_dim: 4096
+  intermediate_dim: 11008
+  num_layers: 32
+  num_heads: 32
+  num_kv_heads: 32
+  use_flash_attention: true
+  flash_attention_block_size: 512
+  use_bias: false
+  use_layer_norm_weight: false
+
+# Training configuration
+trainer:
+  mp: p=f32,c=bfloat16
+  tracker:
+    type: wandb
+    project: "levanter-sft"
+    tags: ["llama", "sft"]
+  num_train_steps: 750000
+  train_batch_size: 64
+  tensor_parallel_axes: ["mlp", "heads"]
+  fsdp_axis: "embed"
+  batch_axis: "batch"
+  steps_per_eval: 1000
+
+# Optimizer settings
+optimizer:
+  learning_rate: 2e-5
+  weight_decay: 0.0
+  min_lr_ratio: 0.1
+  warmup: 100
+
+# Supervised data configuration
+dataset_type: chat_jsonl
+chat_train_urls:
+  - "gs://marin-us-central2/documents/allenai--tulu-v2-sft-mixture-0ba27c/data/**/*.jsonl.gz"
+supervised_data:
+  cache_dir: "gs://levanter-checkpoints/marin/sft_cache/chat-data"
+messages_field: "messages"
+input_role: "user"
+output_role: "assistant"
+
+# Additional settings
+tokenizer: "EleutherAI/gpt-neox-20b"
+max_tune_length: 2048
+epoch: 0
+
+initialize_from_hf: false
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,8 +28,7 @@ dependencies = [
     "transformers>=4.41.2",
     "optax>=0.1.9",
     "wandb>=0.17.8",
-    "scipy<=1.12.0",
-    "draccus>=0.8.0",
+    "draccus>=0.9.3",
     "pyarrow>=11.0.0",
     "zstandard>=0.20.0",
     "datasets>=3.1.0,<4.0",

diff --git a/scripts/clean_old_checkpoints.py b/scripts/clean_old_checkpoints.py
@@ -19,7 +19,7 @@ def is_dir_of_checkpoints(path):
     return any("step-" in child for child in children)
 
 
-def list_deletable_directories(base_dir):
+def list_deletable_directories(base_dir, age):
     fs = fsspec.filesystem("gcs")
     run_ids = fs.ls(base_dir)
 
@@ -58,8 +58,8 @@ def list_deletable_directories(base_dir):
                     details = fs.ls(f"{path}/{file}", detail=True)
                     if details:
                         mtime = details[0]["mtime"]
-                        age = (datetime.now(timezone.utc) - mtime).days
-                        if age < AGE:
+                        this_age = (datetime.now(timezone.utc) - mtime).days
+                        if this_age < age:
                             new = True
                             break
 
@@ -74,9 +74,15 @@ def list_deletable_directories(base_dir):
 
 # Usage example:
 if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="List directories that can be deleted.")
+    parser.add_argument("base_dir", help="The base directory to clean up.", type=str, nargs="+")
+    parser.add_argument("--age", help="The age in days of the checkpoints to delete.", type=int, default=30)
+    args = parser.parse_args()
     if len(sys.argv) < 2:
         print("Usage: python clean_old_checkpoints.py <base_dir>")
         sys.exit(1)
-    for base_dir in sys.argv[1:]:
-        for path in list_deletable_directories(base_dir):
+    for base_dir in args.base_dir:
+        for path in list_deletable_directories(base_dir, args.age):
             print(f"gs://{path}")