Commit
add fix to recipe (#11368)
* add fix to recipe

* Apply isort and black reformatting

Signed-off-by: JRD971000 <[email protected]>

---------

Signed-off-by: JRD971000 <[email protected]>
Co-authored-by: Ali Taghibakhshi <[email protected]>
Co-authored-by: JRD971000 <[email protected]>
3 people authored Nov 22, 2024
1 parent e731d48 commit 94dbd50
Showing 8 changed files with 332 additions and 121 deletions.
1 change: 1 addition & 0 deletions nemo/collections/llm/gpt/model/ssm.py
@@ -290,6 +290,7 @@ class BaseMambaConfig2_7B(SSMConfig):
 @dataclass
 class NVIDIAMambaConfig8B(SSMConfig):
     hybrid_override_pattern: str = "M" * 56
+    num_attention_heads: int = 32
     num_layers: int = 56
     seq_length: int = 4096
     hidden_size: int = 4096
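
With this one-line fix, NVIDIAMambaConfig8B declares its attention-head count explicitly on the 8B hybrid config. A minimal sketch of how the field surfaces, assuming only the import path and field values visible in the diff above (everything else is illustrative):

```python
# Sketch under the assumptions stated above; values mirror the diff.
from nemo.collections.llm.gpt.model.ssm import NVIDIAMambaConfig8B

config = NVIDIAMambaConfig8B()
print(config.num_attention_heads)      # 32, now set explicitly by this fix
print(config.num_layers)               # 56
print(config.hybrid_override_pattern)  # "M" * 56, i.e. an all-Mamba layer pattern
```
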
64 changes: 47 additions & 17 deletions nemo/collections/llm/recipes/mamba2_130m.py
@@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
     )


+@run.cli.factory(target=finetune, name=NAME)
 def trainer(
     tensor_parallelism: int = 1,
     pipeline_parallelism: int = 1,
@@ -76,7 +77,11 @@ def trainer(
     sequence_parallelism: bool = False,
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
-    max_steps: int = 1168251,
+    max_steps: int = 100,
+    val_check_interval: int = 100,
+    limit_test_batches: int = 50,
+    limit_val_batches: int = 32,
+    log_every_n_steps: int = 10,
     callbacks: Optional[list[run.Config[Callback]]] = None,
 ) -> run.Config[nl.Trainer]:
     """
@@ -137,15 +142,15 @@ def trainer(
         accumulate_grad_batches=1,
         callbacks=callbacks,
         devices=num_gpus_per_node,
-        limit_test_batches=50,
-        limit_val_batches=32,
-        log_every_n_steps=10,
         max_steps=max_steps,
         num_nodes=num_nodes,
         plugins=bf16_mixed(),
         strategy=strategy,
         use_distributed_sampler=False,
-        val_check_interval=2000,
+        val_check_interval=val_check_interval,
+        limit_test_batches=limit_test_batches,
+        limit_val_batches=limit_val_batches,
+        log_every_n_steps=log_every_n_steps,
     )

     return trainer
@@ -158,6 +163,16 @@ def pretrain_recipe(
     tokenizer_model: str = None,
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
+    tensor_parallelism: int = 1,
+    pipeline_parallelism: int = 1,
+    max_steps: int = 100,
+    val_check_interval: int = 100,
+    limit_test_batches: int = 50,
+    limit_val_batches: int = 32,
+    log_every_n_steps: int = 10,
+    seq_length: int = 4096,
+    gbs: int = 8,
+    mbs: int = 1,
     fn=pretrain,
 ) -> run.Partial:
     """
@@ -193,16 +208,23 @@ def pretrain_recipe(
         fn,
         model=model(),
         trainer=trainer(
+            max_steps=max_steps,
             num_nodes=num_nodes,
+            tensor_parallelism=tensor_parallelism,
+            pipeline_parallelism=pipeline_parallelism,
             num_gpus_per_node=num_gpus_per_node,
+            val_check_interval=val_check_interval,
+            limit_test_batches=limit_test_batches,
+            limit_val_batches=limit_val_batches,
+            log_every_n_steps=log_every_n_steps,
             callbacks=[run.Config(TimingCallback)],
         ),
         data=run.Config(
             MockDataModule,
-            seq_length=4096,
-            global_batch_size=8,
-            micro_batch_size=1,
-            tokenizer=tokenizer(tokenizer_model=tokenizer_model),
+            seq_length=seq_length,
+            global_batch_size=gbs,
+            micro_batch_size=mbs,
+            tokenizer=tokenizer(),
         ),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
         optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
@@ -218,6 +240,14 @@ def finetune_recipe(
     tokenizer_model: str = None,
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
+    tensor_model_parallel_size: int = 1,
+    pipeline_model_parallel_size: int = 1,
+    seq_length: int = 4096,
+    max_steps: int = 100,
+    val_check_interval: int = 100,
+    limit_test_batches: int = 50,
+    limit_val_batches: int = 32,
+    log_every_n_steps: int = 10,
     gbs: int = 8,
     mbs: int = 1,
     peft_scheme: Optional[str] = 'none',
@@ -266,8 +296,8 @@ def finetune_recipe(
     )
     strategy = run.Config(
         nl.MegatronStrategy,
-        tensor_model_parallel_size=1,
-        pipeline_model_parallel_size=1,
+        tensor_model_parallel_size=tensor_model_parallel_size,
+        pipeline_model_parallel_size=pipeline_model_parallel_size,
         gradient_as_bucket_view=True,
         ckpt_load_optimizer=False,
         ckpt_save_optimizer=False,
@@ -283,10 +313,11 @@
         accelerator="gpu",
         accumulate_grad_batches=1,
         devices=num_gpus_per_node,
-        limit_test_batches=10,
-        limit_val_batches=10,
-        log_every_n_steps=20,
-        max_steps=100,
+        max_steps=max_steps,
+        val_check_interval=val_check_interval,
+        limit_test_batches=limit_test_batches,
+        limit_val_batches=limit_val_batches,
+        log_every_n_steps=log_every_n_steps,
         num_nodes=num_nodes,
         plugins=run.Config(
             nl.MegatronMixedPrecision,
@@ -296,15 +327,14 @@
         callbacks=[checkpoint_callback],
         strategy=strategy,
         use_distributed_sampler=False,
-        val_check_interval=20,
     )
     recipe = run.Partial(
         llm.finetune,
         model=model(tokenizer_model=tokenizer_model),
         trainer=trainer,
         data=run.Config(
             llm.SquadDataModule,
-            seq_length=2048,
+            seq_length=seq_length,
             global_batch_size=gbs,
             micro_batch_size=mbs,
             tokenizer=tokenizer(tokenizer_model=tokenizer_model),
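
The net effect for mamba2_130m.py is that trainer, pretrain_recipe, and finetune_recipe now expose the previously hard-coded step, validation, batch-size, and parallelism settings as arguments. A minimal sketch of how the new knobs might be passed to the fine-tuning recipe; the argument names and defaults come from the updated signatures above, while the nemo_run usage, tokenizer path, and executor choice are assumptions for illustration:

```python
# Sketch only: recipe arguments mirror the updated finetune_recipe signature;
# the tokenizer path and the executor call are illustrative assumptions.
import nemo_run as run
from nemo.collections.llm.recipes import mamba2_130m

recipe = mamba2_130m.finetune_recipe(
    tokenizer_model="/path/to/tokenizer.model",  # placeholder path
    num_nodes=1,
    num_gpus_per_node=8,
    tensor_model_parallel_size=1,   # previously hard-coded to 1 in MegatronStrategy
    pipeline_model_parallel_size=1,
    seq_length=4096,                # previously fixed at 2048 for the SQuAD data module
    max_steps=100,
    val_check_interval=100,
    gbs=8,
    mbs=1,
    peft_scheme="none",
)

run.run(recipe, executor=run.LocalExecutor())  # assumed local execution; adjust per cluster
```
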
68 changes: 49 additions & 19 deletions nemo/collections/llm/recipes/mamba2_1_3b.py
@@ -67,6 +67,7 @@ def model(tokenizer_model: str = None) -> run.Config[pl.LightningModule]:
     )


+@run.cli.factory(target=finetune, name=NAME)
 def trainer(
     tensor_parallelism: int = 1,
     pipeline_parallelism: int = 1,
@@ -76,7 +77,11 @@ def trainer(
     sequence_parallelism: bool = False,
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
-    max_steps: int = 1168251,
+    max_steps: int = 100,
+    val_check_interval: int = 100,
+    limit_test_batches: int = 50,
+    limit_val_batches: int = 32,
+    log_every_n_steps: int = 10,
     callbacks: Optional[list[run.Config[Callback]]] = None,
 ) -> run.Config[nl.Trainer]:
     """
@@ -137,15 +142,15 @@ def trainer(
         accumulate_grad_batches=1,
         callbacks=callbacks,
         devices=num_gpus_per_node,
-        limit_test_batches=50,
-        limit_val_batches=32,
-        log_every_n_steps=10,
         max_steps=max_steps,
         num_nodes=num_nodes,
         plugins=bf16_mixed(),
         strategy=strategy,
         use_distributed_sampler=False,
-        val_check_interval=2000,
+        val_check_interval=val_check_interval,
+        limit_test_batches=limit_test_batches,
+        limit_val_batches=limit_val_batches,
+        log_every_n_steps=log_every_n_steps,
     )

     return trainer
@@ -157,7 +162,17 @@ def pretrain_recipe(
     name: str = "default",
     tokenizer_model: str = None,
     num_nodes: int = 1,
-    num_gpus_per_node: int = 8,
+    num_gpus_per_node: int = 1,
+    tensor_parallelism: int = 1,
+    pipeline_parallelism: int = 1,
+    max_steps: int = 100,
+    val_check_interval: int = 100,
+    limit_test_batches: int = 50,
+    limit_val_batches: int = 32,
+    log_every_n_steps: int = 10,
+    seq_length: int = 4096,
+    gbs: int = 8,
+    mbs: int = 1,
     fn=pretrain,
 ) -> run.Partial:
     """
@@ -191,17 +206,24 @@
     """
     return run.Partial(
         fn,
-        model=model(),
+        model=model(tokenizer_model=tokenizer_model),
         trainer=trainer(
+            max_steps=max_steps,
             num_nodes=num_nodes,
+            tensor_parallelism=tensor_parallelism,
+            pipeline_parallelism=pipeline_parallelism,
             num_gpus_per_node=num_gpus_per_node,
+            val_check_interval=val_check_interval,
+            limit_test_batches=limit_test_batches,
+            limit_val_batches=limit_val_batches,
+            log_every_n_steps=log_every_n_steps,
             callbacks=[run.Config(TimingCallback)],
         ),
         data=run.Config(
             MockDataModule,
-            seq_length=4096,
-            global_batch_size=8,
-            micro_batch_size=1,
+            seq_length=seq_length,
+            global_batch_size=gbs,
+            micro_batch_size=mbs,
             tokenizer=tokenizer(tokenizer_model=tokenizer_model),
         ),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
@@ -217,7 +239,15 @@ def finetune_recipe(
     resume_path: str = None,
     tokenizer_model: str = None,
     num_nodes: int = 1,
-    num_gpus_per_node: int = 8,
+    num_gpus_per_node: int = 1,
+    tensor_model_parallel_size: int = 1,
+    pipeline_model_parallel_size: int = 1,
+    seq_length: int = 4096,
+    max_steps: int = 100,
+    val_check_interval: int = 100,
+    limit_test_batches: int = 50,
+    limit_val_batches: int = 32,
+    log_every_n_steps: int = 10,
     gbs: int = 8,
     mbs: int = 1,
     peft_scheme: Optional[str] = 'none',
@@ -266,8 +296,8 @@ def finetune_recipe(
     )
     strategy = run.Config(
         nl.MegatronStrategy,
-        tensor_model_parallel_size=1,
-        pipeline_model_parallel_size=1,
+        tensor_model_parallel_size=tensor_model_parallel_size,
+        pipeline_model_parallel_size=pipeline_model_parallel_size,
         gradient_as_bucket_view=True,
         ckpt_load_optimizer=False,
         ckpt_save_optimizer=False,
@@ -283,10 +313,11 @@
         accelerator="gpu",
         accumulate_grad_batches=1,
         devices=num_gpus_per_node,
-        limit_test_batches=10,
-        limit_val_batches=10,
-        log_every_n_steps=20,
-        max_steps=100,
+        max_steps=max_steps,
+        val_check_interval=val_check_interval,
+        limit_test_batches=limit_test_batches,
+        limit_val_batches=limit_val_batches,
+        log_every_n_steps=log_every_n_steps,
         num_nodes=num_nodes,
         plugins=run.Config(
             nl.MegatronMixedPrecision,
@@ -296,15 +327,14 @@
         callbacks=[checkpoint_callback],
         strategy=strategy,
         use_distributed_sampler=False,
-        val_check_interval=20,
     )
     recipe = run.Partial(
         llm.finetune,
         model=model(tokenizer_model=tokenizer_model),
         trainer=trainer,
         data=run.Config(
             llm.SquadDataModule,
-            seq_length=2048,
+            seq_length=seq_length,
             global_batch_size=gbs,
             micro_batch_size=mbs,
             tokenizer=tokenizer(tokenizer_model=tokenizer_model),
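
mamba2_1_3b.py gets the same parameterization, and its pretrain_recipe now also forwards tokenizer_model to model() and defaults num_gpus_per_node to 1. A minimal sketch of a pretraining configuration using the new arguments; names and defaults are taken from the diff, while the module import path and placeholder tokenizer path are assumptions:

```python
# Sketch only: argument names mirror the updated pretrain_recipe signature above;
# the tokenizer path is a placeholder.
from nemo.collections.llm.recipes import mamba2_1_3b

recipe = mamba2_1_3b.pretrain_recipe(
    name="mamba2_1_3b_pretrain",
    tokenizer_model="/path/to/tokenizer.model",  # now passed through to model()
    num_nodes=1,
    num_gpus_per_node=1,      # new default for this recipe
    tensor_parallelism=1,
    pipeline_parallelism=1,
    max_steps=100,            # replaces the old 1168251-step trainer default
    val_check_interval=100,
    seq_length=4096,
    gbs=8,
    mbs=1,
)
# As with the 130m recipe, the resulting run.Partial would then be handed to a nemo_run executor.
```
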
(Diffs for the remaining five changed files are not included in this view.)
