Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix checkpoints averaging #201

Merged
merged 2 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/openmathinstruct2/training.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ ns train \
--nemo_model=/workspace/llama3.1-8b-nemo \
dgtm777 marked this conversation as resolved.
Show resolved Hide resolved
--num_nodes=8 \
--num_gpus=8 \
--average_steps 10000 20000 30000 40000 50000 60000 \
--average_steps=10000,20000,30000,40000,50000,60000 \
--training_data=/workspace/openmathinstruct2-sft.jsonl \
++model.data.train_ds.micro_batch_size=4 \
++model.tensor_model_parallel_size=4 \
Expand All @@ -103,7 +103,7 @@ ns train \
--nemo_model=/workspace/llama3.1-70b-nemo \
--num_nodes=32 \
--num_gpus=8 \
--average_steps 3330 6660 9990 13320 16650 20000 \
--average_steps=3330,6660,9990,13320,16650,20000 \
--training_data=/workspace/openmathinstruct2-sft-5M.jsonl \
++model.data.train_ds.micro_batch_size=1 \
++model.tensor_model_parallel_size=8 \
Expand Down
3 changes: 3 additions & 0 deletions nemo_skills/pipeline/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ def eval(
else:
log_dir = f"{output_dir}/eval-logs"

if " " in str(benchmarks):
raise ValueError("benchmarks should be separated with commas")

if server_address is None: # we need to host the model
assert server_gpus is not None, "Need to specify server_gpus if hosting the model"
server_address = "localhost:5000"
Expand Down
3 changes: 3 additions & 0 deletions nemo_skills/pipeline/summarize_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ def summarize_results(
"""Summarize results of an evaluation job."""
setup_logging(disable_hydra_logs=False, log_level=logging.INFO if not debug else logging.DEBUG)

if " " in str(benchmarks):
raise ValueError("benchmarks should be separated with commas")

cluster = cluster or os.environ.get("NEMO_SKILLS_CONFIG")

# copying results from the cluster if necessary
Expand Down
9 changes: 7 additions & 2 deletions nemo_skills/pipeline/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,9 @@ def train(
disable_wandb: bool = typer.Option(False, help="Disable wandb logging"),
with_sandbox: bool = typer.Option(False, help="If sandbox is required for code generation"),
partition: str = typer.Option(None, help="Specify partition for jobs"),
average_steps: list[int] = typer.Option(None, help="List of checkpoint steps to average"),
average_steps: str = typer.Option(
dgtm777 marked this conversation as resolved.
Show resolved Hide resolved
None, help="List of commas separated checkpoint steps to average. E.g 1000,5000"
),
run_after: str = typer.Option(None, help="Experiment to run after"),
config_dir: str = typer.Option(None, help="Can customize where we search for cluster configs"),
log_dir: str = typer.Option(None, help="Can specify a custom location for slurm logs. "),
Expand Down Expand Up @@ -200,6 +202,9 @@ def train(
if validation_data:
check_if_mounted(cluster_config, validation_data)

if " " in str(average_steps):
raise ValueError("average steps should be separated with commas")

train_cmd = get_training_cmd(
cluster_config=cluster_config,
partition=partition,
Expand Down Expand Up @@ -239,7 +244,7 @@ def train(
nemo_model=nemo_model,
output_dir=output_dir,
final_nemo_path=final_nemo_path,
average_steps=f"--steps {' '.join(map(str, average_steps))} " if average_steps else "",
average_steps=f"--steps {' '.join(average_steps.split(','))} " if average_steps else "",
)

add_task(
Expand Down
Loading