editing logging to resolve all checker issues
itayhubara committed Feb 29, 2024
1 parent aa8415d commit a8efc51
Showing 3 changed files with 24 additions and 16 deletions.
9 changes: 3 additions & 6 deletions llama2_70b_lora/run_llama_70B_scrolls_r16.sh
@@ -1,16 +1,13 @@
accelerate launch --config_file configs/default_config.yaml scripts/train.py \
--model_name meta-llama/Llama-2-70b-hf \
--dataset_path "./dataset" \
--model_path "./llama-v2-fused-qkv" \
--model_path "/software/users/ihubara/lora_clean/llama-v2-fused-qkv" \
--max_seq_len 8192 \
--bf16 True \
--logging_steps 2 \
--eval_steps 6 \
--save_steps 999 \
--logging_steps 32 \
--eval_steps 64 \
--output_dir "./results/llama-70b_scrolls_gov_report_r16_$1" \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 1 \
--dataset_text_field "input" \
--lr_scheduler_type "cosine" \
--learning_rate 5e-4 \
--warmup_ratio 0 \
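For orientation, a rough sketch of the sample accounting the new cadence implies, using the same global-batch-size arithmetic the callback below applies; the 8-process launch is an assumption, not something this script sets:

# Hypothetical sample accounting for --logging_steps 32 / --eval_steps 64.
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
world_size = 8  # assumption: 8 data-parallel workers; the real value comes from the launcher env
gbs = per_device_train_batch_size * gradient_accumulation_steps * world_size
print("samples per logging interval:", 32 * gbs)  # 256
print("samples per eval interval:", 64 * gbs)     # 512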
25 changes: 18 additions & 7 deletions llama2_70b_lora/scripts/mlperf_logging_utils.py
@@ -73,22 +73,27 @@ def end(self, key, value=None, metadata=None, sync=False, log_rank=None):
class MLPerfCallback(TrainerCallback):
"A callback that prints a message at the beginning of training"

def __init__(self, logger, train_dataset_length, eval_dataset_length):
def __init__(self, logger, train_dataset_length, eval_dataset_length, lora_alpha):
super().__init__()
self.mllogger = logger
self.submission_info = {
"submission_benchmark": "llama2_70b_lora",
"submission_division": "Closed",
"submission_division": "closed",
"submission_org": "referece",
"submission_platform": "referece",
"submission_poc_name": "referece",
"submission_poc_email": "referece",
"submission_status": "referece",
"submission_status": "onprem",
"train_dataset_length": train_dataset_length,
"eval_dataset_length": eval_dataset_length,
"lora_alpha": lora_alpha
}

def on_train_begin(self, args, state, control, **kwargs):
self.gbs = args.per_device_train_batch_size * args.gradient_accumulation_steps * int(os.getenv("WORLD_SIZE", 1))
self.mllogger.event(
key=constants.CACHE_CLEAR, value="True",
)
self.mllogger.event(
key=constants.SUBMISSION_BENCHMARK,
value=self.submission_info["submission_benchmark"],
@@ -133,9 +138,15 @@ def on_train_begin(self, args, state, control, **kwargs):
self.mllogger.event(key=constants.SEED, value=args.seed)
self.mllogger.event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_ratio)
self.mllogger.event(key=constants.OPT_LR_TRAINING_STEPS, value=args.max_steps)
self.mllogger.event(key=constants.OPT_ADAMW_WEIGHT_DECAY, value=args.weight_decay)
self.mllogger.event(key=constants.OPT_GRADIENT_CLIP_NORM, value=args.max_grad_norm)
self.mllogger.event(key=constants.OPT_BASE_LR, value=args.learning_rate)
self.mllogger.event(key=constants.LORA_ALPHA, value=args.lora_alpha)
self.mllogger.event(key=constants.LORA_ALPHA, value=self.submission_info["lora_alpha"])
self.mllogger.event(key='lora_rank', value=16)
self.mllogger.event(key=constants.GRADIENT_ACCUMULATION_STEPS, value=args.gradient_accumulation_steps)
self.mllogger.start(key=constants.INIT_START, value="")
# device warmup should be done here
self.mllogger.end(key=constants.INIT_STOP, value="")
self.mllogger.start(constants.RUN_START, value="")

def on_step_begin(
@@ -168,9 +179,9 @@ def on_step_begin(
metadata={"step_num": state.log_history[-1]["step"]},
)
self.mllogger.event(
"eval_loss",
constants.EVAL_ACCURACY,
value=state.log_history[-1]["eval_loss"],
metadata={"step_num": state.log_history[-1]["step"]},
metadata={"samples_num": state.log_history[-1]["step"]*self.gbs},
)
self.mllogger.start(
constants.BLOCK_START,
@@ -187,7 +198,7 @@ def on_step_begin(
constants.RUN_STOP,
value=eval_loss_list[-1],
metadata={
"step_num": state.log_history[-1]["step"],
"samples_num": state.log_history[-1]["step"]*self.gbs,
"status": "success",
},
)
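The EVAL_ACCURACY and RUN_STOP metadata now carry samples_num instead of step_num; a minimal standalone sketch of that conversion (the helper names are illustrative, not part of the repository):

import os

def global_batch_size(per_device_batch_size, grad_accum_steps):
    # Mirrors the callback's on_train_begin arithmetic; WORLD_SIZE arrives as a
    # string from the environment, hence the int() cast.
    return per_device_batch_size * grad_accum_steps * int(os.getenv("WORLD_SIZE", 1))

def samples_num(global_step, gbs):
    # The value written to the samples_num metadata field.
    return global_step * gbs

gbs = global_batch_size(1, 1)   # e.g. 8 when launched with WORLD_SIZE=8
print(samples_num(384, gbs))    # step 384 corresponds to 384 * gbs samples seen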
6 changes: 3 additions & 3 deletions llama2_70b_lora/scripts/train.py
@@ -38,8 +38,8 @@ class ScriptArguments:
max_grad_norm: Optional[float] = field(default=0.0)
weight_decay: Optional[float] = field(default=0.001)
lora_alpha: Optional[int] = field(default=32)
lora_dropout: Optional[float] = field(default=0.1, metadata={"lora dropout is a fixed to 0.1 in closed submission"})
lora_r: Optional[int] = field(default=16, metadata={"lora rank is a fixed to 16 in closed submission"})
lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "lora dropout is fixed to 0.1 in the closed submission"})
lora_r: Optional[int] = field(default=16, metadata={"help": "lora rank is fixed to 16 in the closed submission"})
lora_target_modules: Optional[str] = field(
default=None,
metadata={
@@ -185,7 +198,7 @@ def main(args):
args=training_arguments,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset))],
callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset), args.lora_alpha)],
)
trainer.accelerator.print(f"{trainer.model}")
if args.use_peft_lora:
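One note on the ScriptArguments change above: the old metadata values ({"lora dropout is a fixed to 0.1 ..."}) are set literals rather than dicts, which dataclasses.field rejects because it requires a mapping, and the new form follows the {"help": ...} convention that HfArgumentParser reads when building the CLI. A minimal sketch of the corrected pattern, with an illustrative stand-in class:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class LoraArgs:  # illustrative stand-in for the repo's ScriptArguments
    lora_alpha: Optional[int] = field(default=32)
    lora_dropout: Optional[float] = field(
        default=0.1,
        metadata={"help": "lora dropout is fixed to 0.1 in the closed submission"},
    )
    lora_r: Optional[int] = field(
        default=16,
        metadata={"help": "lora rank is fixed to 16 in the closed submission"},
    )

args = LoraArgs()
# args.lora_alpha is the value train.py now forwards to MLPerfCallback so that
# LORA_ALPHA appears in the MLPerf log.
print(args.lora_alpha, args.lora_r, args.lora_dropout)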
