diff --git a/tests/fixtures/configs/train/reinforce/gemma.json b/tests/fixtures/configs/train/reinforce/gemma.json
index e6409b95..71e6a988 100644
--- a/tests/fixtures/configs/train/reinforce/gemma.json
+++ b/tests/fixtures/configs/train/reinforce/gemma.json
@@ -1,188 +1,188 @@
 {
-    "train_dataset_settings": {
-        "sources": [
-            {
-                "name": "train_chat",
-                "records_path": "tests/fixtures/datasets/chat/train_chat.jsonl",
-                "sample_rate": 1.0
-            }
-        ],
-        "prompt_template": {
-            "role_tag_mapping": {
-                "bot": "assistant",
-                "user": "user",
-                "system": "system"
-            },
-            "prefix_template": "<|start_header_id|>{role}<|end_header_id|>\n\n",
-            "suffix_template": "<|eot_id|>"
-        },
-        "dataset_type": "chat",
-        "max_tokens_count": 2000,
-        "only_answer_loss": true
-    },
-    "val_dataset_settings": {
-        "sources": [
-            {
-                "name": "val_chat",
-                "records_path": "tests/fixtures/datasets/chat/train_chat.jsonl",
-                "sample_rate": 1.0
-            }
-        ],
-        "prompt_template": {
-            "role_tag_mapping": {
-                "bot": "assistant",
-                "user": "user",
-                "system": "system"
-            },
-            "prefix_template": "<|start_header_id|>{role}<|end_header_id|>\n\n",
-            "suffix_template": "<|eot_id|>"
-        },
-        "dataset_type": "chat",
-        "max_tokens_count": 2000,
-        "only_answer_loss": true
-    },
-    "cherry_pick_settings": {
-        "generator_transformers_settings": {
-            "num_beams": 1,
-            "do_sample": false,
-            "stop_strings": "",
-            "max_new_tokens": 8
-        },
-        "custom_generation_settings": {
-            "skip_special_tokens": false
-        },
-        "dataset_settings": {
-            "sources": [
-                {
-                    "name": "chat_test",
-                    "records_path": "tests/fixtures/datasets/chat/train_chat.jsonl",
-                    "num_samples": 2
-                }
-            ],
-            "prompt_template": {
-                "role_tag_mapping": {
-                    "bot": "assistant",
-                    "user": "user",
-                    "system": "system"
-                },
-                "prefix_template": "<|start_header_id|>{role}<|end_header_id|>\n\n",
-                "suffix_template": "<|eot_id|>"
-            },
-            "dataset_type": "chat",
-            "max_tokens_count": 150,
-            "only_answer_loss": true
-        },
-        "metric_settings": [
-            {
-                "type": "length",
-                "parameters": {
-                    "need_average": [
-                        true
-                    ]
-                }
-            },
-            {
-                "type": "kl",
-                "parameters": {
-                    "need_average": [
-                        true
-                    ],
-                    "ref_logits_type": "sft"
-                }
-            },
-            {
-                "type": "kl",
-                "parameters": {
-                    "need_average": [
-                        true
-                    ],
-                    "ref_logits_type": "reference"
-                }
-            }
-        ]
-    },
-    "reward_model_settings": {
-        "model_path": "/from_s3/llama-3.2-1B",
-        "model_type": "seq_cls",
-        "model_kwargs": {
-            "num_labels": 1,
-            "attn_implementation": "flash_attention_2"
-        },
-        "transformers_settings": {}
-    },
-    "model_settings": {
-        "model_path": "/from_s3/llama-3.2-1B",
-        "model_type": "causal",
-        "model_kwargs": {
-            "attn_implementation": "flash_attention_2"
-        },
-        "transformers_settings": {},
-        "liger_kernels_settings": {}
-    },
-    "tokenizer_settings": {
-        "tokenizer_kwargs": {
-            "padding_side": "left"
-        }
-    },
-    "trainer_settings": {
-        "actor_settings": {
-            "actor_type": "distributed_vllm",
-            "vllm_num_engines": 2,
-            "vllm_tensor_parallel_size": 1
-        },
-        "deepspeed": "tests/fixtures/configs/train/reinforce/deepspeed_cfg.json",
-        "gradient_checkpointing": true,
-        "gradient_checkpointing_kwargs": {
-            "use_reentrant": false
-        },
-        "critic_type": "ray_transformers",
-        "reward_processor_type": "rloo",
-        "evaluation_strategy": "steps",
-        "num_generations": 4,
-        "per_device_train_batch_size": 1,
-        "per_device_eval_batch_size": 1,
-        "gradient_accumulation_steps": 1,
-        "adam_beta1": 0.9,
-        "adam_beta2": 0.95,
-        "adam_epsilon": 0.00001,
-        "eval_steps": 10,
-        "save_steps": 100,
-        "save_strategy": "steps",
-        "load_best_model_at_end": false,
-        "logging_steps": 1,
-        "learning_rate": 0.00001,
-        "max_steps": 1001,
-        "lr_scheduler_type": "constant_with_warmup",
-        "warmup_ratio": 0.03,
-        "fp16": false,
-        "bf16": true,
-        "optim": "adamw_torch",
-        "weight_decay": 0.0,
-        "max_grad_norm": 2,
-        "save_total_limit": 11,
-        "dataloader_num_workers": 12,
-        "stop_token": "",
-        "temperature": 0.7,
-        "non_eos_penalty": true,
-        "penalty_reward_value": -1,
-        "clip_rewards_min": -1e+8,
-        "clip_rewards_max": 1e+8,
-        "whiten_rewards": false,
-        "kl_coef": 0.05,
-        "mean_baseline_coef": 0.95,
-        "num_samples_for_reward_stats": 1000,
-        "no_cuda": false
-    },
-    "special_tokens_settings": {
-        "bos_token": "",
-        "eos_token": "",
-        "pad_token": ""
-    },
-    "logging_settings": {
-        "project_name": "alignment",
-        "run_name": "sft",
-        "entity": "turbo-alignment"
-    },
-    "seed": 0,
-    "log_path": "train_output"
+    "train_dataset_settings": {
+        "sources": [
+            {
+                "name": "train_chat",
+                "records_path": "tests/fixtures/datasets/chat/train_chat.jsonl",
+                "sample_rate": 1.0
+            }
+        ],
+        "prompt_template": {
+            "role_tag_mapping": {
+                "bot": "assistant",
+                "user": "user",
+                "system": "system"
+            },
+            "prefix_template": "<|start_header_id|>{role}<|end_header_id|>\n\n",
+            "suffix_template": "<|eot_id|>"
+        },
+        "dataset_type": "chat",
+        "max_tokens_count": 2000,
+        "only_answer_loss": true
+    },
+    "val_dataset_settings": {
+        "sources": [
+            {
+                "name": "val_chat",
+                "records_path": "tests/fixtures/datasets/chat/train_chat.jsonl",
+                "sample_rate": 1.0
+            }
+        ],
+        "prompt_template": {
+            "role_tag_mapping": {
+                "bot": "assistant",
+                "user": "user",
+                "system": "system"
+            },
+            "prefix_template": "<|start_header_id|>{role}<|end_header_id|>\n\n",
+            "suffix_template": "<|eot_id|>"
+        },
+        "dataset_type": "chat",
+        "max_tokens_count": 2000,
+        "only_answer_loss": true
+    },
+    "cherry_pick_settings": {
+        "generator_transformers_settings": {
+            "num_beams": 1,
+            "do_sample": false,
+            "stop_strings": "",
+            "max_new_tokens": 8
+        },
+        "custom_generation_settings": {
+            "skip_special_tokens": false
+        },
+        "dataset_settings": {
+            "sources": [
+                {
+                    "name": "chat_test",
+                    "records_path": "tests/fixtures/datasets/chat/train_chat.jsonl",
+                    "num_samples": 2
+                }
+            ],
+            "prompt_template": {
+                "role_tag_mapping": {
+                    "bot": "assistant",
+                    "user": "user",
+                    "system": "system"
+                },
+                "prefix_template": "<|start_header_id|>{role}<|end_header_id|>\n\n",
+                "suffix_template": "<|eot_id|>"
+            },
+            "dataset_type": "chat",
+            "max_tokens_count": 150,
+            "only_answer_loss": true
+        },
+        "metric_settings": [
+            {
+                "type": "length",
+                "parameters": {
+                    "need_average": [
+                        true
+                    ]
+                }
+            },
+            {
+                "type": "kl",
+                "parameters": {
+                    "need_average": [
+                        true
+                    ],
+                    "ref_logits_type": "sft"
+                }
+            },
+            {
+                "type": "kl",
+                "parameters": {
+                    "need_average": [
+                        true
+                    ],
+                    "ref_logits_type": "reference"
+                }
+            }
+        ]
+    },
+    "reward_model_settings": {
+        "model_path": "/from_s3/llama-3.2-1B",
+        "model_type": "seq_cls",
+        "model_kwargs": {
+            "num_labels": 1,
+            "attn_implementation": "flash_attention_2"
+        },
+        "transformers_settings": {}
+    },
+    "model_settings": {
+        "model_path": "/from_s3/llama-3.2-1B",
+        "model_type": "causal",
+        "model_kwargs": {
+            "attn_implementation": "flash_attention_2"
+        },
+        "transformers_settings": {},
+        "liger_kernels_settings": {}
+    },
+    "tokenizer_settings": {
+        "tokenizer_kwargs": {
+            "padding_side": "left"
+        }
+    },
+    "trainer_settings": {
+        "actor_settings": {
+            "actor_type": "distributed_vllm",
+            "vllm_num_engines": 2,
+            "vllm_tensor_parallel_size": 1
+        },
+        "deepspeed": "tests/fixtures/configs/train/reinforce/deepspeed_cfg.json",
+        "gradient_checkpointing": true,
+        "gradient_checkpointing_kwargs": {
+            "use_reentrant": false
+        },
+        "critic_type": "ray_transformers",
+        "reward_processor_type": "rloo",
+        "evaluation_strategy": "steps",
+        "num_generations": 4,
+        "per_device_train_batch_size": 1,
+        "per_device_eval_batch_size": 1,
+        "gradient_accumulation_steps": 1,
+        "adam_beta1": 0.9,
+        "adam_beta2": 0.95,
+        "adam_epsilon": 0.00001,
+        "eval_steps": 10,
+        "save_steps": 100,
+        "save_strategy": "steps",
+        "load_best_model_at_end": false,
+        "logging_steps": 1,
+        "learning_rate": 0.00001,
+        "max_steps": 1001,
+        "lr_scheduler_type": "constant_with_warmup",
+        "warmup_ratio": 0.03,
+        "fp16": false,
+        "bf16": true,
+        "optim": "adamw_torch",
+        "weight_decay": 0.0,
+        "max_grad_norm": 2,
+        "save_total_limit": 11,
+        "dataloader_num_workers": 12,
+        "stop_token": "",
+        "temperature": 0.7,
+        "non_eos_penalty": true,
+        "penalty_reward_value": -1,
+        "clip_rewards_min": -1e+8,
+        "clip_rewards_max": 1e+8,
+        "whiten_rewards": false,
+        "kl_coef": 0.05,
+        "mean_baseline_coef": 0.95,
+        "num_samples_for_reward_stats": 1000,
+        "no_cuda": false
+    },
+    "special_tokens_settings": {
+        "bos_token": "",
+        "eos_token": "",
+        "pad_token": ""
+    },
+    "logging_settings": {
+        "project_name": "alignment",
+        "run_name": "sft",
+        "entity": "turbo-alignment"
+    },
+    "seed": 0,
+    "log_path": "train_output"
 }
\ No newline at end of file
diff --git a/turbo_alignment/cli/train.py b/turbo_alignment/cli/train.py
index 0e98def6..d04a89c1 100755
--- a/turbo_alignment/cli/train.py
+++ b/turbo_alignment/cli/train.py
@@ -130,7 +130,7 @@ def reinforce_training(
 
     experiment_settings = pipeline_settings.REINFORCETrainExperimentSettings.parse_file(experiment_settings_path)
 
-    policy_models = RayGroup(num_nodes=1, num_gpus_per_node=8, ray_actor_type=pipelines.TrainREINFORCEStrategy)
+    policy_models = RayGroup(num_nodes=2, num_gpus_per_node=8, ray_actor_type=pipelines.TrainREINFORCEStrategy)#64.19 GiB is allocated by PyTorch, and 3.40 GiB
     reward_model = RayGroup(num_nodes=1, num_gpus_per_node=1, ray_actor_type=RewardModel)
     reference_model = RayGroup(num_nodes=1, num_gpus_per_node=1, ray_actor_type=ReferenceModel)
 
diff --git a/turbo_alignment/trainers/online/ray/distributed_torch_ray_actor.py b/turbo_alignment/trainers/online/ray/distributed_torch_ray_actor.py
index 6604ecdf..03ee3cbc 100644
--- a/turbo_alignment/trainers/online/ray/distributed_torch_ray_actor.py
+++ b/turbo_alignment/trainers/online/ray/distributed_torch_ray_actor.py
@@ -33,6 +33,7 @@ def __init__(self, world_size, rank, local_rank, master_addr, master_port):
 
     @staticmethod
     def _get_current_node_ip():
         address = ray._private.services.get_node_ip_address()
+        print(f'dist ray actor DEBUG: {address=}', flush=True)
         # strip ipv6 address
         return address.strip("[]")
diff --git a/turbo_alignment/trainers/online/ray/rayactor_group.py b/turbo_alignment/trainers/online/ray/rayactor_group.py
index ad02a387..6ef4c2a9 100644
--- a/turbo_alignment/trainers/online/ray/rayactor_group.py
+++ b/turbo_alignment/trainers/online/ray/rayactor_group.py
@@ -58,7 +58,7 @@ def _initiate_actors(self, pg, num_gpus_per_actor):
                 for i in range(len(bundles)):
                     bundles[i][resources_name] = self._num_resources_per_node
 
-            pg = placement_group(bundles, strategy="STRICT_PACK")
+            pg = placement_group(bundles, strategy="PACK")
             ray.get(pg.ready())
         if pg:
             master_actor = self.ray_actor_type.options(
diff --git a/turbo_alignment/trainers/online/reference_actor.py b/turbo_alignment/trainers/online/reference_actor.py
index 7aeda652..bcbf6337 100644
--- a/turbo_alignment/trainers/online/reference_actor.py
+++ b/turbo_alignment/trainers/online/reference_actor.py
@@ -3,6 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import torch.nn.functional as F
+import gc
 
 @ray.remote(num_gpus=1)
 class ReferenceModel(DistributedTorchRayActor):
@@ -13,7 +14,7 @@ def __init__(self, world_size, rank, local_rank, master_addr, master_port):
 
     def init_model_from_pretrained(self, pretrain):
         self._setup_distributed()
-        self.model = AutoModelForCausalLM.from_pretrained(pretrain, device_map='cuda', torch_dtype=torch.bfloat16)
+        self.model = AutoModelForCausalLM.from_pretrained(pretrain, device_map='cuda', torch_dtype=torch.bfloat16) #attn_implementation='flash_attention_2'
         self.tokenizer = AutoTokenizer.from_pretrained(pretrain, trust_remote_code=True)
         print(f"Reference model initialized on Node {self.node_id}, Local Rank {self.local_rank}")
         print("GPU IDs: {}".format(ray.get_runtime_context().get_accelerator_ids()["GPU"]))
@@ -40,13 +41,21 @@ def reference_forward(self, x, temperature, loss_mask):
 
         self.model.eval()
         x = {k: v.cuda() for k, v in x.items()}
 
-        logits = self.model(**x).logits[:, :-1]
+        print(f"{x.keys()}")
+        logits = self.model(**x).logits[:, :-1] # 35GB
         logits /= temperature
-        all_logprob = F.log_softmax(logits, dim=-1)
-
-        input_ids = x['input_ids']
-        logprob = torch.gather(all_logprob, 2, input_ids[:, 1:].unsqueeze(-1)).squeeze(-1)
+        #logits = F.log_softmax(logits, dim=-1) # 35GB
+        '''
+        Memory Efficient implementation of log_softmax using in_place operation
+        '''
+        torch.exp(logits, out=logits)
+        summed = torch.sum(logits, dim=-1, keepdim=True)
+        logits /= summed
+        torch.log(logits, out=logits)
+
+        logprob = torch.gather(logits, 2, x['input_ids'][:, 1:].unsqueeze(-1)).squeeze(-1)
         logprob[~loss_mask[:, 1:].to(torch.bool)] = 0
+        out = logprob.sum(-1)
 
-        return logprob.sum(-1)
\ No newline at end of file
+        return out
\ No newline at end of file
diff --git a/turbo_alignment/trainers/online/reinforce.py b/turbo_alignment/trainers/online/reinforce.py
index e306b6bb..da482cb2 100644
--- a/turbo_alignment/trainers/online/reinforce.py
+++ b/turbo_alignment/trainers/online/reinforce.py
@@ -102,6 +102,7 @@ def __init__(
         start = time.time()
         if self.vllm_engines is not None and torch.distributed.get_rank() == 0:
             master_address = ray._private.services.get_node_ip_address()
+            print(f'TRAINER DEBUG: {master_address=}', flush=True)
             with socket.socket() as sock:
                 sock.bind(("", 0))
                 master_port = sock.getsockname()[1]
@@ -269,9 +270,9 @@ def get_answers_and_rewards(
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         import time
         import logging
-        # for k, v in inputs.items():
-        #     if isinstance(v, torch.Tensor):
-        #         print(f'get_answers_and_rewards:inputs: {k}, {v.device=}', flush=True)
+        for k, v in inputs.items():
+            if isinstance(v, torch.Tensor):
+                print(f'get_answers_and_rewards:inputs: {k}:{v.shape}', flush=True)
 
         if do_broadcast:
             # TODO: move to generator
@@ -301,7 +302,8 @@ def get_answers_and_rewards(
             logging.info(f'Decoded: {self.tokenizer.decode([response_ids[0].squeeze(0)[-1]])}')
 
         # Padding
-        max_length = 2048 # max(response_id.size(1) for response_id in response_ids)
+        max_length = 4096 # max(response_id.size(1) for response_id in response_ids) #45.89 GiB
+        logging.info(f'{max_length=}')
 
         def pad_sequences(sequences, max_length, pad_value=0):
             padded_sequences = []
@@ -314,7 +316,7 @@ def pad_sequences(sequences, max_length, pad_value=0):
 
         response_ids = pad_sequences(response_ids, max_length, pad_value=self.tokenizer.pad_token_id)
         response_attention_mask = pad_sequences(response_attention_mask, max_length, pad_value=0)
-        response_tokens_mask = torch.zeros(response_ids.shape, dtype=torch.float32)
+        response_tokens_mask = torch.zeros(response_ids.shape, dtype=torch.bfloat16)
         response_tokens_mask[:, generations[0].input_token_ids.shape[0] :] = 1.0
 
         position_ids = (response_attention_mask.cumsum(-1) - 1).clamp(min=0)
@@ -427,8 +429,25 @@ def get_batch_loss_metrics(
         return loss.mean(), metrics
 
     def compute_loss(self, model, inputs, return_outputs: bool = False, num_items_in_batch=None):
+        def print_readable_stats():
+            stats = torch.cuda.memory_stats()
+            keys_of_interest = [
+                "allocated_bytes.all.peak",
+                "allocated_bytes.current",
+                "reserved_bytes.all.peak",
+                "reserved_bytes.current",
+                "active_bytes.all.peak",
+                "active_bytes.current",
+            ]
+            for key in keys_of_interest:
+                print(f"{key}: {stats[key] / (1024 ** 2):.2f} MB")
+
+        print_readable_stats()
+
         import logging
         import time
+        import gc
+
         logging.info(f'{isinstance(model, DeepSpeedEngine)=}')
 
         start = time.time()
@@ -436,6 +455,8 @@ def compute_loss(self, model, inputs, return_outputs: bool = False, num_items_in
 
         self.store_metrics(metrics=metrics, train_eval='train')
         logging.info(f'Compute Loss elapsed time:{time.time() - start}')
+        gc.collect()
+        torch.cuda.empty_cache()
         return (loss, metrics) if return_outputs else loss
 
     def prediction_step(
diff --git a/turbo_alignment/trainers/online/reward_actor.py b/turbo_alignment/trainers/online/reward_actor.py
index c2adceb4..beceae2a 100644
--- a/turbo_alignment/trainers/online/reward_actor.py
+++ b/turbo_alignment/trainers/online/reward_actor.py
@@ -39,4 +39,9 @@ def forward(self, x):
         # print('\n\n\nRM model', [v.shape for k, v in x.items()], 'RM model\n\n\n')
         # print(self.tokenizer.decode(x['input_ids'][0], skip_special_tokens=False))
         #TODO reward from eos
+
+        for k, v in x.items():
+            if isinstance(v, torch.Tensor):
+                print(f'REWARD MODEL:{v.shape=}', flush=True)
+
         return self.model(**x).logits
\ No newline at end of file