From d37b5f10c439742382e9a98d843477259a1f3f06 Mon Sep 17 00:00:00 2001
From: Joseph Viviano
Date: Fri, 20 Dec 2024 00:10:45 -0500
Subject: [PATCH] multinode metric tracking now working

---
 tutorials/examples/train_hypergrid.py | 144 +++++++++++++++-----------
 1 file changed, 84 insertions(+), 60 deletions(-)

diff --git a/tutorials/examples/train_hypergrid.py b/tutorials/examples/train_hypergrid.py
index 59ab5b0..d233c23 100644
--- a/tutorials/examples/train_hypergrid.py
+++ b/tutorials/examples/train_hypergrid.py
@@ -163,7 +163,7 @@ def _error_checker(self):
                     sys.exit(1)

             except Exception as e:
-                print(f'Process {self.rank}: Error in error checker: {str(e)}')
+                print('Process {}: Error in error checker: {}'.format(self.rank, e))
                 self.signal_error()
                 break

@@ -196,8 +196,8 @@ def _cleanup(self):


 def gather_distributed_data(
-    local_data: Union[List, torch.Tensor], world_size: int = None, rank: int = None
-) -> List:
+    local_tensor: torch.Tensor, world_size: int = None, rank: int = None, verbose: bool = False,
+) -> torch.Tensor:
     """
     Gather data from all processes in a distributed setting.

@@ -207,75 +207,92 @@ def gather_distributed_data(
         rank: Current process rank (optional, will get from env if None)

     Returns:
-        List containing gathered data from all processes
+        On rank 0: Concatenated tensor from all processes
+        On other ranks: None
     """
-    print("syncing distributed data")
+    if verbose:
+        print("syncing distributed data")

     if world_size is None:
         world_size = dist.get_world_size()
     if rank is None:
         rank = dist.get_rank()

-    # Convert data to tensor if it's not already.
-    if not isinstance(local_data, torch.Tensor):
-        print("+ converting tensor data via serialization")
-        # Serialize complex data structures.
-        serialized_data = pickle.dumps(local_data)
-        local_tensor = torch.ByteTensor(torch.ByteStorage.from_buffer(serialized_data))
-    else:
-        print("+ tensor not converted")
-        local_tensor = local_data
-
-    # First gather sizes to allocate correct buffer sizes.
-    local_size = torch.tensor([local_tensor.numel()], device=local_tensor.device)
+    # First gather batch_sizes to allocate correct buffer sizes.
+    local_batch_size = torch.tensor([local_tensor.shape[0]], device=local_tensor.device, dtype=local_tensor.dtype)

     if rank == 0:
-        size_list = [
-            torch.tensor([0], device=local_tensor.device) for _ in range(world_size)
+        # Assumes same dimensionality on all ranks!
+        batch_size_list = [
+            torch.zeros((1, ), device=local_tensor.device, dtype=local_tensor.dtype) for _ in range(world_size)
         ]
     else:
-        size_list = None
-    print("+ all gather of local_size={} to size_list".format(local_size))
-    dist.gather(local_size, gather_list=size_list, dst=0)
-    # 44 [0] Process 0: Caught error: Invalid function argument. Expected parameter `tensor` to be of type torch.Tensor.[0]
+        batch_size_list = None
+
+    if verbose:
+        print("rank={}, batch_size_list={}".format(rank, batch_size_list))
+        print("+ gather of local_batch_size={} to batch_size_list".format(local_batch_size))
+    dist.gather(local_batch_size, gather_list=batch_size_list, dst=0)
     dist.barrier()  # Add synchronization

+    # Pad local tensor to maximum size.
+    if verbose:
+        print("+ padding local tensor")
+
+    if rank == 0:
+        max_batch_size = (max(bs for bs in batch_size_list))
+    else:
+        max_batch_size = 0
+
+    state_size = local_tensor.shape[1]  # assume states are 1-d, is true for this env.
+
+    # Broadcast max_size to all processes for padding
+    max_batch_size_tensor = torch.tensor(max_batch_size, device=local_tensor.device)
+    dist.broadcast(max_batch_size_tensor, src=0)

     # Pad local tensor to maximum size.
-    print("+ padding local tensor")
-    max_size = max(size.item() for size in size_list)
-    if local_tensor.numel() < max_size:
+    if local_tensor.shape[0] < max_batch_size:
         padding = torch.zeros(
-            max_size - local_tensor.numel(),
+            (max_batch_size - local_tensor.shape[0], state_size),
             dtype=local_tensor.dtype,
-            device=local_tensor.device,
+            device=local_tensor.device
         )
-        local_tensor = torch.cat((local_tensor, padding))
+        local_tensor = torch.cat((local_tensor, padding), dim=0)

-    # Gather all tensors.
-    print("+ gathering all tensors from world_size={}".format(world_size))
+    # Gather padded tensors.
     if rank == 0:
         tensor_list = [
-            torch.zeros(max_size, dtype=local_tensor.dtype, device=local_tensor.device)
+            torch.zeros(
+                (max_batch_size, state_size),
+                dtype=local_tensor.dtype,
+                device=local_tensor.device,
+            )
             for _ in range(world_size)
         ]
     else:
         tensor_list = None
+
+    if verbose:
+        print("+ gathering all tensors from world_size={}".format(world_size))
+        print("rank={}, tensor_list={}".format(rank, tensor_list))
     dist.gather(local_tensor, gather_list=tensor_list, dst=0)
     dist.barrier()  # Add synchronization

-    # Trim padding and deserialize if necessary.
-    result = []
-    for tensor, size in zip(tensor_list, size_list):
-        trimmed_tensor = tensor[: size.item()]
-        if not isinstance(local_data, torch.Tensor):
-            print("+ deserializing tensor.")
-            # Deserialize data.
-            result = pickle.loads(trimmed_tensor.cpu().numpy().tobytes())
-        else:
-            print("+ tensor not deserialized")
-            result = trimmed_tensor
+    # Only rank 0 processes the results
+    if rank == 0:
+        results = []
+        for tensor, batch_size in zip(tensor_list, batch_size_list):
+            trimmed_tensor = tensor[:batch_size.item(), ...]
+            results.append(trimmed_tensor)

-    return result
+        if verbose:
+            print("distributed n_results={}".format(len(results)))
+
+            for r in results:
+                print(" {}".format(r.shape))
+
+        return torch.cat(results, dim=0)  # Concatenates along the batch dimension.
+
+    return None  # For all non-zero ranks.


 def main(args):  # noqa: C901
@@ -528,7 +545,7 @@ def cleanup():
             world_size,
             cleanup_callback=cleanup,
         )
-        handler.start()
+        #handler.start()

     for iteration in trange(n_iterations):

@@ -616,6 +633,24 @@ def cleanup():
                 ]
             )

+        log_this_iter = ((iteration % args.validation_interval == 0) or iteration == n_iterations - 1)
+
+        print("before distributed -- orig_shape={}".format(visited_terminating_states.tensor.shape))
+        if args.distributed and log_this_iter:
+            try:
+                all_visited_terminating_states = gather_distributed_data(
+                    visited_terminating_states.tensor
+                )
+            except Exception as e:
+                print('Process {}: Caught error: {}'.format(my_rank, e))
+                #handler.signal_error()
+                sys.exit(1)
+        else:
+            all_visited_terminating_states = visited_terminating_states.tensor
+
+        if my_rank == 0:
+            print("after distributed -- gathered_shape={}, orig_shape={}".format(all_visited_terminating_states.shape, visited_terminating_states.tensor.shape))
+
         # If we are on the master node, calculate the validation metrics.
         if my_rank == 0:
             to_log = {
@@ -628,24 +663,12 @@ def cleanup():
                 "opt_time": opt_time,
                 "rest_time": rest_time,
             }
+
             if use_wandb:
                 wandb.log(to_log, step=iteration)

-            if (iteration % args.validation_interval == 0) or (
-                iteration == n_iterations - 1
-            ):
-
-                if args.distributed:
-                    try:
-                        all_visited_terminating_states = gather_distributed_data(
-                            visited_terminating_states.tensor
-                        )
-                    except Exception as e:
-                        print(f'Process {my_rank}: Caught error: {str(e)}')
-                        handler.signal_error()
-                        sys.exit(1)
-                else:
-                    all_visited_terminating_states = visited_terminating_states.tensor
+            if log_this_iter:
+                print("logging this iteration!")
                 validation_info, discovered_modes = validate_hypergrid(
                     env,
                     gflownet,
@@ -725,6 +748,7 @@ def validate_hypergrid(
     modes_found = set([tuple(s.tolist()) for s in modes])
     discovered_modes.update(modes_found)
     validation_info["n_modes_found"] = len(discovered_modes)
+    print(len(discovered_modes))

     # Old way of counting modes -- potentially buggy - to be removed.
     # # Add the mode counting metric.
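
A minimal smoke test for the padded-gather path, not part of the patch itself: it assumes a CPU `gloo` backend, that `train_hypergrid` is importable from the working directory (e.g. tutorials/examples/), and that every rank contributes the same batch size, as in the training loop where each rank samples the same number of trajectories. The file name, toy shapes, and launch command are illustrative only.

# gather_smoke_test.py -- hypothetical standalone sketch, not part of the patch.
# Launch with, e.g.:  torchrun --nproc_per_node=2 gather_smoke_test.py
import torch
import torch.distributed as dist

from train_hypergrid import gather_distributed_data  # import path is an assumption


def main():
    dist.init_process_group(backend="gloo")  # CPU-only backend for this sketch
    rank = dist.get_rank()

    # Stand-in for visited_terminating_states.tensor: a (batch, state_dim) tensor,
    # same batch size on every rank.
    local_states = torch.full((4, 2), fill_value=rank, dtype=torch.long)

    gathered = gather_distributed_data(local_states, verbose=True)

    if rank == 0:
        # Expect (4 * world_size, 2): per-rank batches concatenated along dim 0.
        print("gathered shape:", tuple(gathered.shape))
    else:
        assert gathered is None  # non-zero ranks get None by design

    dist.destroy_process_group()


if __name__ == "__main__":
    main()

On rank 0 the call should return the concatenated (4 * world_size, 2) tensor; all other ranks receive None, matching the revised docstring.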