Skip to content

Commit

Permalink
uncomment memory checking
Browse files: browse the repository at this point in the history
  • Loading branch information
xrsrke committed Jan 12, 2024
1 parent fac49b7 commit 490bc83
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/nanotron/trainer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -170,8 +170,8 @@ def __init__(self, config_or_config_file: Union[Config, str]):
group=self.parallel_context.world_pg,
rank=None,
)
# if free_mem < MIN_GPU_MEM_THRESHOLD:
# raise RuntimeError(f"Not enough memory to train the model on node {os.environ.get('SLURMD_NODENAME')}")
if free_mem < MIN_GPU_MEM_THRESHOLD:
raise RuntimeError(f"Not enough memory to train the model on node {os.environ.get('SLURMD_NODENAME')}")
# Try to allocate all the memory
test_tensor_size = int(free_mem * 0.9)
test_tensor = torch.zeros((test_tensor_size,), dtype=torch.uint8, device=torch.device("cuda"))
Expand Down

0 comments on commit 490bc83

Please sign in to comment.