You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Hi, I got the following error when training with multiple GPUs. How should I fix it?
Traceback (most recent call last):
File "train_rcnn.py", line 270, in <module>
trainer.train(
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 197, in train
loss, tb_dict, disp_dict = self._train_it(batch)
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 130, in _train_it
loss, tb_dict, disp_dict = self.model_fn(self.model, batch)
File "/root/autodl-tmp/EPNet/tools/../lib/net/train_functions.py", line 46, in model_fn
ret_dict = model(input_data)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/root/miniconda3/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
raise exception
AssertionError: Caught AssertionError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/net/point_rcnn.py", line 52, in forward
rois, roi_scores_raw = self.rpn.proposal_layer(rpn_scores_raw, rpn_reg, backbone_xyz) # (B, M, 7)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 46, in forward
scores_single, proposals_single = self.distance_based_proposal(scores_single, proposals_single,
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 93, in distance_based_proposal
assert i == 2, '%d' % i
AssertionError: 1
The text was updated successfully, but these errors were encountered:
Hi, I got the following error when training with multiple GPUs. How should I fix it?
Traceback (most recent call last):
File "train_rcnn.py", line 270, in <module>
trainer.train(
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 197, in train
loss, tb_dict, disp_dict = self._train_it(batch)
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 130, in _train_it
loss, tb_dict, disp_dict = self.model_fn(self.model, batch)
File "/root/autodl-tmp/EPNet/tools/../lib/net/train_functions.py", line 46, in model_fn
ret_dict = model(input_data)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/root/miniconda3/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
raise exception
AssertionError: Caught AssertionError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/net/point_rcnn.py", line 52, in forward
rois, roi_scores_raw = self.rpn.proposal_layer(rpn_scores_raw, rpn_reg, backbone_xyz) # (B, M, 7)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 46, in forward
scores_single, proposals_single = self.distance_based_proposal(scores_single, proposals_single,
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 93, in distance_based_proposal
assert i == 2, '%d' % i
AssertionError: 1
The text was updated successfully, but these errors were encountered: