From 2f0e72b73e0dece2fe2c3bafd264cc8f1e55ad15 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Thu, 12 Aug 2021 18:25:23 +0530 Subject: [PATCH 01/24] add dataparallel --- ml3d/torch/dataloaders/concat_batcher.py | 21 +++++++++++++++++++ ml3d/torch/pipelines/base_pipeline.py | 12 ++++++++--- ml3d/torch/pipelines/semantic_segmentation.py | 10 +++++++-- ml3d/utils/builder.py | 11 +++++++--- scripts/run_pipeline.py | 13 +++++++++--- 5 files changed, 56 insertions(+), 11 deletions(-) diff --git a/ml3d/torch/dataloaders/concat_batcher.py b/ml3d/torch/dataloaders/concat_batcher.py index 9efa70064..ce531509e 100644 --- a/ml3d/torch/dataloaders/concat_batcher.py +++ b/ml3d/torch/dataloaders/concat_batcher.py @@ -4,6 +4,7 @@ import pickle import torch import yaml +import math from os import listdir from os.path import exists, join, isdir @@ -434,6 +435,26 @@ def to(self, device): self.feat = [feat.to(device) for feat in self.feat] self.label = [label.to(device) for label in self.label] + @staticmethod + def scatter(batch, num_gpu): + batch_size = len(batch.batch_lengths) + assert num_gpu <= batch_size, "batch size must be greater than number of cuda devices" + + new_batch_size = math.ceil(batch_size / num_gpu) + batches = [SparseConvUnetBatch([]) for _ in range(num_gpu)] + splits = [0] + for length in batch.batch_lengths: + splits.append(splits[-1] + length) + for i in range(num_gpu): + start = splits[new_batch_size * i] + end = splits[min(new_batch_size * (i + 1), len(splits) - 1)] + batches[i].point = batch.point[start:end] + batches[i].feat = batch.feat[start:end] + batches[i].label = batch.label[start:end] + batches[i].batch_lengths = batch.batch_lengths[start:end] + + return batches + class ObjectDetectBatch: diff --git a/ml3d/torch/pipelines/base_pipeline.py b/ml3d/torch/pipelines/base_pipeline.py index 97c6f746b..4e56968c4 100644 --- a/ml3d/torch/pipelines/base_pipeline.py +++ b/ml3d/torch/pipelines/base_pipeline.py @@ -12,7 +12,12 @@ class BasePipeline(ABC): """Base pipeline class.""" - def __init__(self, model, dataset=None, device='gpu', **kwargs): + def __init__(self, + model, + dataset=None, + device='cuda', + device_ids=[0], + **kwargs): """Initialize. Args: @@ -42,9 +47,10 @@ def __init__(self, model, dataset=None, device='gpu', **kwargs): if device == 'cpu' or not torch.cuda.is_available(): self.device = torch.device('cpu') + self.device_ids = [-1] else: - self.device = torch.device('cuda' if len(device.split(':')) == - 1 else 'cuda:' + device.split(':')[1]) + self.device = torch.device('cuda') + self.device_ids = device_ids @abstractmethod def run_inference(self, data): diff --git a/ml3d/torch/pipelines/semantic_segmentation.py b/ml3d/torch/pipelines/semantic_segmentation.py index 9d857f9f4..78cc5c7b0 100644 --- a/ml3d/torch/pipelines/semantic_segmentation.py +++ b/ml3d/torch/pipelines/semantic_segmentation.py @@ -14,6 +14,7 @@ from os.path import exists, join, isfile, dirname, abspath from .base_pipeline import BasePipeline +from .dataparallel import CustomDataParallel from ..dataloaders import get_sampler, TorchDataloader, DefaultBatcher, ConcatBatcher from ..utils import latest_torch_ckpt from ..modules.losses import SemSegLoss @@ -102,7 +103,8 @@ def __init__( scheduler_gamma=0.95, momentum=0.98, main_log_dir='./logs/', - device='gpu', + device='cuda', + device_ids=[0], split='train', train_sum_dir='train_log', **kwargs): @@ -122,6 +124,7 @@ def __init__( momentum=momentum, main_log_dir=main_log_dir, device=device, + device_ids=device_ids, split=split, train_sum_dir=train_sum_dir, **kwargs) @@ -308,7 +311,6 @@ def run_train(self): dataset = self.dataset cfg = self.cfg - model.to(device) log.info("DEVICE : {}".format(device)) timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') @@ -379,6 +381,10 @@ def run_train(self): writer = SummaryWriter(self.tensorboard_dir) self.save_config(writer) + + model = CustomDataParallel(model, device_ids=self.device_ids) + # model.to(device) + log.info("Writing summary in {}.".format(self.tensorboard_dir)) log.info("Started training") diff --git a/ml3d/utils/builder.py b/ml3d/utils/builder.py index f2c02afc8..f56b12097 100644 --- a/ml3d/utils/builder.py +++ b/ml3d/utils/builder.py @@ -14,17 +14,22 @@ def build_network(cfg): return build(cfg, NETWORK) -def convert_device_name(framework): +def convert_device_name(framework, device_ids): """Convert device to either cpu or cuda.""" gpu_names = ["gpu", "cuda"] cpu_names = ["cpu"] if framework not in cpu_names + gpu_names: raise KeyError("the device shoule either " "be cuda or cpu but got {}".format(framework)) + assert type(device_ids) is list + device_ids_new = [] + for device in device_ids: + device_ids_new.append(int(device)) + if framework in gpu_names: - return "cuda" + return "cuda", device_ids_new else: - return "cpu" + return "cpu", device_ids_new def convert_framework_name(framework): diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 753352cd4..c56296b81 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -28,8 +28,12 @@ def parse_args(): parser.add_argument('--dataset_path', help='path to the dataset') parser.add_argument('--ckpt_path', help='path to the checkpoint') parser.add_argument('--device', - help='device to run the pipeline', - default='gpu') + help='devices to run the pipeline', + default='cuda') + parser.add_argument('--device_ids', + nargs='+', + help='cuda device list', + default=['0']) parser.add_argument('--split', help='train or test', default='train') parser.add_argument('--mode', help='additional mode', default=None) parser.add_argument('--max_epochs', help='number of epochs', default=None) @@ -62,7 +66,8 @@ def main(): args, extra_dict = parse_args() framework = _ml3d.utils.convert_framework_name(args.framework) - args.device = _ml3d.utils.convert_device_name(args.device) + args.device, args.device_ids = _ml3d.utils.convert_device_name( + args.device, args.device_ids) if framework == 'torch': import open3d.ml.torch as ml3d else: @@ -107,6 +112,8 @@ def main(): cfg_dict_pipeline["max_epochs"] = args.max_epochs if args.batch_size is not None: cfg_dict_pipeline["batch_size"] = args.batch_size + cfg_dict_pipeline["device"] = args.device + cfg_dict_pipeline["device_ids"] = args.device_ids pipeline = Pipeline(model, dataset, **cfg_dict_pipeline) else: if (args.pipeline and args.model and args.dataset) is None: From 306de5bda59be3b0bd6c834b883faf734adf054e Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Thu, 12 Aug 2021 18:52:15 +0530 Subject: [PATCH 02/24] add dataparallel class --- ml3d/torch/pipelines/dataparallel.py | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 ml3d/torch/pipelines/dataparallel.py diff --git a/ml3d/torch/pipelines/dataparallel.py b/ml3d/torch/pipelines/dataparallel.py new file mode 100644 index 000000000..086857db2 --- /dev/null +++ b/ml3d/torch/pipelines/dataparallel.py @@ -0,0 +1,36 @@ +import torch +import numpy as np +from torch.nn.parallel import DataParallel + + +class CustomDataParallel(DataParallel): + """Custom DataParallel method for performing scatter operation + outside of torch's DataParallel. + """ + + def __init__(self, module, **kwargs): + super(CustomDataParallel, self).__init__(module, **kwargs) + self.get_loss = self.module.get_loss + self.cfg = self.module.cfg + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + # self._sync_params() + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + + replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) + outputs = self.parallel_apply(replicas, inputs, kwargs) + + return self.gather(outputs, self.output_device) + + def scatter(self, inputs, kwargs, device_ids): + if not hasattr(inputs[0], 'scatter'): + raise NotImplementedError( + f"Please implement scatter for {inputs[0]} for multi gpu execution." + ) + inputs = inputs[0].scatter(inputs[0], len(self.device_ids)) + + return inputs, [kwargs for _ in range(len(inputs))] From b953254ef12023b33864639cb9532a624a84fc09 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 13 Aug 2021 19:20:23 +0530 Subject: [PATCH 03/24] fix bugs --- ml3d/torch/dataloaders/concat_batcher.py | 10 +++---- ml3d/torch/pipelines/dataparallel.py | 27 ++++++++++++++++--- ml3d/torch/pipelines/semantic_segmentation.py | 4 --- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/ml3d/torch/dataloaders/concat_batcher.py b/ml3d/torch/dataloaders/concat_batcher.py index ce531509e..5497ced96 100644 --- a/ml3d/torch/dataloaders/concat_batcher.py +++ b/ml3d/torch/dataloaders/concat_batcher.py @@ -438,22 +438,18 @@ def to(self, device): @staticmethod def scatter(batch, num_gpu): batch_size = len(batch.batch_lengths) - assert num_gpu <= batch_size, "batch size must be greater than number of cuda devices" new_batch_size = math.ceil(batch_size / num_gpu) batches = [SparseConvUnetBatch([]) for _ in range(num_gpu)] - splits = [0] - for length in batch.batch_lengths: - splits.append(splits[-1] + length) for i in range(num_gpu): - start = splits[new_batch_size * i] - end = splits[min(new_batch_size * (i + 1), len(splits) - 1)] + start = new_batch_size * i + end = min(new_batch_size * (i + 1), batch_size) batches[i].point = batch.point[start:end] batches[i].feat = batch.feat[start:end] batches[i].label = batch.label[start:end] batches[i].batch_lengths = batch.batch_lengths[start:end] - return batches + return [b for b in batches if len(b.point)] # filter empty batch class ObjectDetectBatch: diff --git a/ml3d/torch/pipelines/dataparallel.py b/ml3d/torch/pipelines/dataparallel.py index 086857db2..11738b5a0 100644 --- a/ml3d/torch/pipelines/dataparallel.py +++ b/ml3d/torch/pipelines/dataparallel.py @@ -16,21 +16,40 @@ def __init__(self, module, **kwargs): def forward(self, *inputs, **kwargs): if not self.device_ids: return self.module(*inputs, **kwargs) - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - # self._sync_params() + if len(self.device_ids) == 1: - return self.module(*inputs[0], **kwargs[0]) + if hasattr(inputs[0], 'to'): + inputs[0].to(self.device_ids[0]) + return self.module(inputs[0], **kwargs) + + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + self.module.cuda() replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) outputs = self.parallel_apply(replicas, inputs, kwargs) return self.gather(outputs, self.output_device) def scatter(self, inputs, kwargs, device_ids): + """Custom scatter method to override default method. + Scatter batch dimension based on custom scatter implemented + in custom batcher. + + Agrs: + inputs: Object of type custom batcher. + kwargs: Optional keyword arguments. + device_ids: List of device ids. + + Returns: + Returns a list of inputs of length num_devices. + Each input is transfered to different device id. + """ if not hasattr(inputs[0], 'scatter'): raise NotImplementedError( f"Please implement scatter for {inputs[0]} for multi gpu execution." ) - inputs = inputs[0].scatter(inputs[0], len(self.device_ids)) + inputs = inputs[0].scatter(inputs[0], len(device_ids)) + for i in range(len(inputs)): + inputs[i].to(torch.device(device_ids[i])) return inputs, [kwargs for _ in range(len(inputs))] diff --git a/ml3d/torch/pipelines/semantic_segmentation.py b/ml3d/torch/pipelines/semantic_segmentation.py index 78cc5c7b0..f5089c405 100644 --- a/ml3d/torch/pipelines/semantic_segmentation.py +++ b/ml3d/torch/pipelines/semantic_segmentation.py @@ -399,8 +399,6 @@ def run_train(self): model.trans_point_sampler = train_sampler.get_point_sampler() for step, inputs in enumerate(tqdm(train_loader, desc='training')): - if hasattr(inputs['data'], 'to'): - inputs['data'].to(device) self.optimizer.zero_grad() results = model(inputs['data']) loss, gt_labels, predict_scores = model.get_loss( @@ -429,8 +427,6 @@ def run_train(self): with torch.no_grad(): for step, inputs in enumerate( tqdm(valid_loader, desc='validation')): - if hasattr(inputs['data'], 'to'): - inputs['data'].to(device) results = model(inputs['data']) loss, gt_labels, predict_scores = model.get_loss( From 4dccfc03dc3d2a68b2ae3f945ca972b3c11cb30f Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Tue, 24 Aug 2021 15:04:18 +0530 Subject: [PATCH 04/24] objdet multigpu --- ml3d/torch/dataloaders/concat_batcher.py | 46 +++++++++++++++++++++++- ml3d/torch/pipelines/object_detection.py | 9 +++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/ml3d/torch/dataloaders/concat_batcher.py b/ml3d/torch/dataloaders/concat_batcher.py index 5497ced96..9b53a2a18 100644 --- a/ml3d/torch/dataloaders/concat_batcher.py +++ b/ml3d/torch/dataloaders/concat_batcher.py @@ -340,6 +340,33 @@ def to(self, device): return self + @staticmethod + def scatter(batch, num_gpu): + batch_size = len(batch.points) + + new_batch_size = math.ceil(batch_size / num_gpu) + batches = [KPConvBatch([]) for _ in range(num_gpu)] + for i in range(num_gpu): + start = new_batch_size * i + end = min(new_batch_size * (i + 1), batch_size) + batches[i].points = batch.points[start:end] + batches[i].neighbors = batch.neighbors[start:end] + batches[i].pools = batch.pools[start:end] + batches[i].upsamples = batch.upsamples[start:end] + batches[i].lengths = batch.lengths[start: + end] # TODO : verify lengths + + return [b for b in batches if len(b.points)] # filter empty batch + + def print(self): + print(self.points) + print(self.neighbors) + print(self.pools) + print(self.upsamples) + print(self.lengths) + print(self.features) + exit(0) + def unstack_points(self, layer=None): """Unstack the points.""" return self.unstack_elements('points', layer) @@ -471,13 +498,13 @@ def __init__(self, batches): self.attr = [] for batch in batches: - self.attr.append(batch['attr']) data = batch['data'] attr = batch['attr'] if 'test' not in attr['split'] and len( data['bboxes'] ) == 0: # Skip training batch with no bounding box. continue + self.attr.append(attr) self.point.append(torch.tensor(data['point'], dtype=torch.float32)) self.labels.append( torch.tensor(data['labels'], dtype=torch.int64) if 'labels' in @@ -506,6 +533,23 @@ def to(self, device): if self.bboxes[i] is not None: self.bboxes[i] = self.bboxes[i].to(device) + @staticmethod + def scatter(batch, num_gpu): + batch_size = len(batch.point) + + new_batch_size = math.ceil(batch_size / num_gpu) + batches = [ObjectDetectBatch([]) for _ in range(num_gpu)] + for i in range(num_gpu): + start = new_batch_size * i + end = min(new_batch_size * (i + 1), batch_size) + batches[i].point = batch.point[start:end] + batches[i].labels = batch.labels[start:end] + batches[i].bboxes = batch.bboxes[start:end] + batches[i].bbox_objs = batch.bbox_objs[start:end] + batches[i].attr = batch.attr[start:end] + + return [b for b in batches if len(b.point)] # filter empty batch + class ConcatBatcher(object): """ConcatBatcher for KPConv.""" diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index 43dd2a445..fe2a6ed05 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -11,6 +11,7 @@ from pathlib import Path from .base_pipeline import BasePipeline +from .dataparallel import CustomDataParallel from ..dataloaders import TorchDataloader, ConcatBatcher from torch.utils.tensorboard import SummaryWriter from ..utils import latest_torch_ckpt @@ -171,7 +172,7 @@ def run_valid(self): gt = [] with torch.no_grad(): for data in tqdm(valid_loader, desc='validation'): - data.to(device) + # data.to(device) results = model(data) loss = model.loss(results, data) for l, v in loss.items(): @@ -280,6 +281,10 @@ def run_train(self): writer = SummaryWriter(self.tensorboard_dir) self.save_config(writer) + + # wrap model for multiple GPU + model = CustomDataParallel(model, device_ids=self.device_ids) + log.info("Writing summary in {}.".format(self.tensorboard_dir)) log.info("Started training") @@ -291,7 +296,7 @@ def run_train(self): process_bar = tqdm(train_loader, desc='training') for data in process_bar: - data.to(device) + # data.to(device) results = model(data) loss = model.loss(results, data) loss_sum = sum(loss.values()) From 27793394385d2deab2e618fde904a63828080f7b Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 27 Aug 2021 13:33:42 +0530 Subject: [PATCH 05/24] rename scatter --- ml3d/torch/pipelines/dataparallel.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ml3d/torch/pipelines/dataparallel.py b/ml3d/torch/pipelines/dataparallel.py index 11738b5a0..ae7a77859 100644 --- a/ml3d/torch/pipelines/dataparallel.py +++ b/ml3d/torch/pipelines/dataparallel.py @@ -22,7 +22,7 @@ def forward(self, *inputs, **kwargs): inputs[0].to(self.device_ids[0]) return self.module(inputs[0], **kwargs) - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + inputs, kwargs = self.customscatter(inputs, kwargs, self.device_ids) self.module.cuda() replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) @@ -30,7 +30,7 @@ def forward(self, *inputs, **kwargs): return self.gather(outputs, self.output_device) - def scatter(self, inputs, kwargs, device_ids): + def customscatter(self, inputs, kwargs, device_ids): """Custom scatter method to override default method. Scatter batch dimension based on custom scatter implemented in custom batcher. @@ -45,9 +45,12 @@ def scatter(self, inputs, kwargs, device_ids): Each input is transfered to different device id. """ if not hasattr(inputs[0], 'scatter'): - raise NotImplementedError( - f"Please implement scatter for {inputs[0]} for multi gpu execution." - ) + try: + self.scatter(inputs, kwargs, device_ids) + except: + raise NotImplementedError( + f"Please implement scatter for {inputs[0]} for multi gpu execution." + ) inputs = inputs[0].scatter(inputs[0], len(device_ids)) for i in range(len(inputs)): inputs[i].to(torch.device(device_ids[i])) From 48a101f35cb0ec388b7959a1b14b174515ab5fc2 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 24 Sep 2021 16:33:46 +0530 Subject: [PATCH 06/24] fix objdet --- ml3d/datasets/utils/operations.py | 2 +- ml3d/torch/dataloaders/concat_batcher.py | 27 ------------------------ ml3d/torch/models/base_model_objdet.py | 2 +- ml3d/torch/models/point_pillars.py | 2 +- ml3d/torch/models/point_rcnn.py | 2 +- ml3d/torch/pipelines/dataparallel.py | 2 +- ml3d/torch/pipelines/object_detection.py | 6 +++--- 7 files changed, 8 insertions(+), 35 deletions(-) diff --git a/ml3d/datasets/utils/operations.py b/ml3d/datasets/utils/operations.py index 45147f2c3..3d252d481 100644 --- a/ml3d/datasets/utils/operations.py +++ b/ml3d/datasets/utils/operations.py @@ -4,7 +4,7 @@ import math from scipy.spatial import ConvexHull -from ...metrics import iou_bev +from open3d.ml.contrib import iou_bev_cpu as iou_bev def create_3D_rotations(axis, angle): diff --git a/ml3d/torch/dataloaders/concat_batcher.py b/ml3d/torch/dataloaders/concat_batcher.py index 9b53a2a18..6a7dda437 100644 --- a/ml3d/torch/dataloaders/concat_batcher.py +++ b/ml3d/torch/dataloaders/concat_batcher.py @@ -340,33 +340,6 @@ def to(self, device): return self - @staticmethod - def scatter(batch, num_gpu): - batch_size = len(batch.points) - - new_batch_size = math.ceil(batch_size / num_gpu) - batches = [KPConvBatch([]) for _ in range(num_gpu)] - for i in range(num_gpu): - start = new_batch_size * i - end = min(new_batch_size * (i + 1), batch_size) - batches[i].points = batch.points[start:end] - batches[i].neighbors = batch.neighbors[start:end] - batches[i].pools = batch.pools[start:end] - batches[i].upsamples = batch.upsamples[start:end] - batches[i].lengths = batch.lengths[start: - end] # TODO : verify lengths - - return [b for b in batches if len(b.points)] # filter empty batch - - def print(self): - print(self.points) - print(self.neighbors) - print(self.pools) - print(self.upsamples) - print(self.lengths) - print(self.features) - exit(0) - def unstack_points(self, layer=None): """Unstack the points.""" return self.unstack_elements('points', layer) diff --git a/ml3d/torch/models/base_model_objdet.py b/ml3d/torch/models/base_model_objdet.py index e63176c0b..7e665955a 100644 --- a/ml3d/torch/models/base_model_objdet.py +++ b/ml3d/torch/models/base_model_objdet.py @@ -24,7 +24,7 @@ def __init__(self, **kwargs): self.cfg = Config(kwargs) @abstractmethod - def loss(self, results, inputs): + def get_loss(self, results, inputs): """Computes the loss given the network input and outputs. Args: diff --git a/ml3d/torch/models/point_pillars.py b/ml3d/torch/models/point_pillars.py index 47344ecd8..da1d49b27 100644 --- a/ml3d/torch/models/point_pillars.py +++ b/ml3d/torch/models/point_pillars.py @@ -138,7 +138,7 @@ def get_optimizer(self, cfg): optimizer = torch.optim.AdamW(self.parameters(), **cfg) return optimizer, None - def loss(self, results, inputs): + def get_loss(self, results, inputs): scores, bboxes, dirs = results gt_labels = inputs.labels gt_bboxes = inputs.bboxes diff --git a/ml3d/torch/models/point_rcnn.py b/ml3d/torch/models/point_rcnn.py index f06c3c217..ea542e045 100644 --- a/ml3d/torch/models/point_rcnn.py +++ b/ml3d/torch/models/point_rcnn.py @@ -183,7 +183,7 @@ def step(self): return optimizer, scheduler - def loss(self, results, inputs): + def get_loss(self, results, inputs): if self.mode == "RPN": return self.rpn.loss(results, inputs) else: diff --git a/ml3d/torch/pipelines/dataparallel.py b/ml3d/torch/pipelines/dataparallel.py index ae7a77859..1b36c431a 100644 --- a/ml3d/torch/pipelines/dataparallel.py +++ b/ml3d/torch/pipelines/dataparallel.py @@ -46,7 +46,7 @@ def customscatter(self, inputs, kwargs, device_ids): """ if not hasattr(inputs[0], 'scatter'): try: - self.scatter(inputs, kwargs, device_ids) + return self.scatter(inputs, kwargs, device_ids) except: raise NotImplementedError( f"Please implement scatter for {inputs[0]} for multi gpu execution." diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index fe2a6ed05..b31a210da 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -174,7 +174,7 @@ def run_valid(self): for data in tqdm(valid_loader, desc='validation'): # data.to(device) results = model(data) - loss = model.loss(results, data) + loss = model.get_loss(results, data) for l, v in loss.items(): if not l in self.valid_losses: self.valid_losses[l] = [] @@ -296,9 +296,9 @@ def run_train(self): process_bar = tqdm(train_loader, desc='training') for data in process_bar: - # data.to(device) + data.to(device) results = model(data) - loss = model.loss(results, data) + loss = model.get_loss(results, data) loss_sum = sum(loss.values()) self.optimizer.zero_grad() From 53479c2838abce31cf832a3ba441fd9332ca6570 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 24 Sep 2021 16:36:16 +0530 Subject: [PATCH 07/24] remove comments --- ml3d/torch/pipelines/object_detection.py | 2 +- ml3d/torch/pipelines/semantic_segmentation.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index b31a210da..9d4332338 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -172,7 +172,7 @@ def run_valid(self): gt = [] with torch.no_grad(): for data in tqdm(valid_loader, desc='validation'): - # data.to(device) + data.to(device) results = model(data) loss = model.get_loss(results, data) for l, v in loss.items(): diff --git a/ml3d/torch/pipelines/semantic_segmentation.py b/ml3d/torch/pipelines/semantic_segmentation.py index f5089c405..26d2c966d 100644 --- a/ml3d/torch/pipelines/semantic_segmentation.py +++ b/ml3d/torch/pipelines/semantic_segmentation.py @@ -383,7 +383,6 @@ def run_train(self): self.save_config(writer) model = CustomDataParallel(model, device_ids=self.device_ids) - # model.to(device) log.info("Writing summary in {}.".format(self.tensorboard_dir)) From 6139e66449f559f0f7ffe33bfb6d37a580c2cb18 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Mon, 10 Jan 2022 17:47:41 +0530 Subject: [PATCH 08/24] fix cam_img matrix --- ml3d/datasets/waymo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml3d/datasets/waymo.py b/ml3d/datasets/waymo.py index 2e1545b47..8d1690b91 100644 --- a/ml3d/datasets/waymo.py +++ b/ml3d/datasets/waymo.py @@ -166,7 +166,7 @@ def read_calib(path): Tr_velo_to_cam = Waymo._extend_matrix(Tr_velo_to_cam) world_cam = np.transpose(rect_4x4 @ Tr_velo_to_cam) - cam_img = np.transpose(P2) + cam_img = np.transpose(np.vstack((P2.reshape(3, 4), [0, 0, 0, 1]))) return {'world_cam': world_cam, 'cam_img': cam_img} From d9e156488d53f55d7894f404185d478e4b0ddf54 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Mon, 17 Jan 2022 21:40:59 +0530 Subject: [PATCH 09/24] add distributed training --- ml3d/configs/pointpillars_waymo.yml | 28 +++---- ml3d/torch/pipelines/base_pipeline.py | 18 ++++- ml3d/torch/pipelines/object_detection.py | 80 ++++++++++++-------- ml3d/utils/config.py | 6 ++ scripts/run_pipeline.py | 94 ++++++++++++++++++++---- 5 files changed, 162 insertions(+), 64 deletions(-) diff --git a/ml3d/configs/pointpillars_waymo.yml b/ml3d/configs/pointpillars_waymo.yml index 43534e834..4e18a5b4c 100644 --- a/ml3d/configs/pointpillars_waymo.yml +++ b/ml3d/configs/pointpillars_waymo.yml @@ -10,7 +10,7 @@ model: batcher: "ignore" - point_cloud_range: [-74.88, -74.88, -2, 74.88, 74.88, 4] + point_cloud_range: [-80, -80, -6, 85, 85, 7] classes: ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] loss: @@ -31,7 +31,7 @@ model: max_voxels: [32000, 32000] voxel_encoder: - in_channels: 5 + in_channels: 4 feat_channels: [64] voxel_size: *vsize @@ -70,18 +70,18 @@ model: rotations: [0, 1.57] iou_thr: [[0.4, 0.55], [0.3, 0.5], [0.3, 0.5]] - augment: - PointShuffle: True - ObjectRangeFilter: True - ObjectSample: - min_points_dict: - VEHICLE: 5 - PEDESTRIAN: 10 - CYCLIST: 10 - sample_dict: - VEHICLE: 15 - PEDESTRIAN: 10 - CYCLIST: 10 + augment: {} + # PointShuffle: True + # ObjectRangeFilter: True + # ObjectSample: + # min_points_dict: + # VEHICLE: 5 + # PEDESTRIAN: 10 + # CYCLIST: 10 + # sample_dict: + # VEHICLE: 15 + # PEDESTRIAN: 10 + # CYCLIST: 10 pipeline: diff --git a/ml3d/torch/pipelines/base_pipeline.py b/ml3d/torch/pipelines/base_pipeline.py index 13ff7be86..e76efca8e 100644 --- a/ml3d/torch/pipelines/base_pipeline.py +++ b/ml3d/torch/pipelines/base_pipeline.py @@ -16,7 +16,7 @@ def __init__(self, model, dataset=None, device='cuda', - device_ids=[0], + distributed=False, **kwargs): """Initialize. @@ -46,12 +46,22 @@ def __init__(self, model.__class__.__name__ + '_' + dataset_name + '_torch') make_dir(self.cfg.logs_dir) + self.distributed = distributed + + self.rank = kwargs.get('rank', 0) + if device == 'cpu' or not torch.cuda.is_available(): + if distributed: + raise ValueError( + "Distributed training is ON, but CUDA not available.") self.device = torch.device('cpu') - self.device_ids = [-1] else: - self.device = torch.device('cuda') - self.device_ids = device_ids + if distributed: + self.device = torch.device(device) + print("Using device", self.device) + torch.cuda.set_device(self.device) + else: + self.device = torch.device('cuda') self.summary = {} self.cfg.setdefault('summary', {}) diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index 2927c19ef..9228e3a4c 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -1,12 +1,13 @@ import logging import re +import numpy as np +import torch +import torch.distributed as dist + from datetime import datetime from os.path import exists, join from pathlib import Path - from tqdm import tqdm -import numpy as np -import torch from torch.utils.data import DataLoader from .base_pipeline import BasePipeline @@ -255,18 +256,21 @@ def run_train(self): """Run training with train data split.""" torch.manual_seed(self.rng.integers(np.iinfo( np.int32).max)) # Random reproducible seed for torch + rank = self.rank # Rank for distributed training model = self.model device = self.device dataset = self.dataset cfg = self.cfg - log.info("DEVICE : {}".format(device)) - timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') + if rank == 0: + log.info("DEVICE : {}".format(device)) + timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') - log_file_path = join(cfg.logs_dir, 'log_train_' + timestamp + '.txt') - log.info("Logging in file : {}".format(log_file_path)) - log.addHandler(logging.FileHandler(log_file_path)) + log_file_path = join(cfg.logs_dir, + 'log_train_' + timestamp + '.txt') + log.info("Logging in file : {}".format(log_file_path)) + log.addHandler(logging.FileHandler(log_file_path)) batcher = ConcatBatcher(device, model.cfg.name) @@ -277,15 +281,22 @@ def run_train(self): use_cache=dataset.cfg.use_cache, steps_per_epoch=dataset.cfg.get( 'steps_per_epoch_train', None)) - train_loader = DataLoader( - train_split, - batch_size=cfg.batch_size, - num_workers=cfg.get('num_workers', 4), - pin_memory=cfg.get('pin_memory', False), - collate_fn=batcher.collate_fn, - worker_init_fn=lambda x: np.random.seed(x + np.uint32( - torch.utils.data.get_worker_info().seed)) - ) # numpy expects np.uint32, whereas torch returns np.uint64. + + if self.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_split) + else: + train_sampler = None + + train_loader = DataLoader(train_split, + batch_size=cfg.batch_size, + num_workers=cfg.get('num_workers', 0), + pin_memory=cfg.get('pin_memory', False), + collate_fn=batcher.collate_fn, + sampler=train_sampler) + # worker_init_fn=lambda x: np.random.seed(x + np.uint32( + # torch.utils.data.get_worker_info().seed)) + # ) # numpy expects np.uint32, whereas torch returns np.uint64. self.optimizer, self.scheduler = model.get_optimizer(cfg.optimizer) @@ -301,23 +312,31 @@ def run_train(self): runid + '_' + Path(tensorboard_dir).name) writer = SummaryWriter(self.tensorboard_dir) - self.save_config(writer) + if rank == 0: + self.save_config(writer) + log.info("Writing summary in {}.".format(self.tensorboard_dir)) # wrap model for multiple GPU - model = CustomDataParallel(model, device_ids=self.device_ids) + if self.distributed: + model.cuda(self.device) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[self.device]) - log.info("Writing summary in {}.".format(self.tensorboard_dir)) record_summary = 'train' in cfg.get('summary').get('record_for', []) - log.info("Started training") + if rank == 0: + log.info("Started training") + for epoch in range(start_ep, cfg.max_epoch + 1): log.info(f'=== EPOCH {epoch:d}/{cfg.max_epoch:d} ===') - model.train() + if self.distributed: + train_sampler.set_epoch(epoch) + model.train() self.losses = {} process_bar = tqdm(train_loader, desc='training') - for data in process_bar: + for data in train_loader: data.to(device) results = model(data) loss = model.get_loss(results, data) @@ -331,7 +350,7 @@ def run_train(self): self.optimizer.step() # Record visualization for the last iteration - if record_summary and process_bar.n == process_bar.total - 1: + if rank == 0 and record_summary and process_bar.n == process_bar.total - 1: boxes = model.inference_end(results, data) self.summary['train'] = self.get_3d_summary(boxes, data, @@ -351,13 +370,13 @@ def run_train(self): self.scheduler.step() # --------------------- validation - if (epoch % cfg.get("validation_freq", 1)) == 0: + if rank == 0 and (epoch % cfg.get("validation_freq", 1)) == 0: self.run_valid() - self.save_logs(writer, epoch) - - if epoch % cfg.save_ckpt_freq == 0: - self.save_ckpt(epoch) + if rank == 0: + self.save_logs(writer, epoch) + if epoch % cfg.save_ckpt_freq == 0: + self.save_ckpt(epoch) def get_3d_summary(self, infer_bboxes_batch, @@ -480,7 +499,8 @@ def save_logs(self, writer, epoch): def load_ckpt(self, ckpt_path=None, is_resume=True): train_ckpt_dir = join(self.cfg.logs_dir, 'checkpoint') - make_dir(train_ckpt_dir) + if self.rank == 0: + make_dir(train_ckpt_dir) epoch = 0 if ckpt_path is None: diff --git a/ml3d/utils/config.py b/ml3d/utils/config.py index 51b10829e..2686c958b 100644 --- a/ml3d/utils/config.py +++ b/ml3d/utils/config.py @@ -272,3 +272,9 @@ def __getattr__(self, name): def __getitem__(self, name): return self._cfg_dict.__getitem__(name) + + def __getstate__(self): + return self.cfg_dict + + def __setstate__(self, state): + self.cfg_dict = state diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 6b18e02a3..8a9e34564 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -3,6 +3,8 @@ import sys import yaml import pprint +import os +import torch.distributed as dist from pathlib import Path @@ -70,6 +72,8 @@ def main(): rng = np.random.default_rng(args.seed) if framework == 'torch': import open3d.ml.torch as ml3d + import torch.multiprocessing as mp + import torch.distributed as dist else: import tensorflow as tf import open3d.ml.tf as ml3d @@ -101,24 +105,20 @@ def main(): cfg_dict_dataset, cfg_dict_pipeline, cfg_dict_model = \ _ml3d.utils.Config.merge_cfg_file(cfg, args, extra_dict) - cfg_dict_dataset['seed'] = rng - cfg_dict_model['seed'] = rng - cfg_dict_pipeline['seed'] = rng - - dataset = Dataset(cfg_dict_dataset.pop('dataset_path', None), - **cfg_dict_dataset) - if args.mode is not None: cfg_dict_model["mode"] = args.mode - model = Model(**cfg_dict_model) - if args.max_epochs is not None: cfg_dict_pipeline["max_epochs"] = args.max_epochs if args.batch_size is not None: cfg_dict_pipeline["batch_size"] = args.batch_size + + cfg_dict_dataset['seed'] = rng + cfg_dict_model['seed'] = rng + cfg_dict_pipeline['seed'] = rng + cfg_dict_pipeline["device"] = args.device cfg_dict_pipeline["device_ids"] = args.device_ids - pipeline = Pipeline(model, dataset, **cfg_dict_pipeline) + else: if (args.pipeline and args.model and args.dataset) is None: raise ValueError("Please specify pipeline, model, and dataset " + @@ -136,25 +136,87 @@ def main(): cfg_dict_model['seed'] = rng cfg_dict_pipeline['seed'] = rng - dataset = Dataset(**cfg_dict_dataset) - model = Model(**cfg_dict_model, mode=args.mode) - pipeline = Pipeline(model, dataset, **cfg_dict_pipeline) - with open(Path(__file__).parent / 'README.md', 'r') as f: readme = f.read() - pipeline.cfg_tb = { + + cfg_tb = { 'readme': readme, 'cmd_line': cmd_line, 'dataset': pprint.pformat(cfg_dict_dataset, indent=2), 'model': pprint.pformat(cfg_dict_model, indent=2), 'pipeline': pprint.pformat(cfg_dict_pipeline, indent=2) } + args.cfg_tb = cfg_tb + args.distributed = framework == 'torch' and args.device != 'cpu' and len( + args.device_ids) > 1 + + if not args.distributed: + # print("not distr : ") + # exit(0) + dataset = Dataset(**cfg_dict_dataset) + model = Model(**cfg_dict_model, mode=args.mode) + pipeline = Pipeline(model, dataset, **cfg_dict_pipeline) + + pipeline.cfg_tb = cfg_tb + + if args.split == 'test': + pipeline.run_test() + else: + pipeline.run_train() + + else: + mp.spawn(main_worker, + args=(Dataset, Model, Pipeline, cfg_dict_dataset, + cfg_dict_model, cfg_dict_pipeline, args), + nprocs=len(args.device_ids)) + + +def setup(rank, world_size): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=world_size) + + +def cleanup(): + dist.destroy_process_group() + + +def main_worker(rank, Dataset, Model, Pipeline, cfg_dict_dataset, + cfg_dict_model, cfg_dict_pipeline, args): + world_size = len(args.device_ids) + setup(rank, world_size) + + cfg_dict_dataset['rank'] = rank + cfg_dict_model['rank'] = rank + cfg_dict_pipeline['rank'] = rank + + device = f"cuda:{args.device_ids[rank]}" + print(f"rank = {rank}, world_size = {world_size}, gpu = {device}") + + cfg_dict_model['device'] = device + cfg_dict_pipeline['device'] = device + + dataset = Dataset(**cfg_dict_dataset) + model = Model(**cfg_dict_model, mode=args.mode) + pipeline = Pipeline(model, + dataset, + distributed=args.distributed, + **cfg_dict_pipeline) + + with open(Path(__file__).parent / 'README.md', 'r') as f: + readme = f.read() + pipeline.cfg_tb = args.cfg_tb if args.split == 'test': - pipeline.run_test() + if rank == 0: + pipeline.run_test() else: pipeline.run_train() + cleanup() + if __name__ == '__main__': main() From ef0e44053adfc7ddaddd25b7f78513d653dbca30 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Tue, 18 Jan 2022 18:51:56 +0530 Subject: [PATCH 10/24] parllel validation --- ml3d/torch/pipelines/object_detection.py | 65 +++++++++++++++++------- scripts/run_pipeline.py | 2 + 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index 9228e3a4c..ebbfe09b7 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -156,7 +156,8 @@ def run_valid(self, epoch=0): log.info("DEVICE : {}".format(device)) log_file_path = join(cfg.logs_dir, 'log_valid_' + timestamp + '.txt') log.info("Logging in file : {}".format(log_file_path)) - log.addHandler(logging.FileHandler(log_file_path)) + if self.rank == 0: + log.addHandler(logging.FileHandler(log_file_path)) batcher = ConcatBatcher(device, model.cfg.name) @@ -168,16 +169,24 @@ def run_valid(self, epoch=0): shuffle=True, steps_per_epoch=dataset.cfg.get( 'steps_per_epoch_valid', None)) - valid_loader = DataLoader( - valid_split, - batch_size=cfg.val_batch_size, - num_workers=cfg.get('num_workers', 4), - pin_memory=cfg.get('pin_memory', False), - collate_fn=batcher.collate_fn, - worker_init_fn=lambda x: np.random.seed(x + np.uint32( - torch.utils.data.get_worker_info().seed))) - record_summary = 'valid' in cfg.get('summary').get('record_for', []) + if self.distributed: + valid_sampler = torch.utils.data.distributed.DistributedSampler( + valid_split) + else: + valid_sampler = None + + valid_loader = DataLoader(valid_split, + batch_size=cfg.val_batch_size, + num_workers=cfg.get('num_workers', 0), + pin_memory=cfg.get('pin_memory', False), + collate_fn=batcher.collate_fn, + sampler=valid_sampler) + # worker_init_fn=lambda x: np.random.seed(x + np.uint32( + # torch.utils.data.get_worker_info().seed))) + + record_summary = self.rank == 0 and 'valid' in cfg.get('summary').get( + 'record_for', []) log.info("Started validation") self.valid_losses = {} @@ -322,7 +331,8 @@ def run_train(self): model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.device]) - record_summary = 'train' in cfg.get('summary').get('record_for', []) + record_summary = self.rank == 0 and 'train' in cfg.get('summary').get( + 'record_for', []) if rank == 0: log.info("Started training") @@ -336,22 +346,35 @@ def run_train(self): self.losses = {} process_bar = tqdm(train_loader, desc='training') - for data in train_loader: + for data in process_bar: data.to(device) results = model(data) - loss = model.get_loss(results, data) + if self.distributed: + loss = model.module.get_loss(results, data) + else: + loss = model.get_loss(results, data) loss_sum = sum(loss.values()) self.optimizer.zero_grad() loss_sum.backward() - if model.cfg.get('grad_clip_norm', -1) > 0: - torch.nn.utils.clip_grad_value_(model.parameters(), - model.cfg.grad_clip_norm) + if self.distributed: + if model.module.cfg.get('grad_clip_norm', -1) > 0: + torch.nn.utils.clip_grad_value_( + model.module.parameters(), + model.module.cfg.grad_clip_norm) + else: + if model.cfg.get('grad_clip_norm', -1) > 0: + torch.nn.utils.clip_grad_value_( + model.parameters(), model.cfg.grad_clip_norm) + self.optimizer.step() # Record visualization for the last iteration - if rank == 0 and record_summary and process_bar.n == process_bar.total - 1: - boxes = model.inference_end(results, data) + if record_summary and process_bar.n == process_bar.total - 1: + if self.distributed: + boxes = model.module.inference_end(results, data) + else: + boxes = model.inference_end(results, data) self.summary['train'] = self.get_3d_summary(boxes, data, epoch, @@ -366,11 +389,15 @@ def run_train(self): process_bar.set_description(desc) process_bar.refresh() + if self.distributed: + dist.barrier() + if self.scheduler is not None: self.scheduler.step() # --------------------- validation - if rank == 0 and (epoch % cfg.get("validation_freq", 1)) == 0: + # if rank == 0 and (epoch % cfg.get("validation_freq", 1)) == 0: + if epoch % cfg.get("validation_freq", 1) == 0: self.run_valid() if rank == 0: diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 8a9e34564..84ffa2919 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -5,6 +5,7 @@ import pprint import os import torch.distributed as dist +from torch import multiprocessing from pathlib import Path @@ -219,4 +220,5 @@ def main_worker(rank, Dataset, Model, Pipeline, cfg_dict_dataset, if __name__ == '__main__': + multiprocessing.set_start_method('spawn') main() From 29b972906a099996541e0c897737cbc688b65010 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Mon, 7 Feb 2022 17:18:34 +0530 Subject: [PATCH 11/24] update config --- ml3d/configs/pointpillars_waymo.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ml3d/configs/pointpillars_waymo.yml b/ml3d/configs/pointpillars_waymo.yml index 4e18a5b4c..cd35e21f6 100644 --- a/ml3d/configs/pointpillars_waymo.yml +++ b/ml3d/configs/pointpillars_waymo.yml @@ -37,7 +37,7 @@ model: scatter: in_channels: 64 - output_shape: [468, 468] + output_shape: [520, 520] backbone: in_channels: 64 @@ -57,14 +57,14 @@ model: nms_pre: 4096 score_thr: 0.1 ranges: [ - [-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], - [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], - [-74.88, -74.88, 0, 74.88, 74.88, 0], + [-80, -80, 1.142, 85, 85, 1.142], + [-80, -80, 1.139, 85, 85, 1.139], + [-80, -80, 1.149, 85, 85, 1.149], ] sizes: [ - [2.08, 4.73, 1.77], # car - [0.84, 1.81, 1.77], # cyclist - [0.84, 0.91, 1.74] # pedestrian + [1.98, 4.50, 1.96], # VEHICLE + [0.91, 1.94, 1.78], # CYCLIST + [0.84, 0.91, 1.70] # PEDESTRIAN ] dir_offset: 0.7854 rotations: [0, 1.57] From 672248d56638f62aa50770c569e1a5258ed5f8ad Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Tue, 8 Feb 2022 00:41:42 -0800 Subject: [PATCH 12/24] gather in run_valid --- ml3d/torch/pipelines/object_detection.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index ebbfe09b7..cff670f92 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -227,6 +227,22 @@ def run_valid(self, epoch=0): similar_classes = cfg.get("similar_classes", {}) difficulties = cfg.get("difficulties", [0]) + if self.distributed: + gt_gather = [None for _ in range(dist.get_world_size())] + pred_gather = [None for _ in range(dist.get_world_size())] + + dist.gather_object(gt, gt_gather if self.rank == 0 else None, dst=0) + dist.gather_object(pred, + pred_gather if self.rank == 0 else None, + dst=0) + + if self.rank == 0: + gt = sum(gt_gather, []) + pred = sum(pred_gather, []) + + if self.rank != 0: + return + ap = mAP(pred, gt, model.classes, @@ -399,6 +415,8 @@ def run_train(self): # if rank == 0 and (epoch % cfg.get("validation_freq", 1)) == 0: if epoch % cfg.get("validation_freq", 1) == 0: self.run_valid() + if self.distributed: + dist.barrier() if rank == 0: self.save_logs(writer, epoch) @@ -528,6 +546,8 @@ def load_ckpt(self, ckpt_path=None, is_resume=True): train_ckpt_dir = join(self.cfg.logs_dir, 'checkpoint') if self.rank == 0: make_dir(train_ckpt_dir) + if self.distributed: + dist.barrier() epoch = 0 if ckpt_path is None: From 620b35c9fdb600bddaa0662e29ad24b070169d28 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Wed, 9 Feb 2022 11:39:58 -0800 Subject: [PATCH 13/24] fix preprocessing --- ml3d/datasets/augment/augmentation.py | 2 +- ml3d/datasets/waymo.py | 17 +++++++---------- ml3d/torch/pipelines/base_pipeline.py | 11 ++++++----- scripts/collect_bboxes.py | 22 +++++++++++++++++++--- scripts/preprocess_waymo.py | 22 +++++++++++----------- 5 files changed, 44 insertions(+), 30 deletions(-) diff --git a/ml3d/datasets/augment/augmentation.py b/ml3d/datasets/augment/augmentation.py index c11f13452..c7ca1244f 100644 --- a/ml3d/datasets/augment/augmentation.py +++ b/ml3d/datasets/augment/augmentation.py @@ -484,7 +484,7 @@ def ObjectSample(self, data, db_boxes_dict, sample_dict): sampled_points = np.concatenate( [box.points_inside_box for box in sampled], axis=0) points = remove_points_in_boxes(points, sampled) - points = np.concatenate([sampled_points, points], axis=0) + points = np.concatenate([sampled_points[:, :4], points], axis=0) return { 'point': points, diff --git a/ml3d/datasets/waymo.py b/ml3d/datasets/waymo.py index 8d1690b91..13cc098b2 100644 --- a/ml3d/datasets/waymo.py +++ b/ml3d/datasets/waymo.py @@ -29,7 +29,6 @@ def __init__(self, name='Waymo', cache_dir='./logs/cache', use_cache=False, - val_split=3, **kwargs): """Initialize the function by passing the dataset and other details. @@ -38,7 +37,6 @@ def __init__(self, name: The name of the dataset (Waymo in this case). cache_dir: The directory where the cache is stored. use_cache: Indicates if the dataset should be cached. - val_split: The split value to get a set of images for training, validation, for testing. Returns: class: The corresponding class. @@ -47,7 +45,6 @@ def __init__(self, name=name, cache_dir=cache_dir, use_cache=use_cache, - val_split=val_split, **kwargs) cfg = self.cfg @@ -63,15 +60,15 @@ def __init__(self, self.val_files = [] for f in self.all_files: - idx = Path(f).name.replace('.bin', '')[:3] - idx = int(idx) - if idx < cfg.val_split: + if 'train' in f: self.train_files.append(f) - else: + elif 'val' in f: self.val_files.append(f) - - self.test_files = glob( - join(cfg.dataset_path, 'testing', 'velodyne', '*.bin')) + elif 'test' in f: + self.test_files.append(f) + else: + log.warning( + f"Skipping {f}, prefix must be one of train, test or val.") @staticmethod def get_label_to_names(): diff --git a/ml3d/torch/pipelines/base_pipeline.py b/ml3d/torch/pipelines/base_pipeline.py index e76efca8e..2c633c761 100644 --- a/ml3d/torch/pipelines/base_pipeline.py +++ b/ml3d/torch/pipelines/base_pipeline.py @@ -39,16 +39,17 @@ def __init__(self, self.dataset = dataset self.rng = np.random.default_rng(kwargs.get('seed', None)) - make_dir(self.cfg.main_log_dir) + self.distributed = distributed + self.rank = kwargs.get('rank', 0) + dataset_name = dataset.name if dataset is not None else '' self.cfg.logs_dir = join( self.cfg.main_log_dir, model.__class__.__name__ + '_' + dataset_name + '_torch') - make_dir(self.cfg.logs_dir) - self.distributed = distributed - - self.rank = kwargs.get('rank', 0) + if self.rank == 0: + make_dir(self.cfg.main_log_dir) + make_dir(self.cfg.logs_dir) if device == 'cpu' or not torch.cuda.is_available(): if distributed: diff --git a/scripts/collect_bboxes.py b/scripts/collect_bboxes.py index 6865cbcd8..fd73f6b7a 100644 --- a/scripts/collect_bboxes.py +++ b/scripts/collect_bboxes.py @@ -1,6 +1,8 @@ from os.path import join import argparse import pickle +import random +from tqdm import tqdm from open3d.ml.datasets import utils from open3d.ml import datasets import multiprocessing @@ -26,6 +28,13 @@ def parse_args(): type=int, default=multiprocessing.cpu_count(), required=False) + parser.add_argument( + '--max_pc', + help= + 'Boxes from random N pointclouds will be saved. Default None(save from whole dataset).', + type=int, + default=None, + required=False) args = parser.parse_args() @@ -77,11 +86,18 @@ def process_boxes(i): classname = getattr(datasets, args.dataset_type) dataset = classname(args.dataset_path) train = dataset.get_split('train') + max_pc = len(train) if args.max_pc is None else args.max_pc + + query_pc = range(len(train)) if max_pc >= len(train) else random.sample( + range(len(train)), max_pc) - print("Found", len(train), "traning samples") - print("This may take a few minutes...") + print(f"Found {len(train)} traning samples, Using {max_pc}") + print( + f"Using {args.num_cpus} number of cpus, This may take a few minutes...") with multiprocessing.Pool(args.num_cpus) as p: - bboxes = p.map(process_boxes, range(len(train))) + bboxes = list(tqdm(p.imap(process_boxes, query_pc), + total=len(query_pc))) bboxes = [e for l in bboxes for e in l] file = open(join(out_path, 'bboxes.pkl'), 'wb') pickle.dump(bboxes, file) + print(f"Saved {len(bboxes)} boxes.") diff --git a/scripts/preprocess_waymo.py b/scripts/preprocess_waymo.py index 290c1768f..d502d8e3d 100644 --- a/scripts/preprocess_waymo.py +++ b/scripts/preprocess_waymo.py @@ -37,10 +37,10 @@ def parse_args(): default=16, type=int) - parser.add_argument('--is_test', - help='True for processing test data (default False)', - default=False, - type=bool) + parser.add_argument('--split', + help='One of {train, val, test} (default train)', + default='train', + type=str) args = parser.parse_args() @@ -65,9 +65,9 @@ class Waymo2KITTI(): is_test (bool): Whether in the test_mode. Default: False. """ - def __init__(self, dataset_path, save_dir='', workers=8, is_test=False): + def __init__(self, dataset_path, save_dir='', workers=8, split='train'): - self.write_image = True + self.write_image = False self.filter_empty_3dboxes = True self.filter_no_label_zone_points = True @@ -85,8 +85,8 @@ def __init__(self, dataset_path, save_dir='', workers=8, is_test=False): self.dataset_path = dataset_path self.save_dir = save_dir self.workers = int(workers) - self.is_test = is_test - self.prefix = '' + self.is_test = split == 'test' + self.prefix = split + '_' self.save_track_id = False self.tfrecord_files = sorted( @@ -152,8 +152,6 @@ def __len__(self): return len(self.tfrecord_files) def save_image(self, frame, file_idx, frame_idx): - self.prefix = '' - for img in frame.images: img_path = Path(self.image_save_dir + str(img.name - 1)) / ( self.prefix + str(file_idx).zfill(3) + str(frame_idx).zfill(3) + @@ -453,6 +451,8 @@ def cart_to_homo(mat): out_path = args.out_path if out_path is None: args.out_path = args.dataset_path + if args.split not in ['train', 'val', 'test']: + raise ValueError("split must be one of {train, val, test}") converter = Waymo2KITTI(args.dataset_path, args.out_path, args.workers, - args.is_test) + args.split) converter.convert() From 1c64d652a38990ecd280a223f1a5bc71a2eca5a7 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 18 Feb 2022 02:48:44 -0800 Subject: [PATCH 14/24] add shuffle --- ml3d/datasets/waymo.py | 4 ++++ ml3d/torch/pipelines/object_detection.py | 6 +++--- scripts/run_pipeline.py | 2 -- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ml3d/datasets/waymo.py b/ml3d/datasets/waymo.py index 13cc098b2..d64331211 100644 --- a/ml3d/datasets/waymo.py +++ b/ml3d/datasets/waymo.py @@ -53,6 +53,7 @@ def __init__(self, self.dataset_path = cfg.dataset_path self.num_classes = 4 self.label_to_names = self.get_label_to_names() + self.shuffle = kwargs.get('shuffle', False) self.all_files = sorted( glob(join(cfg.dataset_path, 'velodyne', '*.bin'))) @@ -69,6 +70,9 @@ def __init__(self, else: log.warning( f"Skipping {f}, prefix must be one of train, test or val.") + if self.shuffle: + log.info("Shuffling training files...") + self.rng.shuffle(self.train_files) @staticmethod def get_label_to_names(): diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index cff670f92..60e6d7ec1 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -155,8 +155,8 @@ def run_valid(self, epoch=0): log.info("DEVICE : {}".format(device)) log_file_path = join(cfg.logs_dir, 'log_valid_' + timestamp + '.txt') - log.info("Logging in file : {}".format(log_file_path)) if self.rank == 0: + log.info("Logging in file : {}".format(log_file_path)) log.addHandler(logging.FileHandler(log_file_path)) batcher = ConcatBatcher(device, model.cfg.name) @@ -207,12 +207,12 @@ def run_valid(self, epoch=0): boxes = model.inference_end(results, data) pred.extend([BEVBox3D.to_dicts(b) for b in boxes]) gt.extend([BEVBox3D.to_dicts(b) for b in data.bbox_objs]) - # Save only for the first batch - if record_summary and 'valid' not in self.summary: + if record_summary: self.summary['valid'] = self.get_3d_summary(boxes, data, epoch, results=results) + record_summary = False # Save only for the first batch sum_loss = 0 desc = "validation - " diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 84ffa2919..0495fedd9 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -152,8 +152,6 @@ def main(): args.device_ids) > 1 if not args.distributed: - # print("not distr : ") - # exit(0) dataset = Dataset(**cfg_dict_dataset) model = Model(**cfg_dict_model, mode=args.mode) pipeline = Pipeline(model, dataset, **cfg_dict_pipeline) From a22a8dce59175103977da4971481c748a03f0440 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 18 Feb 2022 17:29:16 +0530 Subject: [PATCH 15/24] fix rng --- ml3d/torch/pipelines/base_pipeline.py | 3 ++- scripts/run_pipeline.py | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ml3d/torch/pipelines/base_pipeline.py b/ml3d/torch/pipelines/base_pipeline.py index 2c633c761..a032714a9 100644 --- a/ml3d/torch/pipelines/base_pipeline.py +++ b/ml3d/torch/pipelines/base_pipeline.py @@ -24,6 +24,7 @@ def __init__(self, model: A network model. dataset: A dataset, or None for inference model. device: 'gpu' or 'cpu'. + distributed: Whether to use multiple gpus. kwargs: Returns: @@ -59,7 +60,7 @@ def __init__(self, else: if distributed: self.device = torch.device(device) - print("Using device", self.device) + print(f"Rank : {self.rank} using device : {self.device}") torch.cuda.set_device(self.device) else: self.device = torch.device('cuda') diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 0495fedd9..0bcf31e30 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -191,6 +191,11 @@ def main_worker(rank, Dataset, Model, Pipeline, cfg_dict_dataset, cfg_dict_model['rank'] = rank cfg_dict_pipeline['rank'] = rank + rng = np.random.default_rng(args.seed + rank) + cfg_dict_dataset['seed'] = rng + cfg_dict_model['seed'] = rng + cfg_dict_pipeline['seed'] = rng + device = f"cuda:{args.device_ids[rank]}" print(f"rank = {rank}, world_size = {world_size}, gpu = {device}") @@ -219,4 +224,4 @@ def main_worker(rank, Dataset, Model, Pipeline, cfg_dict_dataset, if __name__ == '__main__': multiprocessing.set_start_method('spawn') - main() + sys.exit(main()) From 685dd3aa3993a9a65f019153adb4f2b3933c4f01 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 18 Feb 2022 17:31:18 +0530 Subject: [PATCH 16/24] remove customparallel --- ml3d/torch/pipelines/object_detection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index 60e6d7ec1..d8df902bc 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -11,7 +11,6 @@ from torch.utils.data import DataLoader from .base_pipeline import BasePipeline -from .dataparallel import CustomDataParallel from ..dataloaders import TorchDataloader, ConcatBatcher from torch.utils.tensorboard import SummaryWriter # pylint: disable-next=unused-import From bfaa4a2dec9968347e20f0e71e783b7db6147c97 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 18 Feb 2022 17:44:42 +0530 Subject: [PATCH 17/24] reset semseg distributed training --- ml3d/torch/pipelines/base_pipeline.py | 5 +++++ ml3d/torch/pipelines/object_detection.py | 19 ++++++++++--------- ml3d/torch/pipelines/semantic_segmentation.py | 11 +++++------ 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/ml3d/torch/pipelines/base_pipeline.py b/ml3d/torch/pipelines/base_pipeline.py index a032714a9..aed40e36f 100644 --- a/ml3d/torch/pipelines/base_pipeline.py +++ b/ml3d/torch/pipelines/base_pipeline.py @@ -41,6 +41,11 @@ def __init__(self, self.rng = np.random.default_rng(kwargs.get('seed', None)) self.distributed = distributed + if self.distributed and self.name == "SemanticSegmentation": + raise NotImplementedError( + "Distributed training not implemented for SemanticSegmentation!" + ) + self.rank = kwargs.get('rank', 0) dataset_name = dataset.name if dataset is not None else '' diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index d8df902bc..91a570676 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -312,15 +312,16 @@ def run_train(self): else: train_sampler = None - train_loader = DataLoader(train_split, - batch_size=cfg.batch_size, - num_workers=cfg.get('num_workers', 0), - pin_memory=cfg.get('pin_memory', False), - collate_fn=batcher.collate_fn, - sampler=train_sampler) - # worker_init_fn=lambda x: np.random.seed(x + np.uint32( - # torch.utils.data.get_worker_info().seed)) - # ) # numpy expects np.uint32, whereas torch returns np.uint64. + train_loader = DataLoader( + train_split, + batch_size=cfg.batch_size, + num_workers=cfg.get('num_workers', 0), + pin_memory=cfg.get('pin_memory', False), + collate_fn=batcher.collate_fn, + sampler=train_sampler, + worker_init_fn=lambda x: np.random.seed(x + np.uint32( + torch.utils.data.get_worker_info().seed)) + ) # numpy expects np.uint32, whereas torch returns np.uint64. self.optimizer, self.scheduler = model.get_optimizer(cfg.optimizer) diff --git a/ml3d/torch/pipelines/semantic_segmentation.py b/ml3d/torch/pipelines/semantic_segmentation.py index d56ddc48b..227b77722 100644 --- a/ml3d/torch/pipelines/semantic_segmentation.py +++ b/ml3d/torch/pipelines/semantic_segmentation.py @@ -12,7 +12,6 @@ # pylint: disable-next=unused-import from open3d.visualization.tensorboard_plugin import summary from .base_pipeline import BasePipeline -from .dataparallel import CustomDataParallel from ..dataloaders import get_sampler, TorchDataloader, DefaultBatcher, ConcatBatcher from ..utils import latest_torch_ckpt from ..modules.losses import SemSegLoss, filter_valid_label @@ -102,7 +101,6 @@ def __init__( momentum=0.98, main_log_dir='./logs/', device='cuda', - device_ids=[0], split='train', train_sum_dir='train_log', **kwargs): @@ -122,7 +120,6 @@ def __init__( momentum=momentum, main_log_dir=main_log_dir, device=device, - device_ids=device_ids, split=split, train_sum_dir=train_sum_dir, **kwargs) @@ -309,6 +306,7 @@ def run_train(self): dataset = self.dataset cfg = self.cfg + model.to(device) log.info("DEVICE : {}".format(device)) timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') @@ -379,9 +377,6 @@ def run_train(self): writer = SummaryWriter(self.tensorboard_dir) self.save_config(writer) - - model = CustomDataParallel(model, device_ids=self.device_ids) - log.info("Writing summary in {}.".format(self.tensorboard_dir)) record_summary = cfg.get('summary').get('record_for', []) @@ -397,6 +392,8 @@ def run_train(self): model.trans_point_sampler = train_sampler.get_point_sampler() for step, inputs in enumerate(tqdm(train_loader, desc='training')): + if hasattr(inputs['data'], 'to'): + inputs['data'].to(device) self.optimizer.zero_grad() results = model(inputs['data']) loss, gt_labels, predict_scores = model.get_loss( @@ -429,6 +426,8 @@ def run_train(self): with torch.no_grad(): for step, inputs in enumerate( tqdm(valid_loader, desc='validation')): + if hasattr(inputs['data'], 'to'): + inputs['data'].to(device) results = model(inputs['data']) loss, gt_labels, predict_scores = model.get_loss( From 6d58cd12faee716feb0385dd020e3e85880d1ae1 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Tue, 22 Feb 2022 01:47:35 -0800 Subject: [PATCH 18/24] change config --- ml3d/configs/pointpillars_waymo.yml | 51 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/ml3d/configs/pointpillars_waymo.yml b/ml3d/configs/pointpillars_waymo.yml index cd35e21f6..13b35fdd9 100644 --- a/ml3d/configs/pointpillars_waymo.yml +++ b/ml3d/configs/pointpillars_waymo.yml @@ -2,7 +2,7 @@ dataset: name: Waymo dataset_path: # path/to/your/dataset cache_dir: ./logs/cache - steps_per_epoch_train: 5000 + steps_per_epoch_train: 4000 model: name: PointPillars @@ -10,7 +10,7 @@ model: batcher: "ignore" - point_cloud_range: [-80, -80, -6, 85, 85, 7] + point_cloud_range: [-74.88, -74.88, -2, 74.88, 74.88, 4] classes: ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] loss: @@ -37,13 +37,13 @@ model: scatter: in_channels: 64 - output_shape: [520, 520] + output_shape: [468, 468] backbone: in_channels: 64 out_channels: [64, 128, 256] layer_nums: [3, 5, 5] - layer_strides: [2, 2, 2] + layer_strides: [1, 2, 2] neck: in_channels: [64, 128, 256] @@ -57,38 +57,39 @@ model: nms_pre: 4096 score_thr: 0.1 ranges: [ - [-80, -80, 1.142, 85, 85, 1.142], - [-80, -80, 1.139, 85, 85, 1.139], - [-80, -80, 1.149, 85, 85, 1.149], + [-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], + [-74.88, -74.88, 0, 74.88, 74.88, 0], ] sizes: [ - [1.98, 4.50, 1.96], # VEHICLE - [0.91, 1.94, 1.78], # CYCLIST - [0.84, 0.91, 1.70] # PEDESTRIAN + [2.08, 4.73, 1.77], # VEHICLE + [0.84, 1.81, 1.77], # CYCLIST + [0.84, 0.91, 1.74] # PEDESTRIAN ] dir_offset: 0.7854 rotations: [0, 1.57] iou_thr: [[0.4, 0.55], [0.3, 0.5], [0.3, 0.5]] - augment: {} - # PointShuffle: True - # ObjectRangeFilter: True - # ObjectSample: - # min_points_dict: - # VEHICLE: 5 - # PEDESTRIAN: 10 - # CYCLIST: 10 - # sample_dict: - # VEHICLE: 15 - # PEDESTRIAN: 10 - # CYCLIST: 10 + augment: + PointShuffle: True + ObjectRangeFilter: + point_cloud_range: [-74.88, -74.88, -2, 74.88, 74.88, 4] + ObjectSample: + min_points_dict: + VEHICLE: 5 + PEDESTRIAN: 10 + CYCLIST: 10 + sample_dict: + VEHICLE: 15 + PEDESTRIAN: 10 + CYCLIST: 10 pipeline: name: ObjectDetection test_compute_metric: true batch_size: 6 - val_batch_size: 1 + val_batch_size: 6 test_batch_size: 1 save_ckpt_freq: 5 max_epoch: 200 @@ -102,10 +103,10 @@ pipeline: weight_decay: 0.01 # evaluation properties - overlaps: [0.5, 0.5, 0.7] + overlaps: [0.5, 0.5, 0.5] difficulties: [0, 1, 2] summary: - record_for: [] + record_for: [train, valid] max_pts: use_reference: false max_outputs: 1 From 51a16c376259ab7d4410e4f4590a2a1090b55bb7 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Tue, 22 Feb 2022 19:32:46 +0530 Subject: [PATCH 19/24] fix lgtm --- ml3d/torch/pipelines/dataparallel.py | 1 - scripts/run_pipeline.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/ml3d/torch/pipelines/dataparallel.py b/ml3d/torch/pipelines/dataparallel.py index 1b36c431a..eee68138d 100644 --- a/ml3d/torch/pipelines/dataparallel.py +++ b/ml3d/torch/pipelines/dataparallel.py @@ -1,5 +1,4 @@ import torch -import numpy as np from torch.nn.parallel import DataParallel diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index 504d21e97..a1b7311f4 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -212,8 +212,6 @@ def main_worker(rank, Dataset, Model, Pipeline, cfg_dict_dataset, distributed=args.distributed, **cfg_dict_pipeline) - with open(Path(__file__).parent / 'README.md', 'r') as f: - readme = f.read() pipeline.cfg_tb = args.cfg_tb if args.split == 'test': From 50133bb8db163f48997c8b3461e73dc037d8681f Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 25 Mar 2022 20:23:09 +0530 Subject: [PATCH 20/24] address reviews (1) --- ml3d/configs/pointpillars_waymo.yml | 2 +- ml3d/datasets/waymo.py | 28 ++++++++--------- ml3d/torch/pipelines/base_pipeline.py | 6 ++-- ml3d/torch/pipelines/object_detection.py | 3 +- scripts/collect_bboxes.py | 14 +++++---- scripts/preprocess_waymo.py | 38 ++++++++++++++---------- scripts/run_pipeline.py | 24 +++++++++++---- 7 files changed, 67 insertions(+), 48 deletions(-) diff --git a/ml3d/configs/pointpillars_waymo.yml b/ml3d/configs/pointpillars_waymo.yml index 13b35fdd9..23900ed15 100644 --- a/ml3d/configs/pointpillars_waymo.yml +++ b/ml3d/configs/pointpillars_waymo.yml @@ -106,7 +106,7 @@ pipeline: overlaps: [0.5, 0.5, 0.5] difficulties: [0, 1, 2] summary: - record_for: [train, valid] + record_for: [] max_pts: use_reference: false max_outputs: 1 diff --git a/ml3d/datasets/waymo.py b/ml3d/datasets/waymo.py index 6a01cf418..bceacd23f 100644 --- a/ml3d/datasets/waymo.py +++ b/ml3d/datasets/waymo.py @@ -91,18 +91,17 @@ def read_lidar(path): """Reads lidar data from the path provided. Returns: - A data object with lidar information. + pc: pointcloud data with shape [N, 6], where + the format is xyzRGB. """ - assert Path(path).exists() - return np.fromfile(path, dtype=np.float32).reshape(-1, 6) @staticmethod def read_label(path, calib): - """Reads labels of bound boxes. + """Reads labels of bounding boxes. Returns: - The data objects with bound boxes information. + The data objects with bounding boxes information. """ if not Path(path).exists(): return None @@ -132,24 +131,22 @@ def read_calib(path): Returns: The camera and the camera image used in calibration. """ - assert Path(path).exists() - with open(path, 'r') as f: lines = f.readlines() obj = lines[0].strip().split(' ')[1:] - P0 = np.array(obj, dtype=np.float32) + unused_P0 = np.array(obj, dtype=np.float32) obj = lines[1].strip().split(' ')[1:] - P1 = np.array(obj, dtype=np.float32) + unused_P1 = np.array(obj, dtype=np.float32) obj = lines[2].strip().split(' ')[1:] P2 = np.array(obj, dtype=np.float32) obj = lines[3].strip().split(' ')[1:] - P3 = np.array(obj, dtype=np.float32) + unused_P3 = np.array(obj, dtype=np.float32) obj = lines[4].strip().split(' ')[1:] - P4 = np.array(obj, dtype=np.float32) + unused_P4 = np.array(obj, dtype=np.float32) obj = lines[5].strip().split(' ')[1:] R0 = np.array(obj, dtype=np.float32).reshape(3, 3) @@ -210,7 +207,7 @@ def get_split_list(self, split): else: raise ValueError("Invalid split {}".format(split)) - def is_tested(): + def is_tested(attr): """Checks if a datum in the dataset has been tested. Args: @@ -220,16 +217,16 @@ def is_tested(): If the datum attribute is tested, then return the path where the attribute is stored; else, returns false. """ - pass + raise NotImplementedError() - def save_test_result(): + def save_test_result(results, attr): """Saves the output of a model. Args: results: The output of a model for the datum associated with the attribute passed. attr: The attributes that correspond to the outputs passed in results. """ - pass + raise NotImplementedError() class WaymoSplit(): @@ -279,6 +276,7 @@ class Object3d(BEVBox3D): """ def __init__(self, center, size, label, calib): + # ground truth files doesn't have confidence value. confidence = float(label[15]) if label.__len__() == 16 else -1.0 world_cam = calib['world_cam'] diff --git a/ml3d/torch/pipelines/base_pipeline.py b/ml3d/torch/pipelines/base_pipeline.py index aed40e36f..f466868b9 100644 --- a/ml3d/torch/pipelines/base_pipeline.py +++ b/ml3d/torch/pipelines/base_pipeline.py @@ -23,7 +23,7 @@ def __init__(self, Args: model: A network model. dataset: A dataset, or None for inference model. - device: 'gpu' or 'cpu'. + device: 'cuda' or 'cpu'. distributed: Whether to use multiple gpus. kwargs: @@ -59,8 +59,8 @@ def __init__(self, if device == 'cpu' or not torch.cuda.is_available(): if distributed: - raise ValueError( - "Distributed training is ON, but CUDA not available.") + raise NotImplementedError( + "Distributed training for CPU is not supported yet.") self.device = torch.device('cpu') else: if distributed: diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index 11bb87c99..6646ac225 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -176,8 +176,6 @@ def run_valid(self, epoch=0): pin_memory=cfg.get('pin_memory', False), collate_fn=batcher.collate_fn, sampler=valid_sampler) - # worker_init_fn=lambda x: np.random.seed(x + np.uint32( - # torch.utils.data.get_worker_info().seed))) record_summary = self.rank == 0 and 'valid' in cfg.get('summary').get( 'record_for', []) @@ -341,6 +339,7 @@ def run_train(self): model.cuda(self.device) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.device]) + # model.get_loss = model.module.get_loss record_summary = self.rank == 0 and 'train' in cfg.get('summary').get( 'record_for', []) diff --git a/scripts/collect_bboxes.py b/scripts/collect_bboxes.py index 609020851..f8c6a5f33 100644 --- a/scripts/collect_bboxes.py +++ b/scripts/collect_bboxes.py @@ -1,12 +1,13 @@ import logging -from os.path import join import argparse import pickle -import random +import numpy as np +import multiprocessing + from tqdm import tqdm +from os.path import join from open3d.ml.datasets import utils from open3d.ml import datasets -import multiprocessing def parse_args(): @@ -25,7 +26,7 @@ def parse_args(): default="KITTI", required=False) parser.add_argument('--num_cpus', - help='Name of dataset class', + help='Number of threads to use.', type=int, default=multiprocessing.cpu_count(), required=False) @@ -95,8 +96,9 @@ def process_boxes(i): train = dataset.get_split('train') max_pc = len(train) if args.max_pc is None else args.max_pc - query_pc = range(len(train)) if max_pc >= len(train) else random.sample( - range(len(train)), max_pc) + rng = np.random.default_rng() + query_pc = range(len(train)) if max_pc >= len(train) else rng.choice( + range(len(train)), max_pc, replace=False) print(f"Found {len(train)} traning samples, Using {max_pc}") print( diff --git a/scripts/preprocess_waymo.py b/scripts/preprocess_waymo.py index bf5afb24f..6b44a61fd 100644 --- a/scripts/preprocess_waymo.py +++ b/scripts/preprocess_waymo.py @@ -8,13 +8,13 @@ import logging import numpy as np import os, sys, glob, pickle -from pathlib import Path -from os.path import join, exists, dirname, abspath -from os import makedirs -import random import argparse import tensorflow as tf import matplotlib.image as mpimg + +from pathlib import Path +from os.path import join, exists, dirname, abspath +from os import makedirs from multiprocessing import Pool from tqdm import tqdm from waymo_open_dataset.utils import range_image_utils, transform_utils @@ -58,6 +58,25 @@ class Waymo2KITTI(): """Waymo to KITTI converter. This class converts tfrecord files from Waymo dataset to KITTI format. + KITTI format : (type, truncated, occluded, alpha, bbox, dimensions(3), location(3), + rotation_y(1), score(1, optional)) + type (string): Describes the type of object. + truncated (float): Ranges from 0(non-truncated) to 1(truncated). + occluded (int): Integer(0, 1, 2, 3) signifies state fully visible, partly + occluded, largely occluded, unknown. + alpha (float): Observation angle of object, ranging [-pi..pi]. + bbox (float): 2d bounding box of object in the image. + dimensions (float): 3D object dimensions: h, w, l in meters. + location (float): 3D object location: x,y,z in camera coordinates (in meters). + rotation_y (float): rotation around Y-axis in camera coordinates [-pi..pi]. + score (float): Only for predictions, indicating confidence in detection. + + Conversion writes following files: + pointcloud(np.float32) : pointcloud data with shape [N, 6]. Consists of + (x, y, z, intensity, elongation, timestamp). + images(np.uint8): camera images are saved if `write_image` is True. + calibrations(np.float32): Intinsic and Extrinsic matrix for all cameras. + label(np.float32): Bounding box information in KITTI format. Args: dataset_path (str): Directory to load waymo raw data. @@ -137,7 +156,6 @@ def process_one(self, file_idx): if (self.selected_waymo_locations is not None and frame.context.stats.location not in self.selected_waymo_locations): - print("continue") continue if self.write_image: @@ -208,7 +226,6 @@ def save_calib(self, frame, file_idx, frame_idx): f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'w+') as fp_calib: fp_calib.write(calib_context) - fp_calib.close() def save_pose(self, frame, file_idx, frame_idx): pose = np.array(frame.pose.transform).reshape(4, 4) @@ -226,7 +243,6 @@ def save_label(self, frame, file_idx, frame_idx): for labels in frame.projected_lidar_labels: name = labels.name for label in labels.labels: - # TODO: need a workaround as bbox may not belong to front cam bbox = [ label.box.center_x - label.box.length / 2, label.box.center_y - label.box.width / 2, @@ -255,9 +271,6 @@ def save_label(self, frame, file_idx, frame_idx): if my_type not in self.selected_waymo_classes: continue - # if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: - # continue - height = obj.box.height width = obj.box.width length = obj.box.length @@ -266,11 +279,6 @@ def save_label(self, frame, file_idx, frame_idx): y = obj.box.center_y z = obj.box.center_z - # # project bounding box to the virtual reference frame - # pt_ref = self.T_velo_to_front_cam @ \ - # np.array([x, y, z, 1]).reshape((4, 1)) - # x, y, z, _ = pt_ref.flatten().tolist() - rotation_y = -obj.box.heading - np.pi / 2 track_id = obj.id diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index a1b7311f4..68564e1b9 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -43,6 +43,18 @@ def parse_args(): parser.add_argument('--main_log_dir', help='the dir to save logs and models') parser.add_argument('--seed', help='random seed', default=0) + parser.add_argument( + '--host', + help='Host for distributed training, default: localhost', + default='localhost') + parser.add_argument('--port', + help='port for distributed training, default: 12355', + default='12355') + parser.add_argument( + '--backend', + help= + 'backend for distributed training. One of (nccl, gloo)}, default: gloo', + default='gloo') args, unknown = parser.parse_known_args() @@ -173,12 +185,12 @@ def main(): nprocs=len(args.device_ids)) -def setup(rank, world_size): - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12355' +def setup(rank, world_size, args): + os.environ['MASTER_ADDR'] = args.host + os.environ['MASTER_PORT'] = args.port # initialize the process group - dist.init_process_group("gloo", rank=rank, world_size=world_size) + dist.init_process_group(args.backend, rank=rank, world_size=world_size) def cleanup(): @@ -188,7 +200,7 @@ def cleanup(): def main_worker(rank, Dataset, Model, Pipeline, cfg_dict_dataset, cfg_dict_model, cfg_dict_pipeline, args): world_size = len(args.device_ids) - setup(rank, world_size) + setup(rank, world_size, args) cfg_dict_dataset['rank'] = rank cfg_dict_model['rank'] = rank @@ -229,5 +241,5 @@ def main_worker(rank, Dataset, Model, Pipeline, cfg_dict_dataset, format='%(levelname)s - %(asctime)s - %(module)s - %(message)s', ) - multiprocessing.set_start_method('spawn') + multiprocessing.set_start_method('forkserver') sys.exit(main()) From 1ed11c8841bbb4423dc82f553a4029c03fcec66e Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Fri, 25 Mar 2022 20:32:02 +0530 Subject: [PATCH 21/24] fix model.module.... --- ml3d/torch/pipelines/object_detection.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index 6646ac225..0ba26f116 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -339,7 +339,9 @@ def run_train(self): model.cuda(self.device) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.device]) - # model.get_loss = model.module.get_loss + model.get_loss = model.module.get_loss + model.cfg = model.module.cfg + model.inference_end = model.module.inference_end record_summary = self.rank == 0 and 'train' in cfg.get('summary').get( 'record_for', []) @@ -359,10 +361,7 @@ def run_train(self): for data in process_bar: data.to(device) results = model(data) - if self.distributed: - loss = model.module.get_loss(results, data) - else: - loss = model.get_loss(results, data) + loss = model.get_loss(results, data) loss_sum = sum(loss.values()) self.optimizer.zero_grad() @@ -381,10 +380,7 @@ def run_train(self): # Record visualization for the last iteration if record_summary and process_bar.n == process_bar.total - 1: - if self.distributed: - boxes = model.module.inference_end(results, data) - else: - boxes = model.inference_end(results, data) + boxes = model.inference_end(results, data) self.summary['train'] = self.get_3d_summary(boxes, data, epoch, From eb3b5518e6fc4a4e0d531da1905afc773d041d19 Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Tue, 5 Apr 2022 17:13:03 +0530 Subject: [PATCH 22/24] remove dataparallel --- ml3d/torch/pipelines/dataparallel.py | 57 ---------------------------- 1 file changed, 57 deletions(-) delete mode 100644 ml3d/torch/pipelines/dataparallel.py diff --git a/ml3d/torch/pipelines/dataparallel.py b/ml3d/torch/pipelines/dataparallel.py deleted file mode 100644 index eee68138d..000000000 --- a/ml3d/torch/pipelines/dataparallel.py +++ /dev/null @@ -1,57 +0,0 @@ -import torch -from torch.nn.parallel import DataParallel - - -class CustomDataParallel(DataParallel): - """Custom DataParallel method for performing scatter operation - outside of torch's DataParallel. - """ - - def __init__(self, module, **kwargs): - super(CustomDataParallel, self).__init__(module, **kwargs) - self.get_loss = self.module.get_loss - self.cfg = self.module.cfg - - def forward(self, *inputs, **kwargs): - if not self.device_ids: - return self.module(*inputs, **kwargs) - - if len(self.device_ids) == 1: - if hasattr(inputs[0], 'to'): - inputs[0].to(self.device_ids[0]) - return self.module(inputs[0], **kwargs) - - inputs, kwargs = self.customscatter(inputs, kwargs, self.device_ids) - - self.module.cuda() - replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) - outputs = self.parallel_apply(replicas, inputs, kwargs) - - return self.gather(outputs, self.output_device) - - def customscatter(self, inputs, kwargs, device_ids): - """Custom scatter method to override default method. - Scatter batch dimension based on custom scatter implemented - in custom batcher. - - Agrs: - inputs: Object of type custom batcher. - kwargs: Optional keyword arguments. - device_ids: List of device ids. - - Returns: - Returns a list of inputs of length num_devices. - Each input is transfered to different device id. - """ - if not hasattr(inputs[0], 'scatter'): - try: - return self.scatter(inputs, kwargs, device_ids) - except: - raise NotImplementedError( - f"Please implement scatter for {inputs[0]} for multi gpu execution." - ) - inputs = inputs[0].scatter(inputs[0], len(device_ids)) - for i in range(len(inputs)): - inputs[i].to(torch.device(device_ids[i])) - - return inputs, [kwargs for _ in range(len(inputs))] From 77c1a0a62adb1e11e397d958c955bd6d8a034355 Mon Sep 17 00:00:00 2001 From: naruarjun Date: Fri, 10 Jun 2022 19:33:03 +0530 Subject: [PATCH 23/24] Adding test files list initialization (#542) Co-authored-by: Narayanan E.R --- ml3d/datasets/waymo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ml3d/datasets/waymo.py b/ml3d/datasets/waymo.py index bceacd23f..2c1feced9 100644 --- a/ml3d/datasets/waymo.py +++ b/ml3d/datasets/waymo.py @@ -55,6 +55,7 @@ def __init__(self, glob(join(cfg.dataset_path, 'velodyne', '*.bin'))) self.train_files = [] self.val_files = [] + self.test_files = [] for f in self.all_files: if 'train' in f: From 2929b43958c9e9bcc4629e529c99ee88f8caa6fc Mon Sep 17 00:00:00 2001 From: Sanskar Agrawal Date: Tue, 13 Sep 2022 21:05:52 +0530 Subject: [PATCH 24/24] address reviews --- ml3d/datasets/waymo.py | 7 ++++--- ml3d/torch/pipelines/object_detection.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ml3d/datasets/waymo.py b/ml3d/datasets/waymo.py index 2c1feced9..1dd2036ef 100644 --- a/ml3d/datasets/waymo.py +++ b/ml3d/datasets/waymo.py @@ -101,6 +101,10 @@ def read_lidar(path): def read_label(path, calib): """Reads labels of bounding boxes. + Args: + path: The path to the label file. + calib: Calibration as returned by read_calib(). + Returns: The data objects with bounding boxes information. """ @@ -272,9 +276,6 @@ def get_attr(self, idx): class Object3d(BEVBox3D): - """The class stores details that are object-specific, such as bounding box - coordinates, occlusion and so on. - """ def __init__(self, center, size, label, calib): # ground truth files doesn't have confidence value. diff --git a/ml3d/torch/pipelines/object_detection.py b/ml3d/torch/pipelines/object_detection.py index 0ba26f116..04a0cc457 100644 --- a/ml3d/torch/pipelines/object_detection.py +++ b/ml3d/torch/pipelines/object_detection.py @@ -402,7 +402,6 @@ def run_train(self): self.scheduler.step() # --------------------- validation - # if rank == 0 and (epoch % cfg.get("validation_freq", 1)) == 0: if epoch % cfg.get("validation_freq", 1) == 0: self.run_valid() if self.distributed: