diff --git a/embodiedscan/converter/generate_image_3rscan.py b/embodiedscan/converter/generate_image_3rscan.py new file mode 100644 index 0000000..826b0c8 --- /dev/null +++ b/embodiedscan/converter/generate_image_3rscan.py @@ -0,0 +1,27 @@ +import os +import zipfile +from argparse import ArgumentParser +from functools import partial + +import mmengine + + +def process_scene(path, scene_name): + """Process single 3Rscan scene.""" + with zipfile.ZipFile(os.path.join(path, scene_name, 'sequence.zip'), + 'r') as zip_ref: + zip_ref.extractall(os.path.join(path, scene_name, 'sequence')) + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--dataset_folder', + required=True, + help='folder of the dataset.') + parser.add_argument('--nproc', type=int, default=8) + args = parser.parse_args() + + mmengine.track_parallel_progress(func=partial(process_scene, + args.dataset_folder), + tasks=os.listdir(args.dataset_folder), + nproc=args.nproc) diff --git a/embodiedscan/converter/generate_image_scannet.py b/embodiedscan/converter/generate_image_scannet.py new file mode 100644 index 0000000..77c52b2 --- /dev/null +++ b/embodiedscan/converter/generate_image_scannet.py @@ -0,0 +1,189 @@ +# Modified from https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py # noqa +import os +import struct +import zlib +from argparse import ArgumentParser +from functools import partial + +import imageio +import mmengine +import numpy as np + +COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'} + +COMPRESSION_TYPE_DEPTH = { + -1: 'unknown', + 0: 'raw_ushort', + 1: 'zlib_ushort', + 2: 'occi_ushort' +} + + +class RGBDFrame: + """Class for single ScanNet RGB-D image processing.""" + + def load(self, file_handle): + self.camera_to_world = np.asarray(struct.unpack( + 'f' * 16, file_handle.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0] + self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0] + self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.color_data = b''.join( + struct.unpack('c' * self.color_size_bytes, + file_handle.read(self.color_size_bytes))) + self.depth_data = b''.join( + struct.unpack('c' * self.depth_size_bytes, + file_handle.read(self.depth_size_bytes))) + + def decompress_depth(self, compression_type): + assert compression_type == 'zlib_ushort' + return zlib.decompress(self.depth_data) + + def decompress_color(self, compression_type): + assert compression_type == 'jpeg' + return imageio.imread(self.color_data) + + +class SensorData: + """Class for single ScanNet scene processing. + + Single scene file contains multiple RGB-D images. 
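+
+    Example (illustrative; the scene path below is hypothetical):
+        >>> data = SensorData('scans/scene0000_00/scene0000_00.sens', fast=True)
+        >>> data.export_color_images('posed_images/scene0000_00')
+        >>> data.export_poses('posed_images/scene0000_00')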
+ """ + + def __init__(self, filename, fast=False): + self.version = 4 + self.load(filename, fast) + + def load(self, filename, fast): + with open(filename, 'rb') as f: + version = struct.unpack('I', f.read(4))[0] + assert self.version == version + strlen = struct.unpack('Q', f.read(8))[0] + self.sensor_name = b''.join( + struct.unpack('c' * strlen, f.read(strlen))) + self.intrinsic_color = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.extrinsic_color = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.intrinsic_depth = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.extrinsic_depth = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack( + 'i', f.read(4))[0]] + self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack( + 'i', f.read(4))[0]] + self.color_width = struct.unpack('I', f.read(4))[0] + self.color_height = struct.unpack('I', f.read(4))[0] + self.depth_width = struct.unpack('I', f.read(4))[0] + self.depth_height = struct.unpack('I', f.read(4))[0] + self.depth_shift = struct.unpack('f', f.read(4))[0] + num_frames = struct.unpack('Q', f.read(8))[0] + self.num_frames = num_frames + self.frames = [] + if fast: + index = list(range(num_frames))[::10] + else: + index = list(range(num_frames)) + self.index = index + for i in range(num_frames): + frame = RGBDFrame() + frame.load(f) + if i in index: + self.frames.append(frame) + + def export_depth_images(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + for f in range(len(self.frames)): + depth_data = self.frames[f].decompress_depth( + self.depth_compression_type) + depth = np.fromstring(depth_data, dtype=np.uint16).reshape( + self.depth_height, self.depth_width) + imageio.imwrite( + os.path.join(output_path, + self.index_to_str(self.index[f]) + '.png'), depth) + + def export_color_images(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + for f in range(len(self.frames)): + color = self.frames[f].decompress_color( + self.color_compression_type) + imageio.imwrite( + os.path.join(output_path, + self.index_to_str(self.index[f]) + '.jpg'), color) + + @staticmethod + def index_to_str(index): + return str(index).zfill(5) + + @staticmethod + def save_mat_to_file(matrix, filename): + with open(filename, 'w') as f: + for line in matrix: + np.savetxt(f, line[np.newaxis], fmt='%f') + + def export_poses(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + for f in range(len(self.frames)): + self.save_mat_to_file( + self.frames[f].camera_to_world, + os.path.join(output_path, + self.index_to_str(self.index[f]) + '.txt')) + + def export_intrinsics(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + self.save_mat_to_file(self.intrinsic_color, + os.path.join(output_path, 'intrinsic.txt')) + + def export_depth_intrinsics(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + self.save_mat_to_file(self.intrinsic_depth, + os.path.join(output_path, 'depth_intrinsic.txt')) + + +def process_scene(path, fast, idx): + """Process single ScanNet scene. + + Extract RGB images, poses and camera intrinsics. 
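+
+    Example (illustrative; assumes ``./scans/scene0000_00/scene0000_00.sens``
+        exists and writes the outputs to ``posed_images/scene0000_00``):
+        >>> process_scene('scans', False, 'scene0000_00')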
+ """ + data = SensorData(os.path.join(path, idx, f'{idx}.sens'), fast) + output_path = os.path.join('posed_images', idx) + data.export_color_images(output_path) + data.export_intrinsics(output_path) + data.export_poses(output_path) + data.export_depth_images(output_path) + data.export_depth_intrinsics(output_path) + + +def process_directory(path, fast, nproc): + mmengine.track_parallel_progress(func=partial(process_scene, path, fast), + tasks=os.listdir(path), + nproc=nproc) + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--dataset_folder', + default=None, + help='folder of the dataset.') + parser.add_argument('--nproc', type=int, default=8) + parser.add_argument('--fast', action='store_true') + args = parser.parse_args() + + if args.dataset_folder is not None: + os.chdir(args.dataset_folder) + + # process train and val scenes + if os.path.exists('scans'): + process_directory('scans', args.fast, args.nproc) diff --git a/embodiedscan/embodied_dataset.py b/embodiedscan/embodied_dataset.py new file mode 100644 index 0000000..697a02e --- /dev/null +++ b/embodiedscan/embodied_dataset.py @@ -0,0 +1,322 @@ +import os +import warnings +from typing import Callable, List, Optional, Union + +import mmengine +import numpy as np +from mmdet3d.registry import DATASETS +from mmdet3d.structures import get_box_type +from mmengine.dataset import BaseDataset +from mmengine.fileio import load + + +@DATASETS.register_module() +class EmbodiedScanDataset(BaseDataset): + + def __init__(self, + data_root: str, + ann_file: str, + metainfo: Optional[dict] = None, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + load_eval_anns: bool = True, + filter_empty_gt: bool = True, + remove_dontcare: bool = False, + box_type_3d: str = 'Euler-Depth', + **kwargs) -> None: + + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + self.filter_empty_gt = filter_empty_gt + self.remove_dontcare = remove_dontcare + self.load_eval_anns = load_eval_anns + super().__init__(ann_file=ann_file, + metainfo=metainfo, + data_root=data_root, + pipeline=pipeline, + test_mode=test_mode, + **kwargs) + + def process_metainfo(self): + """This function will be processed after metainfos from ann_file and + config are combined.""" + assert 'categories' in self._metainfo + + if 'classes' not in self._metainfo: + self._metainfo.setdefault( + 'classes', list(self._metainfo['categories'].keys())) + + self.label_mapping = np.full( + max(list(self._metainfo['categories'].values())) + 1, + -1, + dtype=int) + for key, value in self._metainfo['categories'].items(): + if key in self._metainfo['classes']: + self.label_mapping[value] = self._metainfo['classes'].index( + key) + + self.occ_label_mapping = np.full( + max(list(self._metainfo['categories'].values())) + 1, + -1, + dtype=int) + if 'occ_classes' in self._metainfo: + for idx, label_name in enumerate(self._metainfo['occ_classes']): + self.occ_label_mapping[self.metainfo['categories'][ + label_name]] = idx + 1 # 1-based, 0 is empty + + def parse_data_info(self, info: dict) -> dict: + """Process the raw data info. + + The only difference with it in `Det3DDataset` + is the specific process for `axis_align_matrix'. + + Args: + info (dict): Raw info dict. + + Returns: + dict: Has `ann_info` in training stage. And + all path has been converted to absolute path. 
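+            Besides ``ann_info``, the dict gathers per-view ``img_path`` and
+            ``depth_img_path`` lists plus a ``depth2img`` dict holding the
+            per-view extrinsics, the intrinsics and an ``origin`` vector.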
+ """ + info['axis_align_matrix'] = self._get_axis_align_matrix(info) + # Because multi-view settings are different from original designs + # we temporarily follow the ori design in ImVoxelNet + info['img_path'] = [] + info['depth_img_path'] = [] + if 'cam2img' in info: + cam2img = info['cam2img'].astype(np.float32) + else: + cam2img = [] + + extrinsics = [] + for i in range(len(info['images'])): + img_path = os.path.join(self.data_prefix.get('img_path', ''), + info['images'][i]['img_path']) + depth_img_path = os.path.join(self.data_prefix.get('img_path', ''), + info['images'][i]['depth_path']) + + info['img_path'].append(img_path) + info['depth_img_path'].append(depth_img_path) + align_global2cam = np.linalg.inv( + info['axis_align_matrix'] @ info['images'][i]['cam2global']) + extrinsics.append(align_global2cam.astype(np.float32)) + if 'cam2img' not in info: + cam2img.append(info['images'][i]['cam2img'].astype(np.float32)) + + info['depth2img'] = dict(extrinsic=extrinsics, + intrinsic=cam2img, + origin=np.array([.0, .0, + .5]).astype(np.float32)) + + if 'depth_cam2img' not in info: + info['depth_cam2img'] = cam2img + + if not self.test_mode: + info['ann_info'] = self.parse_ann_info(info) + if self.test_mode and self.load_eval_anns: + eval_ann_info = self.parse_ann_info(info) + info['eval_ann_info'] = self._remove_dontcare(eval_ann_info) + + return info + + def parse_ann_info(self, info: dict) -> dict: + """Process the `instances` in data info to `ann_info`. + + Args: + info (dict): Info dict. + + Returns: + dict: Processed `ann_info`. + """ + + ann_info = None + if 'instances' in info and len(info['instances']) > 0: + ann_info = dict( + gt_bboxes_3d=np.zeros((len(info['instances']), 9), + dtype=np.float32), + gt_labels_3d=np.zeros((len(info['instances']), ), + dtype=np.int64), + ) + for idx, instance in enumerate(info['instances']): + ann_info['gt_bboxes_3d'][idx] = instance['bbox_3d'] + ann_info['gt_labels_3d'][idx] = self.label_mapping[ + instance['bbox_label_3d']] + + # pack ann_info for return + if ann_info is None: + ann_info = dict() + ann_info['gt_bboxes_3d'] = np.zeros((0, 9), dtype=np.float32) + ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64) + + # post-processing/filtering ann_info if not empty gt + if 'visible_instance_ids' in info['images'][0]: + ids = [] + for i in range(len(info['images'])): + ids.append(info['images'][i]['visible_instance_ids']) + mask_length = ann_info['gt_labels_3d'].shape[0] + ann_info['visible_instance_masks'] = self._ids2masks( + ids, mask_length) + + if self.remove_dontcare: + ann_info = self._remove_dontcare(ann_info) + + ann_dataset = info['sample_idx'].split('/')[0] + if ann_dataset == 'scannet': + region = info['sample_idx'].split('/')[1] + occ_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, 'scans', region, + 'occupancy', 'occupancy.npy') + mask_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, 'scans', region, + 'occupancy', 'visible_occupancy.pkl') + elif ann_dataset == '3rscan': + region = info['sample_idx'].split('/')[1] + occ_filename = os.path.join(self.data_prefix.get('img_path', + ''), ann_dataset, + region, 'occupancy', 'occupancy.npy') + mask_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, region, 'occupancy', + 'visible_occupancy.pkl') + elif ann_dataset == 'matterport3d': + building = info['sample_idx'].split('/')[1] + region = info['sample_idx'].split('/')[2] + occ_filename = os.path.join(self.data_prefix.get('img_path', ''), + 
ann_dataset, building, 'occupancy', + f'occupancy_{region}.npy') + mask_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, building, 'occupancy', + f'visible_occupancy_{region}.pkl') + else: + raise NotImplementedError + + gt_occ = np.load(occ_filename) + for i in range(gt_occ.shape[0]): + cls_id = self.occ_label_mapping[gt_occ[i][3]] + if cls_id < 0: + cls_id = 255 + gt_occ[i][3] = cls_id + ann_info['gt_occupancy'] = gt_occ + + ann_info['visible_occupancy_masks'] = [] + occ_masks = mmengine.load(mask_filename) + for i in range(len(info['images'])): + ann_info['visible_occupancy_masks'].append( + occ_masks[i]['visible_occupancy']) + + ann_info['gt_bboxes_3d'] = self.box_type_3d( + ann_info['gt_bboxes_3d'], + box_dim=ann_info['gt_bboxes_3d'].shape[-1], + with_yaw=True, + origin=(0.5, 0.5, 0.5)) + + return ann_info + + @staticmethod + def _get_axis_align_matrix(info: dict) -> np.ndarray: + """Get axis_align_matrix from info. If not exist, return identity mat. + + Args: + info (dict): Info of a single sample data. + + Returns: + np.ndarray: 4x4 transformation matrix. + """ + if 'axis_align_matrix' in info: + return np.array(info['axis_align_matrix']) + else: + warnings.warn( + 'axis_align_matrix is not found in ScanNet data info, please ' + 'use new pre-process scripts to re-generate ScanNet data') + return np.eye(4).astype(np.float32) + + def _ids2masks(self, ids, mask_length): + """Change visible_instance_ids to visible_instance_masks.""" + masks = [] + for idx in range(len(ids)): + mask = np.zeros((mask_length, ), dtype=bool) + mask[ids[idx]] = 1 + masks.append(mask) + return masks + + def _remove_dontcare(self, ann_info: dict) -> dict: + """Remove annotations that do not need to be cared. + + -1 indicates dontcare in MMDet3d. + + Args: + ann_info (dict): Dict of annotation infos. The + instance with label `-1` will be removed. + + Returns: + dict: Annotations after filtering. + """ + img_filtered_annotations = {} + filter_mask = ann_info['gt_labels_3d'] > -1 + for key in ann_info.keys(): + if key == 'instances': + img_filtered_annotations[key] = ann_info[key] + elif key == 'visible_instance_masks': + img_filtered_annotations[key] = [] + for idx in range(len(ann_info[key])): + img_filtered_annotations[key].append( + ann_info[key][idx][filter_mask]) + elif key in ['gt_occupancy', 'visible_occupancy_masks']: + pass + else: + img_filtered_annotations[key] = (ann_info[key][filter_mask]) + return img_filtered_annotations + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + If the annotation file does not follow `OpenMMLab 2.0 format dataset + `_ . + The subclass must override this method for load annotations. The meta + information of annotation file will be overwritten :attr:`METAINFO` + and ``metainfo`` argument of constructor. + + Returns: + list[dict]: A list of annotation. + """ # noqa: E501 + # `self.ann_file` denotes the absolute annotation file path if + # `self.root=None` or relative path if `self.root=/path/to/data/`. 
+ annotations = load(self.ann_file) + if not isinstance(annotations, dict): + raise TypeError(f'The annotations loaded from annotation file ' + f'should be a dict, but got {type(annotations)}!') + if 'data_list' not in annotations or 'metainfo' not in annotations: + raise ValueError('Annotation must have data_list and metainfo ' + 'keys') + metainfo = annotations['metainfo'] + raw_data_list = annotations['data_list'] + + # Meta information load from annotation file will not influence the + # existed meta information load from `BaseDataset.METAINFO` and + # `metainfo` arguments defined in constructor. + for k, v in metainfo.items(): + self._metainfo.setdefault(k, v) + + self.process_metainfo() + + # load and parse data_infos. + data_list = [] + for raw_data_info in raw_data_list: + # parse raw data information to target format + data_info = self.parse_data_info(raw_data_info) + if isinstance(data_info, dict): + # For image tasks, `data_info` should information if single + # image, such as dict(img_path='xxx', width=360, ...) + data_list.append(data_info) + elif isinstance(data_info, list): + # For video tasks, `data_info` could contain image + # information of multiple frames, such as + # [dict(video_path='xxx', timestamps=...), + # dict(video_path='xxx', timestamps=...)] + for item in data_info: + if not isinstance(item, dict): + raise TypeError('data_info must be list of dict, but ' + f'got {type(item)}') + data_list.extend(data_info) + else: + raise TypeError('data_info should be a dict or list of dict, ' + f'but got {type(data_info)}') + + return data_list diff --git a/embodiedscan/eval/__init__.py b/embodiedscan/eval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/embodiedscan/eval/det_metric.py b/embodiedscan/eval/det_metric.py new file mode 100644 index 0000000..7774dee --- /dev/null +++ b/embodiedscan/eval/det_metric.py @@ -0,0 +1,237 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +from mmdet3d.evaluation import indoor_eval +from mmdet3d.registry import METRICS +from mmdet3d.structures import get_box_type +from mmdet.evaluation import eval_map +from mmengine.dist import (broadcast_object_list, collect_results, + is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.evaluator.metric import _to_cpu +from mmengine.logging import MMLogger, print_log + + +@METRICS.register_module() +class IndoorDetMetric(BaseMetric): + """Indoor scene evaluation metric. + + Args: + iou_thr (float or List[float]): List of iou threshold when calculate + the metric. Defaults to [0.25, 0.5]. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix will + be used instead. Defaults to None. 
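+
+    Example (illustrative config snippet; the metric is registered in the
+        mmdet3d ``METRICS`` registry by this module):
+        >>> val_evaluator = dict(type='IndoorDetMetric', iou_thr=[0.25, 0.5])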
+ """ + + def __init__(self, + iou_thr: List[float] = [0.25, 0.5], + collect_device: str = 'cpu', + prefix: Optional[str] = None, + batchwise_anns: bool = False, + **kwargs) -> None: + super(IndoorDetMetric, self).__init__(prefix=prefix, + collect_device=collect_device) + self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr + self.batchwise_anns = batchwise_anns + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for data_sample in data_samples: + pred_3d = data_sample['pred_instances_3d'] + eval_ann_info = data_sample['eval_ann_info'] + cpu_pred_3d = dict() + for k, v in pred_3d.items(): + if hasattr(v, 'to'): + cpu_pred_3d[k] = v.to('cpu') + else: + cpu_pred_3d[k] = v + self.results.append((eval_ann_info, cpu_pred_3d)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + ann_infos = [] + pred_results = [] + + for eval_ann, sinlge_pred_results in results: + ann_infos.append(eval_ann) + pred_results.append(sinlge_pred_results) + + # some checkpoints may not record the key "box_type_3d" + box_type_3d, box_mode_3d = get_box_type( + self.dataset_meta.get('box_type_3d', 'depth')) + + ret_dict = indoor_eval(ann_infos, + pred_results, + self.iou_thr, + self.dataset_meta['classes'], + logger=logger, + box_mode_3d=box_mode_3d, + classes_split=self.dataset_meta.get( + 'classes_split', None)) + + return ret_dict + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. When batch + size > 1, the dataloader may pad some data samples to make + sure all ranks have the same length of dataset slice. The + ``collect_results`` function will drop the padded data based on + this size. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + print_log( + f'{self.__class__.__name__} got empty `self.results`. 
Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.', + logger='current', + level=logging.WARNING) + + if self.batchwise_anns: + # the actual dataset length/size is the len(self.results) + if self.collect_device == 'cpu': + results = collect_results(self.results, + len(self.results), + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, len(self.results), + self.collect_device) + else: + if self.collect_device == 'cpu': + results = collect_results(self.results, + size, + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, size, + self.collect_device) + + if is_main_process(): + # cast all tensors in results list to cpu + results = _to_cpu(results) + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] + + +@METRICS.register_module() +class Indoor2DMetric(BaseMetric): + """indoor 2d predictions evaluation metric. + + Args: + iou_thr (float or List[float]): List of iou threshold when calculate + the metric. Defaults to [0.5]. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix will + be used instead. Defaults to None. + """ + + def __init__(self, + iou_thr: Union[float, List[float]] = [0.5], + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super(Indoor2DMetric, self).__init__(prefix=prefix, + collect_device=collect_device) + self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for data_sample in data_samples: + pred = data_sample['pred_instances'] + eval_ann_info = data_sample['eval_ann_info'] + ann = dict(labels=eval_ann_info['gt_bboxes_labels'], + bboxes=eval_ann_info['gt_bboxes']) + + pred_bboxes = pred['bboxes'].cpu().numpy() + pred_scores = pred['scores'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + + dets = [] + for label in range(len(self.dataset_meta['classes'])): + index = np.where(pred_labels == label)[0] + pred_bbox_scores = np.hstack( + [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]) + dets.append(pred_bbox_scores) + + self.results.append((ann, dets)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
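+
+            For instance, with ``iou_thr=[0.5]`` the result takes the form
+            ``{'mAP_0.5': 0.62}`` (the value is purely illustrative).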
+ """ + logger: MMLogger = MMLogger.get_current_instance() + annotations, preds = zip(*results) + eval_results = OrderedDict() + for iou_thr_2d_single in self.iou_thr: + mean_ap, _ = eval_map(preds, + annotations, + scale_ranges=None, + iou_thr=iou_thr_2d_single, + dataset=self.dataset_meta['classes'], + logger=logger) + eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap + return eval_results diff --git a/embodiedscan/eval/indoor_eval.py b/embodiedscan/eval/indoor_eval.py new file mode 100644 index 0000000..9b57810 --- /dev/null +++ b/embodiedscan/eval/indoor_eval.py @@ -0,0 +1,377 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmengine.logging import print_log +from terminaltables import AsciiTable + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) + or (num_dets, ). + precisions (np.ndarray): Precisions with shape of + (num_scales, num_dets) or (num_dets, ). + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or np.ndarray: Calculated average precision. + """ + if recalls.ndim == 1: + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + + assert recalls.shape == precisions.shape + assert recalls.ndim == 2 + + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + return ap + + +def eval_det_cls(pred, gt, iou_thr=None): + """Generic functions to compute precision/recall for object detection for a + single class. + + Args: + pred (dict): Predictions mapping from image id to bounding boxes + and scores. + gt (dict): Ground truths mapping from image id to bounding boxes. + iou_thr (list[float]): A list of iou thresholds. + + Return: + tuple (np.ndarray, np.ndarray, float): Recalls, precisions and + average precision. 
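+
+    Example of the expected layout (illustrative; ``box_a``/``box_b`` stand for
+        3D box structures such as ``EulerDepthInstance3DBoxes``):
+        >>> pred = {0: [(box_a, 0.9), (box_b, 0.4)]}  # img_id -> [(box, score)]
+        >>> gt = {0: [box_a]}                         # img_id -> [box, ...]
+        >>> recall, precision, ap = eval_det_cls(pred, gt, iou_thr=[0.25])[0]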
+ """ + + # {img_id: {'bbox': box structure, 'det': matched list}} + class_recs = {} + npos = 0 + # figure out the bbox code size first + gt_bbox_code_size = 9 + pred_bbox_code_size = 9 + for img_id in gt.keys(): + if len(gt[img_id]) != 0: + gt_bbox_code_size = gt[img_id][0].tensor.shape[1] + break + for img_id in pred.keys(): + if len(pred[img_id][0]) != 0: + pred_bbox_code_size = pred[img_id][0][0].tensor.shape[1] + break + assert gt_bbox_code_size == pred_bbox_code_size + for img_id in gt.keys(): + cur_gt_num = len(gt[img_id]) + if cur_gt_num != 0: + gt_cur = torch.zeros([cur_gt_num, gt_bbox_code_size], + dtype=torch.float32) + for i in range(cur_gt_num): + gt_cur[i] = gt[img_id][i].tensor + bbox = gt[img_id][0].new_box(gt_cur) + else: + bbox = gt[img_id] + det = [[False] * len(bbox) for i in iou_thr] + npos += len(bbox) + class_recs[img_id] = {'bbox': bbox, 'det': det} + + # construct dets + image_ids = [] + confidence = [] + ious = [] + for img_id in pred.keys(): + cur_num = len(pred[img_id]) + if cur_num == 0: + continue + pred_cur = torch.zeros((cur_num, pred_bbox_code_size), + dtype=torch.float32) + box_idx = 0 + for box, score in pred[img_id]: + image_ids.append(img_id) + confidence.append(score) + # handle outlier (too thin) predicted boxes + w, l, h = box.tensor[0, 3:6] + faces = [w * l, w * h, h * l] + if torch.any(box.tensor.new_tensor(faces) < 2e-4): + print('Find small predicted boxes,', + 'and clamp short edges to 2e-2 meters.') + box.tensor[:, 3:6] = torch.clamp(box.tensor[:, 3:6], min=2e-2) + pred_cur[box_idx] = box.tensor + box_idx += 1 + pred_cur = box.new_box(pred_cur) + gt_cur = class_recs[img_id]['bbox'] + if len(gt_cur) > 0: + # calculate iou in each image + iou_cur = pred_cur.overlaps(pred_cur, gt_cur) + for i in range(cur_num): + ious.append(iou_cur[i]) + else: + for i in range(cur_num): + ious.append(np.zeros(1)) + + confidence = np.array(confidence) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + image_ids = [image_ids[x] for x in sorted_ind] + ious = [ious[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + num_images = len(image_ids) + tp_thr = [np.zeros(num_images) for i in iou_thr] + fp_thr = [np.zeros(num_images) for i in iou_thr] + for d in range(num_images): + R = class_recs[image_ids[d]] + iou_max = -np.inf + BBGT = R['bbox'] + cur_iou = ious[d] + + if len(BBGT) > 0: + # compute overlaps + for j in range(len(BBGT)): + # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...])) + iou = cur_iou[j] + if iou > iou_max: + iou_max = iou + jmax = j + + for iou_idx, thresh in enumerate(iou_thr): + if iou_max > thresh: + if not R['det'][iou_idx][jmax]: + tp_thr[iou_idx][d] = 1. + R['det'][iou_idx][jmax] = 1 + else: + fp_thr[iou_idx][d] = 1. + else: + fp_thr[iou_idx][d] = 1. + + ret = [] + for iou_idx, thresh in enumerate(iou_thr): + # compute precision recall + fp = np.cumsum(fp_thr[iou_idx]) + tp = np.cumsum(tp_thr[iou_idx]) + recall = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = average_precision(recall, precision) + ret.append((recall, precision, ap)) + + return ret + + +def eval_map_recall(pred, gt, ovthresh=None): + """Evaluate mAP and recall. + + Generic functions to compute precision/recall for object detection + for multiple classes. + + Args: + pred (dict): Information of detection results, + which maps class_id and predictions. 
+ gt (dict): Information of ground truths, which maps class_id and + ground truths. + ovthresh (list[float], optional): iou threshold. Default: None. + + Return: + tuple[dict]: dict results of recall, AP, and precision for all classes. + """ + + ret_values = {} + for classname in gt.keys(): + if classname in pred: + ret_values[classname] = eval_det_cls(pred[classname], + gt[classname], ovthresh) + recall = [{} for i in ovthresh] + precision = [{} for i in ovthresh] + ap = [{} for i in ovthresh] + + for label in gt.keys(): + for iou_idx, thresh in enumerate(ovthresh): + if label in pred: + recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][ + label] = ret_values[label][iou_idx] + else: + recall[iou_idx][label] = np.zeros(1) + precision[iou_idx][label] = np.zeros(1) + ap[iou_idx][label] = np.zeros(1) + + return recall, precision, ap + + +def indoor_eval(gt_annos, + dt_annos, + metric, + label2cat, + logger=None, + box_mode_3d=None, + classes_split=None): + """Indoor Evaluation. + + Evaluate the result of the detection. + + Args: + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection annotations. the dict + includes the following keys + + - labels_3d (torch.Tensor): Labels of boxes. + - bboxes_3d (:obj:`BaseInstance3DBoxes`): + 3D bounding boxes in Depth coordinate. + - scores_3d (torch.Tensor): Scores of boxes. + metric (list[float]): IoU thresholds for computing average precisions. + label2cat (tuple): Map from label to category. + logger (logging.Logger | str, optional): The way to print the mAP + summary. See `mmdet.utils.print_log()` for details. Default: None. + + Return: + dict[str, float]: Dict of results. + """ + assert len(dt_annos) == len(gt_annos) + pred = {} # map {class_id: pred} + gt = {} # map {class_id: gt} + for img_id in range(len(dt_annos)): + # parse detected annotations + det_anno = dt_annos[img_id] + for i in range(len(det_anno['labels_3d'])): + label = det_anno['labels_3d'].numpy()[i] + bbox = det_anno['bboxes_3d'].convert_to(box_mode_3d)[i] + score = det_anno['scores_3d'].numpy()[i] + if label not in pred: + pred[int(label)] = {} + if img_id not in pred[label]: + pred[int(label)][img_id] = [] + if label not in gt: + gt[int(label)] = {} + if img_id not in gt[label]: + gt[int(label)][img_id] = [] + pred[int(label)][img_id].append((bbox, score)) + + # parse gt annotations + gt_anno = gt_annos[img_id] + + gt_boxes = gt_anno['gt_bboxes_3d'] + labels_3d = gt_anno['gt_labels_3d'] + + for i in range(len(labels_3d)): + label = labels_3d[i] + bbox = gt_boxes[i] + if label not in gt: + gt[label] = {} + if img_id not in gt[label]: + gt[label][img_id] = [] + gt[label][img_id].append(bbox) + + rec, prec, ap = eval_map_recall(pred, gt, metric) + + # filter nan results + ori_keys = list(ap[0].keys()) + for key in ori_keys: + if np.isnan(ap[0][key][0]): + for r in rec: + del r[key] + for p in prec: + del p[key] + for a in ap: + del a[key] + + ret_dict = dict() + header = ['classes'] + table_columns = [[label2cat[label] + for label in ap[0].keys()] + ['Overall']] + + for i, iou_thresh in enumerate(metric): + header.append(f'AP_{iou_thresh:.2f}') + header.append(f'AR_{iou_thresh:.2f}') + rec_list = [] + for label in ap[i].keys(): + ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float( + ap[i][label][0]) + ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list( + ap[i].values()))) + + table_columns.append(list(map(float, list(ap[i].values())))) + table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' 
for x in table_columns[-1]] + + for label in rec[i].keys(): + ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float( + rec[i][label][-1]) + rec_list.append(rec[i][label][-1]) + ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list)) + + table_columns.append(list(map(float, rec_list))) + table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + if classes_split is not None: + splits = ['head', 'common', 'tail'] + for idx in range(len(splits)): + header = [f'{splits[idx]}_classes'] + # init the category list/column + cat_list = [] + for label in classes_split[idx]: + if label in ap[0]: + cat_list.append(label2cat[label]) + table_columns = [cat_list + ['Overall']] + + for i, iou_thresh in enumerate(metric): + header.append(f'AP_{iou_thresh:.2f}') + header.append(f'AR_{iou_thresh:.2f}') + ap_list = [] + for label in classes_split[idx]: + if label in ap[i]: + ap_list.append(float(ap[i][label][0])) + mean_ap = float(np.mean(ap_list)) + + table_columns.append(list(map(float, ap_list))) + table_columns[-1] += [mean_ap] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + rec_list = [] + for label in classes_split[idx]: + if label in rec[i]: + rec_list.append(rec[i][label][-1]) + mean_rec = float(np.mean(rec_list)) + + table_columns.append(list(map(float, rec_list))) + table_columns[-1] += [mean_rec] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + return ret_dict diff --git a/embodiedscan/eval/occupancy_metric.py b/embodiedscan/eval/occupancy_metric.py new file mode 100644 index 0000000..3366479 --- /dev/null +++ b/embodiedscan/eval/occupancy_metric.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from typing import Dict, Optional, Sequence + +import numpy as np +import torch +from mmdet3d.registry import METRICS +from mmengine.dist import (broadcast_object_list, collect_results, + is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.evaluator.metric import _to_cpu +from mmengine.logging import MMLogger, print_log +from terminaltables import AsciiTable + + +@METRICS.register_module() +class OccupancyMetric(BaseMetric): + """Indoor scene evaluation metric. + + Args: + iou_thr (list[float]): List of iou threshold when calculate the + metric. Defaults to [0.25, 0.5]. + collect_device (str, optional): Device name used for collecting + results from different ranks during distributed training. + Must be 'cpu' or 'gpu'. Defaults to 'cpu'. + prefix (str): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. 
Default: None + """ + + def __init__(self, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + batchwise_anns: bool = False, + **kwargs): + super(OccupancyMetric, self).__init__(prefix=prefix, + collect_device=collect_device) + self.batchwise_anns = batchwise_anns + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, + which will be used to compute the metrics when all batches + have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from + the model. + """ + for data_sample in data_samples: + pred_occ = data_sample['pred_occupancy'] + gt_4 = data_sample['gt_occupancy'] + gt_occ = torch.zeros_like(pred_occ) + gt_occ[gt_4[:, 0], gt_4[:, 1], gt_4[:, 2]] = gt_4[:, 3] + if 'gt_occupancy_masks' in data_sample: + gt_occ_mask = data_sample['gt_occupancy_masks'] + gt_occ[~gt_occ_mask] = 255 + self.results.append((gt_occ, pred_occ)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + num_class = len(self.dataset_meta['classes']) + 1 + score = np.zeros((num_class, 3)) + + for gt_occ, sinlge_pred_results in results: + mask = (gt_occ != 255) + for j in range(num_class): + if j == 0: # class 0 (empty) for geometry IoU + score[j][0] += ((gt_occ[mask] != 0) * + (sinlge_pred_results[mask] != 0)).sum() + score[j][1] += (gt_occ[mask] != 0).sum() + score[j][2] += (sinlge_pred_results[mask] != 0).sum() + else: + score[j][0] += ((gt_occ[mask] == j) * + (sinlge_pred_results[mask] == j)).sum() + score[j][1] += (gt_occ[mask] == j).sum() + score[j][2] += (sinlge_pred_results[mask] == j).sum() + + ret_dict = dict() + table_data = [['classes', 'IoU']] + res = [] + for i in range(num_class): + name = 'empty' + if i > 0: + name = self.dataset_meta['classes'][i - 1] + + tp = score[i, 0] + p = score[i, 1] + g = score[i, 2] + union = p + g - tp + # do not save the accuracy result if nan + if np.isnan(tp / union): + continue + ret_dict[name] = tp / union + res.append(tp / union) + table_data.append([name, f'{ret_dict[name]:.5f}']) + table_data.append(['mean', f'{sum(res)/len(res):.5f}']) + + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + return ret_dict + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. When batch + size > 1, the dataloader may pad some data samples to make + sure all ranks have the same length of dataset slice. The + ``collect_results`` function will drop the padded data based on + this size. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + print_log( + f'{self.__class__.__name__} got empty `self.results`. 
Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.', + logger='current', + level=logging.WARNING) + + if self.batchwise_anns: + # the actual dataset length/size is the len(self.results) + if self.collect_device == 'cpu': + results = collect_results(self.results, + len(self.results), + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, len(self.results), + self.collect_device) + else: + if self.collect_device == 'cpu': + results = collect_results(self.results, + size, + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, size, + self.collect_device) + + if is_main_process(): + # cast all tensors in results list to cpu + results = _to_cpu(results) + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] diff --git a/embodiedscan/explorer.py b/embodiedscan/explorer.py new file mode 100644 index 0000000..7b6e462 --- /dev/null +++ b/embodiedscan/explorer.py @@ -0,0 +1,251 @@ +import os +from typing import List, Union + +import mmengine +import numpy as np +import open3d as o3d +from utils.color_selector import ColorMap +from utils.img_drawer import ImageDrawer + +DATASETS = ['scannet', '3rscan', 'matterport3d'] + + +class EmbodiedScanExplorer: + + def __init__( + self, + dataroot: Union[dict, List], + ann_file: Union[dict, List, str], + verbose: bool = False, + color_setting: str = None, + ): + + if isinstance(ann_file, dict): + ann_file = list(ann_file.values()) + elif isinstance(ann_file, str): + ann_file = [ann_file] + self.ann_files = ann_file + + if isinstance(dataroot, str): + dataroot = [dataroot] + if isinstance(dataroot, list): + self.dataroot = dict() + for dataset in DATASETS: + self.dataroot[dataset] = None + for root in dataroot: + for dataset in DATASETS: + if dataset.lower() in root.lower(): + self.dataroot[dataset] = root + break + if isinstance(dataroot, dict): + self.dataroot = dataroot + + self.verbose = verbose + + if self.verbose: + print('Dataset root') + for dataset in DATASETS: + print(dataset, ':', self.dataroot[dataset]) + + if self.verbose: + print('Loading') + self.metainfo = None + data_list = [] + for file in self.ann_files: + data = mmengine.load(file) + if self.metainfo is None: + self.metainfo = data['metainfo'] + else: + assert self.metainfo == data['metainfo'] + data_list += data['data_list'] + + self.classes = list(self.metainfo['categories'].keys()) + self.color_selector = ColorMap(classes=self.classes, + init_file=color_setting) + self.data = [] + for data in data_list: + splits = data['sample_idx'].split('/') + data['dataset'] = splits[0] + if self.dataroot[splits[0]] is not None: + self.data.append(data) + + if self.verbose: + print('Loading complete') + + def count_scenes(self): + return len(self.data) + + def list_scenes(self): + res = [] + for scene in self.data: + res.append(scene['sample_idx']) + return res + + def scene_info(self, scene_name): + for scene in self.data: + if scene['sample_idx'] == scene_name: + if self.verbose: + print('Info of', scene_name) + print(len(scene['images']), 'images') + print(len(scene['instances']), 'boxes') + return dict(num_images=len(scene['images']), + 
num_boxes=len(scene['instances'])) + + if self.verbose: + print('No such scene') + return None + + def render_scene(self, scene_name, render_box=False): + s = scene_name.split('/') + if len(s) == 2: + dataset, region = s + else: + dataset, building, region = s + select = None + for scene in self.data: + if scene['sample_idx'] == scene_name: + select = scene + break + axis_align_matrix = select['axis_align_matrix'] + if dataset == 'scannet': + filepath = os.path.join(self.dataroot['scannet'], 'scans', region, + f'{region}_vh_clean.ply') + elif dataset == '3rscan': + filepath = os.path.join(self.dataroot['3rscan'], region, + 'mesh.refined.v2.obj') + elif dataset == 'matterport3d': + filepath = os.path.join(self.dataroot['matterport3d'], building, + 'region_segmentations', f'{region}.ply') + else: + raise NotImplementedError + + mesh = o3d.io.read_triangle_mesh(filepath, True) + mesh.transform(axis_align_matrix) + frame = o3d.geometry.TriangleMesh.create_coordinate_frame() + boxes = [] + if render_box: + for instance in select['instances']: + box = self._9dof_to_box(instance['bbox_3d'], + instance['bbox_label_3d']) + boxes.append(box) + o3d.visualization.draw_geometries([mesh, frame] + boxes) + + def render_occupancy(self, scene_name): + s = scene_name.split('/') + if len(s) == 2: + dataset, region = s + else: + dataset, building, region = s + + if dataset == 'scannet': + filepath = os.path.join(self.dataroot['scannet'], 'scans', region, + 'occupancy', 'occupancy.npy') + elif dataset == '3rscan': + filepath = os.path.join(self.dataroot['3rscan'], region, + 'occupancy', 'occupancy.npy') + elif dataset == 'matterport3d': + filepath = os.path.join(self.dataroot['matterport3d'], building, + 'occupancy', f'occupancy_{region}.npy') + else: + raise NotImplementedError + + gt_occ = np.load(filepath) + point_cloud_range = [-3.2, -3.2, -1.28 + 0.5, 3.2, 3.2, 1.28 + 0.5] + # occ_size = [40, 40, 16] + grid_size = [0.16, 0.16, 0.16] + points = np.zeros((gt_occ.shape[0], 6), dtype=float) + for i in range(gt_occ.shape[0]): + x, y, z, label_id = gt_occ[i] + label_id = int(label_id) + label = 'object' + if label_id == 0: + label = 'object' + else: + label = self.classes[label_id - 1] + color = self.color_selector.get_color(label) + color = [x / 255.0 for x in color] + points[i][:3] = [ + x * grid_size[0] + point_cloud_range[0] + grid_size[0] / 2, + y * grid_size[1] + point_cloud_range[1] + grid_size[1] / 2, + z * grid_size[2] + point_cloud_range[2] + grid_size[2] / 2 + ] + points[i][3:] = color + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + pcd.colors = o3d.utility.Vector3dVector(points[:, 3:]) + voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud( + pcd, voxel_size=grid_size[0]) + frame = o3d.geometry.TriangleMesh.create_coordinate_frame() + o3d.visualization.draw_geometries([frame, voxel_grid]) + + def render_image(self, scene_name, camera_name): + dataset = scene_name.split('/')[0] + select = None + for scene in self.data: + if scene['sample_idx'] == scene_name: + select = scene + for camera in select['images']: + img_path = camera['img_path'] + img_path = os.path.join(self.dataroot[dataset], + img_path[img_path.find('/') + 1:]) + if dataset == 'scannet': + cam_name = img_path.split('/')[-1][:-4] + elif dataset == '3rscan': + cam_name = img_path.split('/')[-1][:-10] + elif dataset == 'matterport3d': + cam_name = img_path.split('/')[-1][:-8] + img_path.split( + '/')[-1][-7:-4] + if cam_name == camera_name: + axis_align_matrix = 
select['axis_align_matrix'] + extrinsic = axis_align_matrix @ camera['cam2global'] + if 'cam2img' in camera: + intrinsic = camera['cam2img'] + else: + intrinsic = select['cam2img'] + img_drawer = ImageDrawer(img_path, verbose=self.verbose) + for i in camera['visible_instance_ids']: + instance = select['instances'][i] + box = self._9dof_to_box(instance['bbox_3d'], + instance['bbox_label_3d']) + label = self.classes[instance['bbox_label_3d'] - 1] + color = self.color_selector.get_color(label) + img_drawer.draw_box3d(box, + color, + label, + extrinsic=extrinsic, + intrinsic=intrinsic) + + img_drawer.show() + return + + print('No such camera') + return + + def _9dof_to_box(self, box, label_id): + if isinstance(box, list): + box = np.array(box) + center = box[:3].reshape(3, 1) + scale = box[3:6].reshape(3, 1) + rot = box[6:].reshape(3, 1) + rot_mat = \ + o3d.geometry.OrientedBoundingBox.get_rotation_matrix_from_zxy(rot) + geo = o3d.geometry.OrientedBoundingBox(center, rot_mat, scale) + + label = self.classes[label_id - 1] + color = self.color_selector.get_color(label) + color = [x / 255.0 for x in color] + geo.color = color + return geo + + +if __name__ == '__main__': + a = EmbodiedScanExplorer( + dataroot=['data/scannet', 'data/3rscan/', 'data/matterport3d/'], + ann_file=[ + 'data/full_10_visible/embodiedscan_infos_train_full.pkl', + 'data/full_10_visible/embodiedscan_infos_val_full.pkl' + ], + verbose=True) + print(a.list_scenes()) + print(a.count_scenes()) + a.render_image('scannet/scene0000_00', '00000') diff --git a/embodiedscan/refine_pickle.py b/embodiedscan/refine_pickle.py new file mode 100644 index 0000000..d77bf87 --- /dev/null +++ b/embodiedscan/refine_pickle.py @@ -0,0 +1,116 @@ +import json +import os +import pickle + +from tqdm import tqdm + + +def path_split(path): + s = path.split('/') + return s[0], s[2], s[3] + + +with open( + '/mnt/petrelfs/share_data/maoxiaohan/3rscan/meta_data/' + + '3rscan_mapping.json', 'r') as f: + map_3rscan = json.load(f) +back_3rscan = {v: k for k, v in map_3rscan.items()} + +with open( + '/mnt/petrelfs/share_data/maoxiaohan/matterport3d/meta_data/' + + 'scene_mapping.json', 'r') as f: + map_mp3d = json.load(f) +back_mp3d = {v: k for k, v in map_mp3d.items()} +buildings = os.listdir( + '/mnt/petrelfs/share_data/maoxiaohan/matterport3d/rename') +assert len(buildings) == len(list(back_mp3d.keys())) +max_cam = 0 +back_mp3d_cam = dict() +for building in buildings: + assert building[-5:] == '.json' + building_name = building[:-5] + with open( + os.path.join( + '/mnt/petrelfs/share_data/maoxiaohan/matterport3d/rename', + building), 'r') as f: + tmp = json.load(f) + max_cam = max(max_cam, len(list(tmp.keys()))) + back_mp3d_cam[building_name] = {v: k for k, v in tmp.items()} + +print(max_cam) + + +def mp3d_split(region, camera): + global back_mp3d + global back_mp3d_cam + x = region.find('_region') + building = region[:x] + raw_building = back_mp3d[building] + raw_region = region[x + 1:] + assert camera[-4] == '_' + raw_camera = back_mp3d_cam[raw_building][camera[:-4]] + cam_pos = camera[-3:] + return raw_building, raw_region, raw_camera, cam_pos + + +def generate(in_dir, out_dir, filename): + with open(os.path.join(in_dir, filename), 'rb') as f: + data = pickle.load(f) + + for scene in tqdm(data['data_list']): + bo = False + for img in scene['images']: + path = img['img_path'] + dataset, region, camera = path_split(path) + assert camera[-4:] == '.jpg' + camera = camera[:-4] + + if dataset == 'scannet': + img_path = path + depth_path = 
f'{dataset}/posed_images/{region}/{camera}.png' + img['depth_path'] = depth_path + if not bo: + scene['depth_cam2img'] = scene['depth2img'] + scene.pop('depth2img', None) + scene['sample_idx'] = f'scannet/{region}' + bo = True + elif dataset == '3rscan': + raw_region = back_3rscan[region] + img_path = f'{dataset}/{raw_region}/sequence/' + \ + 'frame-{camera}.color.jpg' + depth_path = f'{dataset}/{raw_region}/sequence/' + \ + 'frame-{camera}.depth.pgm' + img['img_path'] = img_path + img['depth_path'] = depth_path + if not bo: + scene['depth_cam2img'] = scene['cam2depth'] + scene.pop('cam2depth', None) + scene['sample_idx'] = f'3rscan/{raw_region}' + bo = True + elif dataset == 'matterport3d': + raw_building, raw_region, raw_camera, cam_pos = mp3d_split( + region, camera) + img_path = f'{dataset}/{raw_building}/' + \ + 'matterport_color_images/{raw_camera}_i{cam_pos}.jpg' + depth_path = f'{dataset}/{raw_building}/' + \ + 'matterport_depth_images/{raw_camera}_d{cam_pos}.png' + img['img_path'] = img_path + img['depth_path'] = depth_path + img.pop('cam2depth', None) + if not bo: + scene['sample_idx'] = \ + f'matterport3d/{raw_building}/{raw_region}' + bo = True + else: + raise NotImplementedError + + with open(os.path.join(out_dir, filename), 'wb') as f: + pickle.dump(data, f) + + +generate(in_dir='/mnt/petrelfs/share_data/wangtai/data/full_10_visible', + out_dir='./data', + filename='embodiedscan_infos_train_full.pkl') +generate(in_dir='/mnt/petrelfs/share_data/wangtai/data/full_10_visible', + out_dir='./data', + filename='embodiedscan_infos_val_full.pkl') diff --git a/embodiedscan/structures/__init__.py b/embodiedscan/structures/__init__.py new file mode 100644 index 0000000..103d611 --- /dev/null +++ b/embodiedscan/structures/__init__.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bbox_3d import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes, + Coord3DMode, DepthInstance3DBoxes, + EulerCameraInstance3DBoxes, EulerDepthInstance3DBoxes, + LiDARInstance3DBoxes, get_box_type, + get_proj_mat_by_coord_type, limit_period, + mono_cam_box2vis, points_cam2img, points_img2cam, + rotation_3d_in_axis, rotation_3d_in_euler, xywhr2xyxyr) +from .det3d_data_sample import Det3DDataSample +# yapf: disable +from .ops import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, + BboxOverlapsNearest3D, axis_aligned_bbox_overlaps_3d, + bbox3d2result, bbox3d2roi, bbox3d_mapping_back, + bbox_overlaps_3d, bbox_overlaps_nearest_3d, + box2d_to_corner_jit, box3d_to_bbox, box_camera_to_lidar, + boxes3d_to_corners3d_lidar, camera_to_lidar, + center_to_corner_box2d, center_to_corner_box3d, + center_to_minmax_2d, corner_to_standup_nd_jit, + corner_to_surfaces_3d, corner_to_surfaces_3d_jit, corners_nd, + create_anchors_3d_range, depth_to_lidar_points, + depth_to_points, get_frustum, iou_jit, minmax_to_corner_2d, + points_in_convex_polygon_3d_jit, + points_in_convex_polygon_jit, points_in_rbbox, + projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox, + remove_outside_points, rotation_points_single_angle, + surface_equ_3d) +# yapf: enable +from .point_data import PointData +from .points import BasePoints, CameraPoints, DepthPoints, LiDARPoints + +__all__ = [ + 'BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints', + 'Det3DDataSample', 'PointData', 'Box3DMode', 'BaseInstance3DBoxes', + 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', + 'EulerCameraInstance3DBoxes', 'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', + 'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img', + 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis', + 'get_proj_mat_by_coord_type', 'box2d_to_corner_jit', 'box3d_to_bbox', + 'box_camera_to_lidar', 'boxes3d_to_corners3d_lidar', 'camera_to_lidar', + 'center_to_corner_box2d', 'center_to_corner_box3d', 'center_to_minmax_2d', + 'corner_to_standup_nd_jit', 'corner_to_surfaces_3d', + 'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range', + 'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit', + 'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit', + 'points_in_convex_polygon_jit', 'points_in_rbbox', + 'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox', + 'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d', + 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', + 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', + 'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi', + 'bbox3d2result', 'rotation_3d_in_euler' +] diff --git a/embodiedscan/structures/bbox_3d/__init__.py b/embodiedscan/structures/bbox_3d/__init__.py new file mode 100644 index 0000000..a1515f8 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode +from .cam_box3d import CameraInstance3DBoxes +from .coord_3d_mode import Coord3DMode +from .depth_box3d import DepthInstance3DBoxes +from .euler_cam_box3d import EulerCameraInstance3DBoxes +from .euler_depth_box3d import EulerDepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import (batch_points_cam2img, get_box_type, + get_proj_mat_by_coord_type, limit_period, mono_cam_box2vis, + points_cam2img, points_img2cam, rotation_3d_in_axis, + rotation_3d_in_euler, xywhr2xyxyr) + +__all__ = [ + 'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes', + 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', + 'EulerCameraInstance3DBoxes', 'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', + 'get_box_type', 'rotation_3d_in_axis', 'rotation_3d_in_euler', + 'limit_period', 'points_cam2img', 'points_img2cam', 'Coord3DMode', + 'mono_cam_box2vis', 'batch_points_cam2img', 'get_proj_mat_by_coord_type' +] diff --git a/embodiedscan/structures/bbox_3d/base_box3d.py b/embodiedscan/structures/bbox_3d/base_box3d.py new file mode 100644 index 0000000..85c83ed --- /dev/null +++ b/embodiedscan/structures/bbox_3d/base_box3d.py @@ -0,0 +1,698 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import abstractmethod +from typing import Iterator, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmcv.ops import box_iou_rotated, points_in_boxes_all, points_in_boxes_part +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .utils import limit_period + + +class BaseInstance3DBoxes: + """Base class for 3D Boxes. + + Note: + The box is bottom centered, i.e. the relative position of origin in the + box is (0.5, 0.5, 0). + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes + data with shape (N, box_dim). + box_dim (int): Number of the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7. + with_yaw (bool): Whether the box is with yaw rotation. If False, the + value of yaw will be set to 0 as minmax boxes. Defaults to True. + origin (Tuple[float]): Relative position of the box origin. + Defaults to (0.5, 0.5, 0). This will guide the box be converted to + (0.5, 0.5, 0) mode. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. 
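+
+    Example:
+        >>> import torch
+        >>> # a minimal sketch with one (x, y, z, x_size, y_size, z_size, yaw)
+        >>> # box; concrete subclasses (e.g. DepthInstance3DBoxes) are
+        >>> # normally used instead of the base class
+        >>> boxes = BaseInstance3DBoxes(
+        ...     torch.tensor([[0., 0., 0., 1., 2., 1., 0.]]))
+        >>> boxes.volume
+        tensor([2.])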
+ """ + + YAW_AXIS: int = 0 + + def __init__( + self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + box_dim: int = 7, + with_yaw: bool = True, + origin: Tuple[float, float, float] = (0.5, 0.5, 0) + ) -> None: + if isinstance(tensor, Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does + # not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((-1, box_dim)) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \ + ('The box dimension must be 2 and the length of the last ' + f'dimension must be {box_dim}, but got boxes with shape ' + f'{tensor.shape}.') + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding 0 as + # a fake yaw and set with_yaw to False + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 0.5, 0): + dst = self.tensor.new_tensor((0.5, 0.5, 0)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def shape(self) -> torch.Size: + """torch.Size: Shape of boxes.""" + return self.tensor.shape + + @property + def volume(self) -> Tensor: + """Tensor: A vector with volume of each box in shape (N, ).""" + return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] + + @property + def dims(self) -> Tensor: + """Tensor: Size dimensions of each box in shape (N, 3).""" + return self.tensor[:, 3:6] + + @property + def yaw(self) -> Tensor: + """Tensor: A vector with yaw of each box in shape (N, ).""" + return self.tensor[:, 6] + + @property + def height(self) -> Tensor: + """Tensor: A vector with height of each box in shape (N, ).""" + return self.tensor[:, 5] + + @property + def top_height(self) -> Tensor: + """Tensor: A vector with top height of each box in shape (N, ).""" + return self.bottom_height + self.height + + @property + def bottom_height(self) -> Tensor: + """Tensor: A vector with bottom height of each box in shape (N, ).""" + return self.tensor[:, 2] + + @property + def center(self) -> Tensor: + """Calculate the center of all the boxes. + + Note: + In MMDetection3D's convention, the bottom center is usually taken + as the default center. + + The relative position of the centers in different kinds of boxes + are different, e.g., the relative center of a boxes is + (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is + recommended to use ``bottom_center`` or ``gravity_center`` for + clearer usage. + + Returns: + Tensor: A tensor with center of each box in shape (N, 3). 
+ """ + return self.bottom_center + + @property + def bottom_center(self) -> Tensor: + """Tensor: A tensor with center of each box in shape (N, 3).""" + return self.tensor[:, :3] + + @property + def gravity_center(self) -> Tensor: + """Tensor: A tensor with center of each box in shape (N, 3).""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self) -> Tensor: + """Tensor: A tensor with 8 corners of each box in shape (N, 8, 3).""" + pass + + @property + def bev(self) -> Tensor: + """Tensor: 2D BEV box of each box with rotation in XYWHR format, in + shape (N, 5).""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self) -> Tensor: + """Tensor: A tensor of 2D BEV box of each box without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def in_range_bev( + self, box_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the boxes are in the given range. + + Args: + box_range (Tensor or np.ndarray or Sequence[float]): The range of + box in order of (x_min, y_min, x_max, y_max). + + Note: + The original implementation of SECOND checks whether boxes in a + range by checking whether the points are in a convex polygon, we + reduce the burden for simpler cases. + + Returns: + Tensor: A binary vector indicating whether each box is inside the + reference range. + """ + in_range_flags = ((self.bev[:, 0] > box_range[0]) + & (self.bev[:, 1] > box_range[1]) + & (self.bev[:, 0] < box_range[2]) + & (self.bev[:, 1] < box_range[3])) + return in_range_flags + + @abstractmethod + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. + """ + pass + + @abstractmethod + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. 
+ + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. + """ + pass + + def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None: + """Translate boxes with the given translation vector. + + Args: + trans_vector (Tensor or np.ndarray): Translation vector of size + 1x3. + """ + if not isinstance(trans_vector, Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + self.tensor[:, :3] += trans_vector + + def in_range_3d( + self, box_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the boxes are in the given range. + + Args: + box_range (Tensor or np.ndarray or Sequence[float]): The range of + box (x_min, y_min, z_min, x_max, y_max, z_max). + + Note: + In the original implementation of SECOND, checking whether a box in + the range checks whether the points are in a convex polygon, we try + to reduce the burden for simpler cases. + + Returns: + Tensor: A binary vector indicating whether each point is inside the + reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 1] > box_range[1]) + & (self.tensor[:, 2] > box_range[2]) + & (self.tensor[:, 0] < box_range[3]) + & (self.tensor[:, 1] < box_range[4]) + & (self.tensor[:, 2] < box_range[5])) + return in_range_flags + + @abstractmethod + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + pass + + def scale(self, scale_factor: float) -> None: + """Scale the box with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the boxes. + """ + self.tensor[:, :6] *= scale_factor + self.tensor[:, 7:] *= scale_factor # velocity + + def limit_yaw(self, offset: float = 0.5, period: float = np.pi) -> None: + """Limit the yaw to a given period and offset. + + Args: + offset (float): The offset of the yaw. Defaults to 0.5. + period (float): The expected period. Defaults to np.pi. + """ + self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) + + def nonempty(self, threshold: float = 0.0) -> Tensor: + """Find boxes that are non-empty. + + A box is considered empty if either of its side is no larger than + threshold. + + Args: + threshold (float): The threshold of minimal sizes. Defaults to 0.0. + + Returns: + Tensor: A binary vector which represents whether each box is empty + (False) or non-empty (True). + """ + box = self.tensor + size_x = box[..., 3] + size_y = box[..., 4] + size_z = box[..., 5] + keep = ((size_x > threshold) + & (size_y > threshold) & (size_z > threshold)) + return keep + + def __getitem__( + self, item: Union[int, slice, np.ndarray, + Tensor]) -> 'BaseInstance3DBoxes': + """ + Args: + item (int or slice or np.ndarray or Tensor): Index of boxes. + + Note: + The following usage are allowed: + + 1. 
`new_boxes = boxes[3]`: Return a `Boxes` that contains only one + box. + 2. `new_boxes = boxes[2:10]`: Return a slice of boxes. + 3. `new_boxes = boxes[vector]`: Where vector is a + torch.BoolTensor with `length = len(boxes)`. Nonzero elements in + the vector will be selected. + + Note that the returned Boxes might share storage with this Boxes, + subject to PyTorch's indexing semantics. + + Returns: + :obj:`BaseInstance3DBoxes`: A new object of + :class:`BaseInstance3DBoxes` after indexing. + """ + original_type = type(self) + if isinstance(item, int): + return original_type(self.tensor[item].view(1, -1), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + b = self.tensor[item] + assert b.dim() == 2, \ + f'Indexing on Boxes with {item} failed to return a matrix!' + return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def __len__(self) -> int: + """int: Number of boxes in the current object.""" + return self.tensor.shape[0] + + def __repr__(self) -> str: + """str: Return a string that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, boxes_list: Sequence['BaseInstance3DBoxes'] + ) -> 'BaseInstance3DBoxes': + """Concatenate a list of Boxes into a single Boxes. + + Args: + boxes_list (Sequence[:obj:`BaseInstance3DBoxes`]): List of boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: The concatenated boxes. + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(box, cls) for box in boxes_list) + + # use torch.cat (v.s. layers.cat) + # so the returned boxes never share storage with input + cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0), + box_dim=boxes_list[0].box_dim, + with_yaw=boxes_list[0].with_yaw) + return cat_boxes + + def numpy(self) -> np.ndarray: + """Reload ``numpy`` from self.tensor.""" + return self.tensor.numpy() + + def to(self, device: Union[str, torch.device], *args, + **kwargs) -> 'BaseInstance3DBoxes': + """Convert current boxes to a specific device. + + Args: + device (str or :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the specific + device. + """ + original_type = type(self) + return original_type(self.tensor.to(device, *args, **kwargs), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def cpu(self) -> 'BaseInstance3DBoxes': + """Convert current boxes to cpu device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the cpu device. + """ + original_type = type(self) + return original_type(self.tensor.cpu(), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def cuda(self, *args, **kwargs) -> 'BaseInstance3DBoxes': + """Convert current boxes to cuda device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the cuda device. + """ + original_type = type(self) + return original_type(self.tensor.cuda(*args, **kwargs), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def clone(self) -> 'BaseInstance3DBoxes': + """Clone the boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties as + self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def detach(self) -> 'BaseInstance3DBoxes': + """Detach the boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties as + self. 
+ """ + original_type = type(self) + return original_type(self.tensor.detach(), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + @property + def device(self) -> torch.device: + """torch.device: The device of the boxes are on.""" + return self.tensor.device + + def __iter__(self) -> Iterator[Tensor]: + """Yield a box as a Tensor at a time. + + Returns: + Iterator[Tensor]: A box of shape (box_dim, ). + """ + yield from self.tensor + + @classmethod + def height_overlaps(cls, boxes1: 'BaseInstance3DBoxes', + boxes2: 'BaseInstance3DBoxes') -> Tensor: + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + + Returns: + Tensor: Calculated height overlap of the boxes. + """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), \ + '"boxes1" and "boxes2" should be in the same type, ' \ + f'but got {type(boxes1)} and {type(boxes2)}.' + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + heighest_of_bottom = torch.max(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) + return overlaps_h + + @classmethod + def overlaps(cls, + boxes1: 'BaseInstance3DBoxes', + boxes2: 'BaseInstance3DBoxes', + mode: str = 'iou') -> Tensor: + """Calculate 3D overlaps of two boxes. + + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str): Mode of iou calculation. Defaults to 'iou'. + + Returns: + Tensor: Calculated 3D overlap of the boxes. + """ + raise NotImplementedError + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), \ + '"boxes1" and "boxes2" should be in the same type, ' \ + f'but got {type(boxes1)} and {type(boxes2)}.' + + assert mode in ['iou', 'iof'] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + # height overlap + overlaps_h = cls.height_overlaps(boxes1, boxes2) + + # Restrict the min values of W and H to avoid memory overflow in + # ``box_iou_rotated``. 
+ boxes1_bev, boxes2_bev = boxes1.bev, boxes2.bev + boxes1_bev[:, 2:4] = boxes1_bev[:, 2:4].clamp(min=1e-4) + boxes2_bev[:, 2:4] = boxes2_bev[:, 2:4].clamp(min=1e-4) + + # bev overlap + iou2d = box_iou_rotated(boxes1_bev, boxes2_bev) + areas1 = (boxes1_bev[:, 2] * boxes1_bev[:, 3]).unsqueeze(1).expand( + rows, cols) + areas2 = (boxes2_bev[:, 2] * boxes2_bev[:, 3]).unsqueeze(0).expand( + rows, cols) + overlaps_bev = iou2d * (areas1 + areas2) / (1 + iou2d) + + # 3d overlaps + overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h + + volume1 = boxes1.volume.view(-1, 1) + volume2 = boxes2.volume.view(1, -1) + + if mode == 'iou': + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp(volume1 + volume2 - overlaps_3d, + min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) + + return iou3d + + def new_box( + self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]] + ) -> 'BaseInstance3DBoxes': + """Create a new box object with data. + + The new box and its tensor has the similar properties as self and + self.tensor, respectively. + + Args: + data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to + be copied. + + Returns: + :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the + object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, Tensor) else data.to(self.device) + original_type = type(self) + return original_type(new_tensor, + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def points_in_boxes_part( + self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find the box in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Note: + If a point is enclosed by multiple boxes, the index of the first + box will be returned. + + Returns: + Tensor: The index of the first box that each point is in with shape + (M, ). Default value is -1 (if the point is not enclosed by any + box). + """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + + points_clone = points.clone()[..., :3] + if points_clone.dim() == 2: + points_clone = points_clone.unsqueeze(0) + else: + assert points_clone.dim() == 3 and points_clone.shape[0] == 1 + + boxes = boxes.to(points_clone.device).unsqueeze(0) + box_idx = points_in_boxes_part(points_clone, boxes) + + return box_idx.squeeze(0) + + def points_in_boxes_all(self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find all boxes in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Returns: + Tensor: A tensor indicating whether a point is in a box with shape + (M, T). T is the number of boxes. Denote this tensor as A, it the + m^th point is in the t^th box, then `A[m, t] == 1`, otherwise + `A[m, t] == 0`. 
+ """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + + points_clone = points.clone()[..., :3] + if points_clone.dim() == 2: + points_clone = points_clone.unsqueeze(0) + else: + assert points_clone.dim() == 3 and points_clone.shape[0] == 1 + + boxes = boxes.to(points_clone.device).unsqueeze(0) + box_idxs_of_pts = points_in_boxes_all(points_clone, boxes) + + return box_idxs_of_pts.squeeze(0) + + def points_in_boxes(self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + warnings.warn('DeprecationWarning: points_in_boxes is a deprecated ' + 'method, please consider using points_in_boxes_part.') + return self.points_in_boxes_part(points, boxes_override) + + def points_in_boxes_batch( + self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + warnings.warn('DeprecationWarning: points_in_boxes_batch is a ' + 'deprecated method, please consider using ' + 'points_in_boxes_all.') + return self.points_in_boxes_all(points, boxes_override) diff --git a/embodiedscan/structures/bbox_3d/box_3d_mode.py b/embodiedscan/structures/bbox_3d/box_3d_mode.py new file mode 100644 index 0000000..1c70e30 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/box_3d_mode.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import IntEnum, unique +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import limit_period + + +@unique +class Box3DMode(IntEnum): + """Enum of different ways to represent a box. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in Camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + EULER_CAM = 3 + EULER_DEPTH = 4 + + @staticmethod + def convert( + box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes], + src: 'Box3DMode', + dst: 'Box3DMode', + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + with_yaw: bool = True, + correct_yaw: bool = False + ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]: + """Convert boxes from ``src`` mode to ``dst`` mode. + + Args: + box (Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk + array/tensor. + src (:obj:`Box3DMode`): The source box mode. + dst (:obj:`Box3DMode`): The target box mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. 
+ with_yaw (bool): If ``box`` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + correct_yaw (bool): If the yaw is rotated by rt_mat. + Defaults to False. + + Returns: + Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`: The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + 'Box3DMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 7') + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + if is_Instance3DBoxes: + with_yaw = box.with_yaw + + # convert box from `src` mode to `dst` mode. + x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if with_yaw: + yaw = arr[..., 6:7] + if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(-yaw), + torch.zeros_like(yaw), + torch.sin(-yaw) + ], + dim=1) + else: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = -yaw + elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(-yaw), + torch.zeros_like(yaw), + torch.sin(-yaw) + ], + dim=1) + else: + yaw = -yaw + elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = yaw + np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + else: # TODO: add transformation between euler boxes + raise NotImplementedError( + f'Conversion from Box3DMode {src} to {dst} ' 
+ 'is not supported yet') + + if not isinstance(rt_mat, Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[..., :3] @ rt_mat.t() + + # Note: we only use rotation in rt_mat + # so don't need to extend yaw_vector + if with_yaw and correct_yaw: + rot_yaw_vector = yaw_vector @ rt_mat[:3, :3].t() + if dst == Box3DMode.CAM: + yaw = torch.atan2(-rot_yaw_vector[:, [2]], rot_yaw_vector[:, + [0]]) + elif dst in [Box3DMode.LIDAR, Box3DMode.DEPTH]: + yaw = torch.atan2(rot_yaw_vector[:, [1]], rot_yaw_vector[:, + [0]]) + yaw = limit_period(yaw, period=np.pi * 2) + + if with_yaw: + remains = arr[..., 7:] + arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1) + else: + remains = arr[..., 6:] + arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Box3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Box3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Box3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type} ' + 'is not supported yet') + return target_type(arr, box_dim=arr.size(-1), with_yaw=with_yaw) + else: + return arr diff --git a/embodiedscan/structures/bbox_3d/cam_box3d.py b/embodiedscan/structures/bbox_3d/cam_box3d.py new file mode 100644 index 0000000..106f3f8 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/cam_box3d.py @@ -0,0 +1,403 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis, yaw2local + + +class CameraInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in CAM coordinates. + + Coordinates in Camera: + + .. code-block:: none + + z front (yaw=-0.5*pi) + / + / + 0 ------> x right (yaw=0) + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. The yaw is 0 at + the positive direction of x axis, and decreases from the positive direction + of x to the positive direction of z. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes + data with shape (N, box_dim). + box_dim (int): Number of the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7. + with_yaw (bool): Whether the box is with yaw rotation. If False, the + value of yaw will be set to 0 as minmax boxes. Defaults to True. + origin (Tuple[float]): Relative position of the box origin. + Defaults to (0.5, 1.0, 0.5). This will guide the box be converted + to (0.5, 1.0, 0.5) mode. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. 
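+
+    Example:
+        >>> import torch
+        >>> # a minimal sketch: a gravity-centered input box is shifted by
+        >>> # the constructor to the default (0.5, 1.0, 0.5) relative origin
+        >>> boxes = CameraInstance3DBoxes(
+        ...     torch.tensor([[0., 0., 5., 1., 2., 1., 0.]]),
+        ...     origin=(0.5, 0.5, 0.5))
+        >>> boxes.bottom_center
+        tensor([[0., 1., 5.]])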
+ """ + YAW_AXIS = 1 + + def __init__( + self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + box_dim: int = 7, + with_yaw: bool = True, + origin: Tuple[float, float, float] = (0.5, 1.0, 0.5) + ) -> None: + if isinstance(tensor, Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does + # not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((-1, box_dim)) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \ + ('The box dimension must be 2 and the length of the last ' + f'dimension must be {box_dim}, but got boxes with shape ' + f'{tensor.shape}.') + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding 0 as + # a fake yaw and set with_yaw to False + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 1.0, 0.5): + dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def height(self) -> Tensor: + """Tensor: A vector with height of each box in shape (N, ).""" + return self.tensor[:, 4] + + @property + def top_height(self) -> Tensor: + """Tensor: A vector with top height of each box in shape (N, ).""" + # the positive direction is down rather than up + return self.bottom_height - self.height + + @property + def bottom_height(self) -> Tensor: + """Tensor: A vector with bottom height of each box in shape (N, ).""" + return self.tensor[:, 1] + + @property + def local_yaw(self) -> Tensor: + """Tensor: A vector with local yaw of each box in shape (N, ). + local_yaw equals to alpha in kitti, which is commonly used in monocular + 3D object detection task, so only :obj:`CameraInstance3DBoxes` has the + property.""" + yaw = self.yaw + loc = self.gravity_center + local_yaw = yaw2local(yaw, loc) + + return local_yaw + + @property + def gravity_center(self) -> Tensor: + """Tensor: A tensor with center of each box in shape (N, 3).""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] + gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 + return gravity_center + + @property + def corners(self) -> Tensor: + """Convert boxes to corners in clockwise order, in the form of (x0y0z0, + x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0). + + .. code-block:: none + + front z + / + / + (x0, y0, z1) + ----------- + (x1, y0, z1) + /| / | + / | / | + (x0, y0, z0) + ----------- + + (x1, y1, z1) + | / . | / + | / origin | / + (x0, y1, z0) + ----------- + -------> right x + | (x1, y1, z0) + | + v + down y + + Returns: + Tensor: A tensor with 8 corners of each box in shape (N, 8, 3). 
+ """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 1, 0.5) + corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + corners = rotation_3d_in_axis(corners, + self.tensor[:, 6], + axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self) -> Tensor: + """Tensor: 2D BEV box of each box with rotation in XYWHR format, in + shape (N, 5).""" + bev = self.tensor[:, [0, 2, 3, 5, 6]].clone() + # positive direction of the gravity axis + # in cam coord system points to the earth + # so the bev yaw angle needs to be reversed + bev[:, -1] = -bev[:, -1] + return bev + + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. + """ + if not isinstance(angle, Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[2, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + self.tensor[:, 6] += angle + + if points is not None: + if isinstance(points, Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. 
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 2::7] = -self.tensor[:, 2::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (Tensor, np.ndarray, BasePoints)) + if isinstance(points, (Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 0] = -points[:, 0] + elif bev_direction == 'vertical': + points[:, 2] = -points[:, 2] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + @classmethod + def height_overlaps(cls, boxes1: 'CameraInstance3DBoxes', + boxes2: 'CameraInstance3DBoxes') -> Tensor: + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. + + Returns: + Tensor: Calculated height overlap of the boxes. + """ + assert isinstance(boxes1, CameraInstance3DBoxes) + assert isinstance(boxes2, CameraInstance3DBoxes) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + # positive direction of the gravity axis + # in cam coord system points to the earth + heighest_of_bottom = torch.min(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) + return overlaps_h + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + + # TODO: always set correct_yaw=True + return Box3DMode.convert(box=self, + src=Box3DMode.CAM, + dst=dst, + rt_mat=rt_mat, + correct_yaw=correct_yaw) + + def points_in_boxes_part( + self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find the box in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Returns: + Tensor: The index of the first box that each point is in with shape + (M, ). Default value is -1 (if the point is not enclosed by any + box). 
+ """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, + Coord3DMode.CAM, + Coord3DMode.LIDAR, + is_point=False) + + box_idx = super().points_in_boxes_part(points_lidar, boxes_lidar) + return box_idx + + def points_in_boxes_all(self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find all boxes in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Returns: + Tensor: The index of all boxes in which each point is with shape + (M, T). + """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, + Coord3DMode.CAM, + Coord3DMode.LIDAR, + is_point=False) + + box_idx = super().points_in_boxes_all(points_lidar, boxes_lidar) + return box_idx diff --git a/embodiedscan/structures/bbox_3d/coord_3d_mode.py b/embodiedscan/structures/bbox_3d/coord_3d_mode.py new file mode 100644 index 0000000..f10bb1a --- /dev/null +++ b/embodiedscan/structures/bbox_3d/coord_3d_mode.py @@ -0,0 +1,271 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import IntEnum, unique +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from mmdet3d.structures.points import (BasePoints, CameraPoints, DepthPoints, + LiDARPoints) +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode + + +@unique +class Coord3DMode(IntEnum): + """Enum of different ways to represent a box and point cloud. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in Camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(input: Union[Sequence[float], np.ndarray, Tensor, + BaseInstance3DBoxes, BasePoints], + src: Union[Box3DMode, 'Coord3DMode'], + dst: Union[Box3DMode, 'Coord3DMode'], + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + with_yaw: bool = True, + correct_yaw: bool = False, + is_point: bool = True): + """Convert boxes or points from ``src`` mode to ``dst`` mode. + + Args: + input (Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`): Can be a + k-tuple, k-list or an Nxk array/tensor. + src (:obj:`Box3DMode` or :obj:`Coord3DMode`): The source mode. + dst (:obj:`Box3DMode` or :obj:`Coord3DMode`): The target mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. 
+ Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + with_yaw (bool): If ``box`` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + correct_yaw (bool): If the yaw is rotated by rt_mat. + Defaults to False. + is_point (bool): If ``input`` is neither an instance of + :obj:`BaseInstance3DBoxes` nor an instance of + :obj:`BasePoints`, whether or not it is point data. + Defaults to True. + + Returns: + Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`: The converted box + or points of the same type. + """ + if isinstance(input, BaseInstance3DBoxes): + return Coord3DMode.convert_box(input, + src, + dst, + rt_mat=rt_mat, + with_yaw=with_yaw, + correct_yaw=correct_yaw) + elif isinstance(input, BasePoints): + return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) + elif isinstance(input, (tuple, list, np.ndarray, Tensor)): + if is_point: + return Coord3DMode.convert_point(input, + src, + dst, + rt_mat=rt_mat) + else: + return Coord3DMode.convert_box(input, + src, + dst, + rt_mat=rt_mat, + with_yaw=with_yaw, + correct_yaw=correct_yaw) + else: + raise NotImplementedError + + @staticmethod + def convert_box( + box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes], + src: Box3DMode, + dst: Box3DMode, + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + with_yaw: bool = True, + correct_yaw: bool = False + ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]: + """Convert boxes from ``src`` mode to ``dst`` mode. + + Args: + box (Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk + array/tensor. + src (:obj:`Box3DMode`): The source box mode. + dst (:obj:`Box3DMode`): The target box mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + with_yaw (bool): If ``box`` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + correct_yaw (bool): If the yaw is rotated by rt_mat. + Defaults to False. + + Returns: + Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`: The converted box of the same type. + """ + return Box3DMode.convert(box, + src, + dst, + rt_mat=rt_mat, + with_yaw=with_yaw, + correct_yaw=correct_yaw) + + @staticmethod + def convert_point( + point: Union[Sequence[float], np.ndarray, Tensor, BasePoints], + src: 'Coord3DMode', + dst: 'Coord3DMode', + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + ) -> Union[Sequence[float], np.ndarray, Tensor, BasePoints]: + """Convert points from ``src`` mode to ``dst`` mode. + + Args: + box (Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`): + Can be a k-tuple, k-list or an Nxk array/tensor. + src (:obj:`Coord3DMode`): The source point mode. + dst (:obj:`Coord3DMode`): The target point mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. 
This requires a transformation + matrix. + + Returns: + Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`: The + converted point of the same type. + """ + if src == dst: + return point + + is_numpy = isinstance(point, np.ndarray) + is_InstancePoints = isinstance(point, BasePoints) + single_point = isinstance(point, (list, tuple)) + if single_point: + assert len(point) >= 3, ( + 'Coord3DMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 3') + arr = torch.tensor(point)[None, :] + else: + # avoid modifying the input point + if is_numpy: + arr = torch.from_numpy(np.asarray(point)).clone() + elif is_InstancePoints: + arr = point.tensor.clone() + else: + arr = point.clone() + + # convert point from `src` mode to `dst` mode. + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + else: + raise NotImplementedError( + f'Conversion from Coord3DMode {src} to {dst} ' + 'is not supported yet') + + if not isinstance(rt_mat, Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[..., :3] @ rt_mat.t() + + remains = arr[..., 3:] + arr = torch.cat([xyz[..., :3], remains], dim=-1) + + # convert arr to the original type + original_type = type(point) + if single_point: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_InstancePoints: + if dst == Coord3DMode.CAM: + target_type = CameraPoints + elif dst == Coord3DMode.LIDAR: + target_type = LiDARPoints + elif dst == Coord3DMode.DEPTH: + target_type = DepthPoints + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type} ' + 'is not supported yet') + return target_type(arr, + points_dim=arr.size(-1), + attribute_dims=point.attribute_dims) + else: + return arr diff --git a/embodiedscan/structures/bbox_3d/depth_box3d.py b/embodiedscan/structures/bbox_3d/depth_box3d.py new file mode 100644 index 0000000..bf2f4c1 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/depth_box3d.py @@ -0,0 +1,282 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class DepthInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in DEPTH coordinates. + + Coordinates in Depth: + + .. 
code-block:: none + + up z y front (yaw=0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (yaw=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at + the positive direction of x axis, and increases from the positive direction + of x to the positive direction of y. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + YAW_AXIS = 2 + + @property + def corners(self) -> Tensor: + """Convert boxes to corners in clockwise order, in the form of (x0y0z0, + x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0). + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + + Returns: + Tensor: A tensor with 8 corners of each box in shape (N, 8, 3). + """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, + self.tensor[:, 6], + axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. 
+ """ + if not isinstance(angle, Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + if self.with_yaw: + self.tensor[:, 6] += angle + else: + # for axis-aligned boxes, we take the new + # enclosing axis-aligned boxes after rotation + corners_rot = self.corners @ rot_mat_T + new_x_size = corners_rot[..., 0].max( + dim=1, keepdim=True)[0] - corners_rot[..., 0].min( + dim=1, keepdim=True)[0] + new_y_size = corners_rot[..., 1].max( + dim=1, keepdim=True)[0] - corners_rot[..., 1].min( + dim=1, keepdim=True)[0] + self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) + + if points is not None: + if isinstance(points, Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips the x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (Tensor, np.ndarray, BasePoints)) + if isinstance(points, (Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 0] = -points[:, 0] + elif bev_direction == 'vertical': + points[:, 1] = -points[:, 1] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. 
+ + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert(box=self, + src=Box3DMode.DEPTH, + dst=dst, + rt_mat=rt_mat, + correct_yaw=correct_yaw) + + def enlarged_box( + self, extra_width: Union[float, Tensor]) -> 'DepthInstance3DBoxes': + """Enlarge the length, width and height of boxes. + + Args: + extra_width (float or Tensor): Extra width to enlarge the box. + + Returns: + :obj:`DepthInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def get_surface_line_center(self) -> Tuple[Tensor, Tensor]: + """Compute surface and line center of bounding boxes. + + Returns: + Tuple[Tensor, Tensor]: Surface and line center of bounding boxes. + """ + obj_size = self.dims + center = self.gravity_center.view(-1, 1, 3) + batch_size = center.shape[0] + + rot_sin = torch.sin(-self.yaw) + rot_cos = torch.cos(-self.yaw) + rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) + rot_mat_T[..., 0, 0] = rot_cos + rot_mat_T[..., 0, 1] = -rot_sin + rot_mat_T[..., 1, 0] = rot_sin + rot_mat_T[..., 1, 1] = rot_cos + rot_mat_T[..., 2, 2] = 1 + + # Get the object surface center + offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], + [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) + offset = offset.view(1, 6, 3) / 2 + surface_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( + -1, 3) + + # Get the object line center + offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], + [0, -1, 1], [1, 0, -1], [-1, 0, -1], + [0, 1, -1], [0, -1, -1], [1, 1, 0], + [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) + offset = offset.view(1, 12, 3) / 2 + + line_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( + -1, 3) + + surface_rot = rot_mat_T.repeat(6, 1, 1) + surface_3d = torch.matmul(surface_3d.unsqueeze(-2), + surface_rot).squeeze(-2) + surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d + + line_rot = rot_mat_T.repeat(12, 1, 1) + line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) + line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d + + return surface_center, line_center diff --git a/embodiedscan/structures/bbox_3d/euler_box3d.py b/embodiedscan/structures/bbox_3d/euler_box3d.py new file mode 100644 index 0000000..f328ff2 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/euler_box3d.py @@ -0,0 +1,410 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from pytorch3d.ops import box3d_overlap +from pytorch3d.transforms import euler_angles_to_matrix, matrix_to_euler_angles + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_euler + + +class EulerInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in Depth coordinates. + + Coordinates in Depth: + + .. code-block:: none + + up z y front (alpha=0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (alpha=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of y. 
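Because the stored center of a Depth box is the bottom center, ``enlarged_box`` above grows each size by ``2 * extra_width`` and lowers z by ``extra_width`` so the box expands symmetrically around its gravity center. A tiny numeric sketch with hypothetical values:

import torch

box = torch.tensor([[0., 0., 0., 1., 1., 1., 0.]])  # bottom center (0, 0, 0), unit sizes
extra = 0.1
enlarged = box.clone()
enlarged[:, 3:6] += extra * 2   # each dimension grows by 0.1 on both sides
enlarged[:, 2] -= extra         # bottom center drops, gravity center stays at z = 0.5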
+ Also note that rotation of DepthInstance3DBoxes is counterclockwise, + which is reverse to the definition of the yaw angle (clockwise). + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, + tensor, + box_dim=9, + with_yaw=True, + origin=(0.5, 0.5, 0.5)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32, + device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 3) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 3 + self.with_yaw = True # TODO + elif tensor.shape[-1] == 7: + assert box_dim == 7 + fake_euler = tensor.new_zeros(tensor.shape[0], 2) + tensor = torch.cat((tensor, fake_euler), dim=-1) + self.box_dim = box_dim + 2 + self.with_yaw = True + else: + assert tensor.shape[-1] == 9 + self.box_dim = box_dim + self.with_yaw = True # TODO + self.tensor = tensor.clone() + + self.origin = origin + if origin != (0.5, 0.5, 0.5): + dst = self.tensor.new_tensor((0.5, 0.5, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + def get_corners(self, tensor1): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + if tensor1.numel() == 0: + return torch.empty([0, 8, 3], device=tensor1.device) + + dims = tensor1[:, 3:6] + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin + assert self.origin == (0.5, 0.5, 0.5), \ + 'self.origin != (0.5, 0.5, 0.5) needs to be checked!' + corners_norm = corners_norm - dims.new_tensor(self.origin) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate + corners = rotation_3d_in_euler(corners, tensor1[:, 6:]) + + corners += tensor1[:, :3].view(-1, 1, 3) + return corners + + @classmethod + def overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate 3D overlaps of two boxes. + + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. 
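``EulerInstance3DBoxes`` stores gravity centers with origin (0.5, 0.5, 0.5); when a different origin is passed, ``__init__`` shifts the centers by ``size * (dst - src)``. A standalone sketch with a hypothetical bottom-center input:

import torch

# hypothetical 9-DoF box given in bottom-center convention, origin = (0.5, 0.5, 0)
raw = torch.tensor([[0., 0., 0., 2., 2., 2., 0., 0., 0.]])
src = raw.new_tensor((0.5, 0.5, 0.0))
dst = raw.new_tensor((0.5, 0.5, 0.5))
stored = raw.clone()
stored[:, :3] += stored[:, 3:6] * (dst - src)  # z center becomes 1.0, the gravity center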
+ boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated 3D overlaps of the boxes. + """ + assert isinstance(boxes1, EulerInstance3DBoxes) + assert isinstance(boxes2, EulerInstance3DBoxes) + assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ + f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' + + assert mode in ['iou'] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + corners1 = boxes1.corners + corners2 = boxes2.corners + _, iou3d = box3d_overlap(corners1, corners2, eps=1e-4) + return iou3d + + @property + def bottom_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + raise NotImplementedError('Not support') + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + return self.tensor[:, :3] + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin + assert self.origin == (0.5, 0.5, 0.5), \ + 'self.origin != (0.5, 0.5, 0.5) needs to be checked!' + corners_norm = corners_norm - dims.new_tensor(self.origin) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate + corners = rotation_3d_in_euler(corners, self.tensor[:, 6:]) + + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def transform(self, matrix): + if self.tensor.shape[0] == 0: + return + if not isinstance(matrix, torch.Tensor): + matrix = self.tensor.new_tensor(matrix) + points = self.tensor[:, :3] + constant = points.new_ones(points.shape[0], 1) + points_extend = torch.concat([points, constant], dim=-1) + points_trans = torch.matmul(points_extend, matrix.transpose(-2, + -1))[:, :3] + + size = self.tensor[:, 3:6] + + # angle_delta = matrix_to_euler_angles(matrix[:3,:3], 'ZXY') + # angle = self.tensor[:,6:] + angle_delta + ori_matrix = euler_angles_to_matrix(self.tensor[:, 6:], 'ZXY') + rot_matrix = matrix[:3, :3].expand_as(ori_matrix) + final = torch.bmm(rot_matrix, ori_matrix) + angle = matrix_to_euler_angles(final, 'ZXY') + + self.tensor = torch.cat([points_trans, size, angle], dim=-1) + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the + rotation matrix ``rot_mat_T``. 
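``transform()`` composes an extra rotation with the stored ZXY Euler angles by converting both to matrices, multiplying, and converting back. A minimal sketch of that composition for a pure yaw, assuming pytorch3d is available (it is already imported by this module) and using hypothetical angles:

import torch
from pytorch3d.transforms import euler_angles_to_matrix, matrix_to_euler_angles

angles = torch.tensor([[0.2, 0.0, 0.0]])                      # hypothetical box orientation
extra = euler_angles_to_matrix(torch.tensor([0.3, 0.0, 0.0]), 'ZXY')
ori = euler_angles_to_matrix(angles, 'ZXY')
combined = torch.bmm(extra.expand_as(ori), ori)
print(matrix_to_euler_angles(combined, 'ZXY'))                # ~ [[0.5, 0.0, 0.0]]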
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + + if angle.numel() == 1: # only given yaw angle for rotation + angle = self.tensor.new_tensor([0., 0., angle]) + rot_matrix = euler_angles_to_matrix(angle, 'ZXY') + elif angle.numel() == 3: + rot_matrix = euler_angles_to_matrix(angle, 'ZXY') + elif angle.shape == torch.Size([3, 3]): + rot_matrix = angle + else: + raise NotImplementedError + + rot_mat_T = rot_matrix.T + transform_matrix = torch.eye(4) + transform_matrix[:3, :3] = rot_matrix + self.transform(transform_matrix) + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip(self, direction='X', points=None): + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str, optional): Flip direction + (horizontal or vertical). Defaults to 'horizontal'. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert direction in ['X', 'Y', 'Z'] + if direction == 'X': + self.tensor[:, 0] = -self.tensor[:, 0] + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + self.tensor[:, 8] = -self.tensor[:, 8] + elif direction == 'Y': + self.tensor[:, 1] = -self.tensor[:, 1] + self.tensor[:, 6] = -self.tensor[:, 6] + self.tensor[:, 7] = -self.tensor[:, 7] + np.pi + elif direction == 'Z': + self.tensor[:, 2] = -self.tensor[:, 2] + self.tensor[:, 7] = -self.tensor[:, 7] + self.tensor[:, 8] = -self.tensor[:, 8] + np.pi + + if points is not None: + # assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if direction == 'X': + points[:, 0] = -points[:, 0] + elif direction == 'Y': + points[:, 1] = -points[:, 1] + elif direction == 'Z': + points[:, 2] = -points[:, 2] + else: + points.flip(direction) + return points + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`DepthInstance3DBoxes`: + The converted box of the same type in the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + assert dst == Box3DMode.EULER_DEPTH + return self + + def enlarged_box(self, extra_width): + """Enlarge the length, width and height boxes. + + Args: + extra_width (float | torch.Tensor): Extra width to enlarge the box. + + Returns: + :obj:`DepthInstance3DBoxes`: Enlarged boxes. + """ + raise NotImplementedError('enlarged box') + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def get_surface_line_center(self): + """Compute surface and line center of bounding boxes. 
+ + Returns: + torch.Tensor: Surface and line center of bounding boxes. + """ + raise NotImplementedError('surface line center') + obj_size = self.dims + center = self.gravity_center.view(-1, 1, 3) + batch_size = center.shape[0] + + rot_sin = torch.sin(-self.yaw) + rot_cos = torch.cos(-self.yaw) + rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) + rot_mat_T[..., 0, 0] = rot_cos + rot_mat_T[..., 0, 1] = -rot_sin + rot_mat_T[..., 1, 0] = rot_sin + rot_mat_T[..., 1, 1] = rot_cos + rot_mat_T[..., 2, 2] = 1 + + # Get the object surface center + offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], + [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) + offset = offset.view(1, 6, 3) / 2 + surface_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( + -1, 3) + + # Get the object line center + offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], + [0, -1, 1], [1, 0, -1], [-1, 0, -1], + [0, 1, -1], [0, -1, -1], [1, 1, 0], + [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) + offset = offset.view(1, 12, 3) / 2 + + line_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( + -1, 3) + + surface_rot = rot_mat_T.repeat(6, 1, 1) + surface_3d = torch.matmul(surface_3d.unsqueeze(-2), + surface_rot).squeeze(-2) + surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d + + line_rot = rot_mat_T.repeat(12, 1, 1) + line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) + line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d + + return surface_center, line_center diff --git a/embodiedscan/structures/bbox_3d/lidar_box3d.py b/embodiedscan/structures/bbox_3d/lidar_box3d.py new file mode 100644 index 0000000..438a200 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/lidar_box3d.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class LiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=0) + ^ ^ + | / + | / + (yaw=0.5*pi) left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at + the positive direction of x axis, and increases from the positive direction + of x to the positive direction of y. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + YAW_AXIS = 2 + + @property + def corners(self) -> Tensor: + """Convert boxes to corners in clockwise order, in the form of (x0y0z0, + x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0). + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y <------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + + Returns: + Tensor: A tensor with 8 corners of each box in shape (N, 8, 3). 
+ """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, + self.tensor[:, 6], + axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. + """ + if not isinstance(angle, Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. 
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + elif bev_direction == 'vertical': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + + if points is not None: + assert isinstance(points, (Tensor, np.ndarray, BasePoints)) + if isinstance(points, (Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 1] = -points[:, 1] + elif bev_direction == 'vertical': + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert(box=self, + src=Box3DMode.LIDAR, + dst=dst, + rt_mat=rt_mat, + correct_yaw=correct_yaw) + + def enlarged_box( + self, extra_width: Union[float, Tensor]) -> 'LiDARInstance3DBoxes': + """Enlarge the length, width and height of boxes. + + Args: + extra_width (float or Tensor): Extra width to enlarge the box. + + Returns: + :obj:`LiDARInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) diff --git a/embodiedscan/structures/bbox_3d/utils.py b/embodiedscan/structures/bbox_3d/utils.py new file mode 100644 index 0000000..3810051 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/utils.py @@ -0,0 +1,482 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from logging import warning +from typing import Tuple, Union + +import numpy as np +import torch +from mmdet3d.utils.array_converter import array_converter +from pytorch3d.transforms import euler_angles_to_matrix +from torch import Tensor + + +@array_converter(apply_to=('val', )) +def limit_period(val: Union[np.ndarray, Tensor], + offset: float = 0.5, + period: float = np.pi) -> Union[np.ndarray, Tensor]: + """Limit the value into a period for periodic function. + + Args: + val (np.ndarray or Tensor): The value to be converted. + offset (float): Offset to set the value range. Defaults to 0.5. + period (float): Period of the value. Defaults to np.pi. + + Returns: + np.ndarray or Tensor: Value in the range of + [-offset * period, (1-offset) * period]. + """ + limited_val = val - torch.floor(val / period + offset) * period + return limited_val + + +@array_converter(apply_to=('points', 'angles')) +def rotation_3d_in_euler(points, angles, return_mat=False, clockwise=False): + """Rotate points by angles according to axis. + + Args: + points (np.ndarray | torch.Tensor | list | tuple ): + Points of shape (N, M, 3). 
+ angles (np.ndarray | torch.Tensor | list | tuple): + Vector of angles in shape (N, 3) + return_mat: Whether or not return the rotation matrix (transposed). + Defaults to False. + clockwise: Whether the rotation is clockwise. Defaults to False. + + Raises: + ValueError: when the axis is not in range [0, 1, 2], it will + raise value error. + + Returns: + (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3). + """ + batch_free = len(points.shape) == 2 + if batch_free: + points = points[None] + + if len(angles.shape) == 1: + angles = angles.expand(points.shape[:1] + (3, )) + # angles = torch.full(points.shape[:1], angles) + + assert len(points.shape) == 3 and len(angles.shape) == 2 \ + and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \ + f'angles: {points.shape}, {angles.shape}' + + assert points.shape[-1] in [2, 3], \ + f'Points size should be 2 or 3 instead of {points.shape[-1]}' + + rot_mat_T = euler_angles_to_matrix(angles, 'ZXY') # N, 3,3 + rot_mat_T = rot_mat_T.transpose(-2, -1) + + if clockwise: + raise NotImplementedError('clockwise') + + if points.shape[0] == 0: + points_new = points + else: + points_new = torch.bmm(points, rot_mat_T) + + if batch_free: + points_new = points_new.squeeze(0) + + if return_mat: + if batch_free: + rot_mat_T = rot_mat_T.squeeze(0) + return points_new, rot_mat_T + else: + return points_new + + +@array_converter(apply_to=('points', 'angles')) +def rotation_3d_in_axis( + points: Union[np.ndarray, Tensor], + angles: Union[np.ndarray, Tensor, float], + axis: int = 0, + return_mat: bool = False, + clockwise: bool = False +) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Tensor, Tensor], np.ndarray, + Tensor]: + """Rotate points by angles according to axis. + + Args: + points (np.ndarray or Tensor): Points with shape (N, M, 3). + angles (np.ndarray or Tensor or float): Vector of angles with shape + (N, ). + axis (int): The axis to be rotated. Defaults to 0. + return_mat (bool): Whether or not to return the rotation matrix + (transposed). Defaults to False. + clockwise (bool): Whether the rotation is clockwise. Defaults to False. + + Raises: + ValueError: When the axis is not in range [-3, -2, -1, 0, 1, 2], it + will raise ValueError. + + Returns: + Tuple[np.ndarray, np.ndarray] or Tuple[Tensor, Tensor] or np.ndarray or + Tensor: Rotated points with shape (N, M, 3) and rotation matrix with + shape (N, 3, 3). 
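The core of ``rotation_3d_in_euler`` above is a batched multiplication with the transposed ZXY rotation matrix; for a pure yaw it reduces to a counterclockwise rotation about z. A sketch assuming pytorch3d is available, with hypothetical inputs:

import math
import torch
from pytorch3d.transforms import euler_angles_to_matrix

points = torch.tensor([[[1.0, 0.0, 0.0]]])              # shape (N=1, M=1, 3)
angles = torch.tensor([[math.pi / 2, 0.0, 0.0]])         # yaw only
rot_mat_T = euler_angles_to_matrix(angles, 'ZXY').transpose(-2, -1)
print(torch.bmm(points, rot_mat_T))                      # ~ [[[0., 1., 0.]]]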
+ """ + batch_free = len(points.shape) == 2 + if batch_free: + points = points[None] + + if isinstance(angles, float) or len(angles.shape) == 0: + angles = torch.full(points.shape[:1], angles) + + assert len(points.shape) == 3 and len(angles.shape) == 1 and \ + points.shape[0] == angles.shape[0], 'Incorrect shape of points ' \ + f'angles: {points.shape}, {angles.shape}' + + assert points.shape[-1] in [2, 3], \ + f'Points size should be 2 or 3 instead of {points.shape[-1]}' + + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + + if points.shape[-1] == 3: + if axis == 1 or axis == -2: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]) + ]) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, rot_sin, zeros]), + torch.stack([-rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]) + ]) + elif axis == 0 or axis == -3: + rot_mat_T = torch.stack([ + torch.stack([ones, zeros, zeros]), + torch.stack([zeros, rot_cos, rot_sin]), + torch.stack([zeros, -rot_sin, rot_cos]) + ]) + else: + raise ValueError( + f'axis should in range [-3, -2, -1, 0, 1, 2], got {axis}') + else: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, rot_sin]), + torch.stack([-rot_sin, rot_cos]) + ]) + + if clockwise: + rot_mat_T = rot_mat_T.transpose(0, 1) + + if points.shape[0] == 0: + points_new = points + else: + points_new = torch.einsum('aij,jka->aik', points, rot_mat_T) + + if batch_free: + points_new = points_new.squeeze(0) + + if return_mat: + rot_mat_T = torch.einsum('jka->ajk', rot_mat_T) + if batch_free: + rot_mat_T = rot_mat_T.squeeze(0) + return points_new, rot_mat_T + else: + return points_new + + +@array_converter(apply_to=('boxes_xywhr', )) +def xywhr2xyxyr( + boxes_xywhr: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]: + """Convert a rotated boxes in XYWHR format to XYXYR format. + + Args: + boxes_xywhr (Tensor or np.ndarray): Rotated boxes in XYWHR format. + + Returns: + Tensor or np.ndarray: Converted boxes in XYXYR format. + """ + boxes = torch.zeros_like(boxes_xywhr) + half_w = boxes_xywhr[..., 2] / 2 + half_h = boxes_xywhr[..., 3] / 2 + + boxes[..., 0] = boxes_xywhr[..., 0] - half_w + boxes[..., 1] = boxes_xywhr[..., 1] - half_h + boxes[..., 2] = boxes_xywhr[..., 0] + half_w + boxes[..., 3] = boxes_xywhr[..., 1] + half_h + boxes[..., 4] = boxes_xywhr[..., 4] + return boxes + + +def get_box_type(box_type: str) -> Tuple[type, int]: + """Get the type and mode of box structure. + + Args: + box_type (str): The type of box structure. The valid value are "LiDAR", + "Camera" and "Depth". + + Raises: + ValueError: A ValueError is raised when ``box_type`` does not belong to + the three valid types. + + Returns: + tuple: Box type and box mode. 
+ """ + from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes, + DepthInstance3DBoxes, EulerCameraInstance3DBoxes, + EulerDepthInstance3DBoxes, LiDARInstance3DBoxes) + box_type_lower = box_type.lower() + if box_type_lower == 'lidar': + box_type_3d = LiDARInstance3DBoxes + box_mode_3d = Box3DMode.LIDAR + elif box_type_lower == 'camera': + box_type_3d = CameraInstance3DBoxes + box_mode_3d = Box3DMode.CAM + elif box_type_lower == 'depth': + box_type_3d = DepthInstance3DBoxes + box_mode_3d = Box3DMode.DEPTH + elif box_type_lower == 'euler-depth': + box_type_3d = EulerDepthInstance3DBoxes + box_mode_3d = Box3DMode.EULER_DEPTH + elif box_type_lower == 'euler-camera': + box_type_3d = EulerCameraInstance3DBoxes + box_mode_3d = Box3DMode.EULER_CAM + else: + raise ValueError( + 'Only "box_type" of "camera", "lidar", "depth", "euler"' + f' are supported, got {box_type}') + + return box_type_3d, box_mode_3d + + +@array_converter(apply_to=('points_3d', 'proj_mat')) +def points_cam2img(points_3d: Union[Tensor, np.ndarray], + proj_mat: Union[Tensor, np.ndarray], + with_depth: bool = False) -> Union[Tensor, np.ndarray]: + """Project points in camera coordinates to image coordinates. + + Args: + points_3d (Tensor or np.ndarray): Points in shape (N, 3). + proj_mat (Tensor or np.ndarray): Transformation matrix between + coordinates. + with_depth (bool): Whether to keep depth in the output. + Defaults to False. + + Returns: + Tensor or np.ndarray: Points in image coordinates with shape [N, 2] if + ``with_depth=False``, else [N, 3]. + """ + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + + assert len(proj_mat.shape) == 2, \ + 'The dimension of the projection matrix should be 2 ' \ + f'instead of {len(proj_mat.shape)}.' + d1, d2 = proj_mat.shape[:2] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or \ + (d1 == 4 and d2 == 4), 'The shape of the projection matrix ' \ + f'({d1}*{d2}) is not supported.' + if d1 == 3: + proj_mat_expanded = torch.eye(4, + device=proj_mat.device, + dtype=proj_mat.dtype) + proj_mat_expanded[:d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + # previous implementation use new_zeros, new_one yields better results + points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) + + point_2d = points_4 @ proj_mat.T + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + + if with_depth: + point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + + return point_2d_res + + +@array_converter(apply_to=('points_3d', 'proj_mat')) +def batch_points_cam2img(points_3d, proj_mat, with_depth=False): + """Project points in camera coordinates to image coordinates. + + Args: + points_3d (torch.Tensor | np.ndarray): Points in shape (N, D, 3) + proj_mat (torch.Tensor | np.ndarray): + Transformation matrix between coordinates. + with_depth (bool, optional): Whether to keep depth in the output. + Defaults to False. + + Returns: + (torch.Tensor | np.ndarray): Points in image coordinates, + with shape [N, D, 2] if `with_depth=False`, else [N, D, 3]. + """ + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + + assert len(proj_mat.shape) == 3, 'The dimension of the projection'\ + f' matrix should be 2 instead of {len(proj_mat.shape)}.' + d0, d1, d2 = proj_mat.shape[:3] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( + d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ + f' ({d1}*{d2}) is not supported.' 
+ if d1 == 3: + proj_mat_expanded = torch.eye(4, + device=proj_mat.device, + dtype=proj_mat.dtype) + proj_mat_expanded = proj_mat_expanded[None, :, :].expand(d0, -1, -1) + proj_mat_expanded[:, :d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + # previous implementation use new_zeros, new_one yields better results + points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) + # do the batch wise operation + point_2d = torch.bmm(points_4, proj_mat.permute(0, 2, 1)) + # point_2d = points_4 @ proj_mat.T + + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3].clamp(min=1e-3) + + if with_depth: + point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + + return point_2d_res + + +@array_converter(apply_to=('points', 'cam2img')) +def points_img2cam( + points: Union[Tensor, np.ndarray], + cam2img: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]: + """Project points in image coordinates to camera coordinates. + + Args: + points (Tensor or np.ndarray): 2.5D points in 2D images with shape + [N, 3], 3 corresponds with x, y in the image and depth. + cam2img (Tensor or np.ndarray): Camera intrinsic matrix. The shape can + be [3, 3], [3, 4] or [4, 4]. + + Returns: + Tensor or np.ndarray: Points in 3D space with shape [N, 3], 3 + corresponds with x, y, z in 3D space. + """ + assert cam2img.shape[0] <= 4 + assert cam2img.shape[1] <= 4 + assert points.shape[1] == 3 + + xys = points[:, :2] + depths = points[:, 2].view(-1, 1) + unnormed_xys = torch.cat([xys * depths, depths], dim=1) + + pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device) + pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img + inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1) + + # Do operation in homogeneous coordinates. + num_points = unnormed_xys.shape[0] + homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1) + points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3] + + return points3D + + +def mono_cam_box2vis(cam_box): + """This is a post-processing function on the bboxes from Mono-3D task. If + we want to perform projection visualization, we need to: + + 1. rotate the box along x-axis for np.pi / 2 (roll) + 2. change orientation from local yaw to global yaw + 3. convert yaw by (np.pi / 2 - yaw) + + After applying this function, we can project and draw it on 2D images. + + Args: + cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate + system before conversion. Could be gt bbox loaded from dataset or + network prediction output. + + Returns: + :obj:`CameraInstance3DBoxes`: Box after conversion. + """ + warning.warn('DeprecationWarning: The hack of yaw and dimension in the ' + 'monocular 3D detection on nuScenes has been removed. The ' + 'function mono_cam_box2vis will be deprecated.') + from .cam_box3d import CameraInstance3DBoxes + assert isinstance(cam_box, CameraInstance3DBoxes), \ + 'input bbox should be CameraInstance3DBoxes!' 
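``points_img2cam`` above inverts that projection: the pixel coordinates are un-normalized by depth and multiplied by the transposed inverse of the padded intrinsic. Back-projecting the point from the previous sketch recovers the original camera-frame coordinates:

import torch

cam2img = torch.tensor([[500., 0., 320.], [0., 500., 240.], [0., 0., 1.]])
uvd = torch.tensor([[370.0, 340.0, 10.0]])             # (u, v, depth)
unnormed = torch.cat([uvd[:, :2] * uvd[:, 2:3], uvd[:, 2:3]], dim=1)
pad = torch.eye(4)
pad[:3, :3] = cam2img
homo = torch.cat([unnormed, uvd.new_ones(1, 1)], dim=1)
print((homo @ torch.inverse(pad).transpose(0, 1))[:, :3])   # ~ [[1., 2., 10.]]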
+ loc = cam_box.gravity_center + dim = cam_box.dims + yaw = cam_box.yaw + feats = cam_box.tensor[:, 7:] + # rotate along x-axis for np.pi / 2 + # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa + dim[:, [1, 2]] = dim[:, [2, 1]] + # change local yaw to global yaw for visualization + # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa + yaw += torch.atan2(loc[:, 0], loc[:, 2]) + # convert yaw by (-yaw - np.pi / 2) + # this is because mono 3D box class such as `NuScenesBox` has different + # definition of rotation with our `CameraInstance3DBoxes` + yaw = -yaw - np.pi / 2 + cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) + cam_box = CameraInstance3DBoxes(cam_box, + box_dim=cam_box.shape[-1], + origin=(0.5, 0.5, 0.5)) + + return cam_box + + +def get_proj_mat_by_coord_type(img_meta: dict, coord_type: str) -> Tensor: + """Obtain image features using points. + + Args: + img_meta (dict): Meta information. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Can be case- + insensitive. + + Returns: + Tensor: Transformation matrix. + """ + coord_type = coord_type.upper() + mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'} + assert coord_type in mapping.keys() + return img_meta[mapping[coord_type]] + + +def yaw2local(yaw: Tensor, loc: Tensor) -> Tensor: + """Transform global yaw to local yaw (alpha in kitti) in camera + coordinates, ranges from -pi to pi. + + Args: + yaw (Tensor): A vector with local yaw of each box in shape (N, ). + loc (Tensor): Gravity center of each box in shape (N, 3). + + Returns: + Tensor: Local yaw (alpha in kitti). + """ + local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2]) + larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False) + small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + local_yaw[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + local_yaw[small_idx] += 2 * np.pi + + return local_yaw + + +def get_lidar2img(cam2img: Tensor, lidar2cam: Tensor) -> Tensor: + """Get the projection matrix of lidar2img. + + Args: + cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix. + lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix. + + Returns: + Tensor: Transformation matrix with shape 4x4. + """ + if cam2img.shape == (3, 3): + temp = cam2img.new_zeros(4, 4) + temp[:3, :3] = cam2img + temp[3, 3] = 1 + cam2img = temp + + if lidar2cam.shape == (3, 3): + temp = lidar2cam.new_zeros(4, 4) + temp[:3, :3] = lidar2cam + temp[3, 3] = 1 + lidar2cam = temp + return torch.matmul(cam2img, lidar2cam) diff --git a/embodiedscan/structures/det3d_data_sample.py b/embodiedscan/structures/det3d_data_sample.py new file mode 100644 index 0000000..1081fc3 --- /dev/null +++ b/embodiedscan/structures/det3d_data_sample.py @@ -0,0 +1,237 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import torch +from mmdet.structures import DetDataSample +from mmengine.structures import InstanceData, PixelData + +from .point_data import PointData + + +class Det3DDataSample(DetDataSample): + """A data structure interface of MMDetection3D. They are used as interfaces + between different components. + + The attributes in ``Det3DDataSample`` are divided into several parts: + + - ``proposals`` (InstanceData): Region proposals used in two-stage + detectors. 
+ - ``ignored_instances`` (InstanceData): Instances to be ignored during + training/testing. + - ``gt_instances_3d`` (InstanceData): Ground truth of 3D instance + annotations. + - ``gt_instances`` (InstanceData): Ground truth of 2D instance + annotations. + - ``pred_instances_3d`` (InstanceData): 3D instances of model + predictions. + - For point-cloud 3D object detection task whose input modality is + `use_lidar=True, use_camera=False`, the 3D predictions results are + saved in `pred_instances_3d`. + - For vision-only (monocular/multi-view) 3D object detection task + whose input modality is `use_lidar=False, use_camera=True`, the 3D + predictions are saved in `pred_instances_3d`. + - ``pred_instances`` (InstanceData): 2D instances of model predictions. + - For multi-modality 3D detection task whose input modality is + `use_lidar=True, use_camera=True`, the 2D predictions are saved in + `pred_instances`. + - ``pts_pred_instances_3d`` (InstanceData): 3D instances of model + predictions based on point cloud. + - For multi-modality 3D detection task whose input modality is + `use_lidar=True, use_camera=True`, the 3D predictions based on + point cloud are saved in `pts_pred_instances_3d` to distinguish + with `img_pred_instances_3d` which based on image. + - ``img_pred_instances_3d`` (InstanceData): 3D instances of model + predictions based on image. + - For multi-modality 3D detection task whose input modality is + `use_lidar=True, use_camera=True`, the 3D predictions based on + image are saved in `img_pred_instances_3d` to distinguish with + `pts_pred_instances_3d` which based on point cloud. + - ``gt_pts_seg`` (PointData): Ground truth of point cloud segmentation. + - ``pred_pts_seg`` (PointData): Prediction of point cloud segmentation. + - ``eval_ann_info`` (dict or None): Raw annotation, which will be + passed to evaluator and do the online evaluation. + + Examples: + >>> import torch + >>> from mmengine.structures import InstanceData + + >>> from mmdet3d.structures import Det3DDataSample + >>> from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes + + >>> data_sample = Det3DDataSample() + >>> meta_info = dict( + ... img_shape=(800, 1196, 3), + ... pad_shape=(800, 1216, 3)) + >>> gt_instances_3d = InstanceData(metainfo=meta_info) + >>> gt_instances_3d.bboxes_3d = BaseInstance3DBoxes(torch.rand((5, 7))) + >>> gt_instances_3d.labels_3d = torch.randint(0, 3, (5,)) + >>> data_sample.gt_instances_3d = gt_instances_3d + >>> assert 'img_shape' in data_sample.gt_instances_3d.metainfo_keys() + >>> len(data_sample.gt_instances_3d) + 5 + >>> print(data_sample) + + ) at 0x7f7e2a0e8640> + >>> pred_instances = InstanceData(metainfo=meta_info) + >>> pred_instances.bboxes = torch.rand((5, 4)) + >>> pred_instances.scores = torch.rand((5, )) + >>> data_sample = Det3DDataSample(pred_instances=pred_instances) + >>> assert 'pred_instances' in data_sample + + >>> pred_instances_3d = InstanceData(metainfo=meta_info) + >>> pred_instances_3d.bboxes_3d = BaseInstance3DBoxes( + ... torch.rand((5, 7))) + >>> pred_instances_3d.scores_3d = torch.rand((5, )) + >>> pred_instances_3d.labels_3d = torch.rand((5, )) + >>> data_sample = Det3DDataSample(pred_instances_3d=pred_instances_3d) + >>> assert 'pred_instances_3d' in data_sample + + >>> data_sample = Det3DDataSample() + >>> gt_instances_3d_data = dict( + ... bboxes_3d=BaseInstance3DBoxes(torch.rand((2, 7))), + ... 
labels_3d=torch.rand(2)) + >>> gt_instances_3d = InstanceData(**gt_instances_3d_data) + >>> data_sample.gt_instances_3d = gt_instances_3d + >>> assert 'gt_instances_3d' in data_sample + >>> assert 'bboxes_3d' in data_sample.gt_instances_3d + + >>> from mmdet3d.structures import PointData + >>> data_sample = Det3DDataSample() + >>> gt_pts_seg_data = dict( + ... pts_instance_mask=torch.rand(2), + ... pts_semantic_mask=torch.rand(2)) + >>> data_sample.gt_pts_seg = PointData(**gt_pts_seg_data) + >>> print(data_sample) + + ) at 0x7f7e29ff0d60> + """ # noqa: E501 + + @property + def gt_instances_3d(self) -> InstanceData: + return self._gt_instances_3d + + @gt_instances_3d.setter + def gt_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_gt_instances_3d', dtype=InstanceData) + + @gt_instances_3d.deleter + def gt_instances_3d(self) -> None: + del self._gt_instances_3d + + @property + def pred_instances_3d(self) -> InstanceData: + return self._pred_instances_3d + + @pred_instances_3d.setter + def pred_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_pred_instances_3d', dtype=InstanceData) + + @pred_instances_3d.deleter + def pred_instances_3d(self) -> None: + del self._pred_instances_3d + + @property + def pts_pred_instances_3d(self) -> InstanceData: + return self._pts_pred_instances_3d + + @pts_pred_instances_3d.setter + def pts_pred_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_pts_pred_instances_3d', dtype=InstanceData) + + @pts_pred_instances_3d.deleter + def pts_pred_instances_3d(self) -> None: + del self._pts_pred_instances_3d + + @property + def img_pred_instances_3d(self) -> InstanceData: + return self._img_pred_instances_3d + + @img_pred_instances_3d.setter + def img_pred_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_img_pred_instances_3d', dtype=InstanceData) + + @img_pred_instances_3d.deleter + def img_pred_instances_3d(self) -> None: + del self._img_pred_instances_3d + + @property + def gt_pts_seg(self) -> PointData: + return self._gt_pts_seg + + @gt_pts_seg.setter + def gt_pts_seg(self, value: PointData) -> None: + self.set_field(value, '_gt_pts_seg', dtype=PointData) + + @gt_pts_seg.deleter + def gt_pts_seg(self) -> None: + del self._gt_pts_seg + + @property + def pred_pts_seg(self) -> PointData: + return self._pred_pts_seg + + @pred_pts_seg.setter + def pred_pts_seg(self, value: PointData) -> None: + self.set_field(value, '_pred_pts_seg', dtype=PointData) + + @pred_pts_seg.deleter + def pred_pts_seg(self) -> None: + del self._pred_pts_seg + + @property + def gt_depth_map(self) -> PixelData: + return self._gt_depth_map + + @gt_depth_map.setter + def gt_depth_map(self, value: PixelData) -> None: + self.set_field(value, '_gt_depth_map', dtype=PixelData) + + @gt_depth_map.deleter + def gt_depth_map(self) -> None: + del self._gt_depth_map + + @property + def pred_depth_map(self) -> PixelData: + return self._pred_depth_map + + @pred_depth_map.setter + def pred_depth_map(self, value: PixelData) -> None: + self.set_field(value, '_pred_depth_map', dtype=PixelData) + + @pred_depth_map.deleter + def pred_depth_map(self) -> None: + del self._pred_depth_map + + +SampleList = List[Det3DDataSample] +OptSampleList = Optional[SampleList] +ForwardResults = Union[Dict[str, torch.Tensor], List[Det3DDataSample], + Tuple[torch.Tensor], torch.Tensor] diff --git a/embodiedscan/structures/ops/__init__.py b/embodiedscan/structures/ops/__init__.py new file mode 100644 index 0000000..d71ec30 --- 
/dev/null +++ b/embodiedscan/structures/ops/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# yapf:disable +from .box_np_ops import (box2d_to_corner_jit, box3d_to_bbox, + box_camera_to_lidar, boxes3d_to_corners3d_lidar, + camera_to_lidar, center_to_corner_box2d, + center_to_corner_box3d, center_to_minmax_2d, + corner_to_standup_nd_jit, corner_to_surfaces_3d, + corner_to_surfaces_3d_jit, corners_nd, + create_anchors_3d_range, depth_to_lidar_points, + depth_to_points, get_frustum, iou_jit, + minmax_to_corner_2d, points_in_convex_polygon_3d_jit, + points_in_convex_polygon_jit, points_in_rbbox, + projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox, + remove_outside_points, rotation_points_single_angle, + surface_equ_3d) +# yapf:enable +from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, + BboxOverlapsNearest3D, + axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) +from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back + +__all__ = [ + 'box2d_to_corner_jit', 'box3d_to_bbox', 'box_camera_to_lidar', + 'boxes3d_to_corners3d_lidar', 'camera_to_lidar', 'center_to_corner_box2d', + 'center_to_corner_box3d', 'center_to_minmax_2d', + 'corner_to_standup_nd_jit', 'corner_to_surfaces_3d', + 'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range', + 'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit', + 'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit', + 'points_in_convex_polygon_jit', 'points_in_rbbox', + 'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox', + 'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d', + 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', + 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', + 'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi', + 'bbox3d2result' +] diff --git a/embodiedscan/structures/ops/box_np_ops.py b/embodiedscan/structures/ops/box_np_ops.py new file mode 100644 index 0000000..9189103 --- /dev/null +++ b/embodiedscan/structures/ops/box_np_ops.py @@ -0,0 +1,838 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# TODO: clean the functions in this file and move the APIs into box bbox_3d +# in the future +# NOTICE: All functions in this file are valid for LiDAR or depth boxes only +# if we use default parameters. + +import numba +import numpy as np +from mmdet3d.structures.bbox_3d import (limit_period, points_cam2img, + rotation_3d_in_axis) + + +def camera_to_lidar(points, r_rect, velo2cam): + """Convert points in camera coordinate to lidar coordinate. + + Note: + This function is for KITTI only. + + Args: + points (np.ndarray, shape=[N, 3]): Points in camera coordinate. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Points in lidar coordinate. + """ + points_shape = list(points.shape[0:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) + return lidar_points[..., :3] + + +def box_camera_to_lidar(data, r_rect, velo2cam): + """Convert boxes in camera coordinate to lidar coordinate. + + Note: + This function is for KITTI only. + + Args: + data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. 
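``camera_to_lidar`` above applies the inverse of the rectified extrinsic to homogeneous row vectors. A sketch with a hypothetical axis-permutation extrinsic (camera x right / y down / z forward mapped to lidar x forward / y left / z up) and no rectification:

import numpy as np

velo2cam = np.array([[0., -1., 0., 0.],
                     [0., 0., -1., 0.],
                     [1., 0., 0., 0.],
                     [0., 0., 0., 1.]])
r_rect = np.eye(4)
pts_cam = np.array([[0.0, -1.7, 5.0, 1.0]])            # homogeneous camera-frame point
print(pts_cam @ np.linalg.inv((r_rect @ velo2cam).T))  # ~ [[5., 0., 1.7, 1.]]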
+ r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. + """ + xyz = data[:, 0:3] + x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) + # yaw and dims also needs to be converted + r_new = -r - np.pi / 2 + r_new = limit_period(r_new, period=np.pi * 2) + return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1) + + +def corners_nd(dims, origin=0.5): + """Generate relative box corners based on length per dim and origin point. + + Args: + dims (np.ndarray, shape=[N, ndim]): Array of length per dim + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5 + + Returns: + np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1. + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack(np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. + if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """Convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(counterclockwise when positive) + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 2). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 2). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). Defaults to None. + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Corners with the shape of (N, 4, 2). + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +@numba.jit(nopython=True) +def depth_to_points(depth, trunc_pixel): + """Convert depth map to points. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + + Returns: + np.ndarray: Points in camera coordinates. 
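In 2D, ``corners_nd`` above yields the four clockwise corners relative to the chosen origin. A hand-checked sketch for one hypothetical 4 x 2 box around its center:

import numpy as np

dims = np.array([[4.0, 2.0]])
ndim = dims.shape[1]
corners_norm = np.stack(np.unravel_index(np.arange(2**ndim), [2] * ndim),
                        axis=1).astype(dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2]] - 0.5        # clockwise, origin = 0.5 (center)
corners = dims.reshape(-1, 1, ndim) * corners_norm.reshape(1, 2**ndim, ndim)
print(corners[0])   # [[-2., -1.], [-2., 1.], [2., 1.], [2., -1.]]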
+ """ + num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) + points = np.zeros((num_pts, 3), dtype=depth.dtype) + x = np.array([0, 0, 1], dtype=depth.dtype) + k = 0 + for i in range(trunc_pixel, depth.shape[0]): + for j in range(depth.shape[1]): + if depth[i, j] > 0.1: + x = np.array([j, i, 1], dtype=depth.dtype) + points[k] = x * depth[i, j] + k += 1 + return points + + +def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): + """Convert depth map to points in lidar coordinate. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray: Points in lidar coordinates. + """ + pts = depth_to_points(depth, trunc_pixel) + points_shape = list(pts.shape[0:-1]) + points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) + points = points @ np.linalg.inv(P2.T) + lidar_points = camera_to_lidar(points, r_rect, velo2cam) + return lidar_points + + +def center_to_corner_box3d(centers, + dims, + angles=None, + origin=(0.5, 1.0, 0.5), + axis=1): + """Convert kitti locations, dimensions and angles to corners. + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 3). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). Defaults to None. + origin (list or array or float, optional): Origin point relate to + smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) + in lidar. Defaults to (0.5, 1.0, 0.5). + axis (int, optional): Rotation axis. 1 for camera and 2 for lidar. + Defaults to 1. + + Returns: + np.ndarray: Corners with the shape of (N, 8, 3). + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.reshape([-1, 1, 3]) + return corners + + +@numba.jit(nopython=True) +def box2d_to_corner_jit(boxes): + """Convert box2d to corner. + + Args: + boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. + + Returns: + box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. + """ + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( + 1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +@numba.njit +def corner_to_standup_nd_jit(boxes_corner): + """Convert boxes_corner to aligned (min-max) boxes. + + Args: + boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. 
+ + Returns: + np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. + """ + num_boxes = boxes_corner.shape[0] + ndim = boxes_corner.shape[-1] + result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) + for i in range(num_boxes): + for j in range(ndim): + result[i, j] = np.min(boxes_corner[i, :, j]) + for j in range(ndim): + result[i, j + ndim] = np.max(boxes_corner[i, :, j]) + return result + + +@numba.jit(nopython=True) +def corner_to_surfaces_3d_jit(corners): + """Convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). + """ + # box_corners: [N, 8, 3], must from corner functions in this module + num_boxes = corners.shape[0] + surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) + corner_idxes = np.array([ + 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 + ]).reshape(6, 4) + for i in range(num_boxes): + for j in range(6): + for k in range(4): + surfaces[i, j, k] = corners[i, corner_idxes[j, k]] + return surfaces + + +def rotation_points_single_angle(points, angle, axis=0): + """Rotate points with a single angle. + + Args: + points (np.ndarray, shape=[N, 3]]): + angle (np.ndarray, shape=[1]]): + axis (int, optional): Axis to rotate at. Defaults to 0. + + Returns: + np.ndarray: Rotated points. + """ + # points: [N, 3] + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + if axis == 1: + rot_mat_T = np.array( + [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]], + dtype=points.dtype) + elif axis == 2 or axis == -1: + rot_mat_T = np.array( + [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]], + dtype=points.dtype) + elif axis == 0: + rot_mat_T = np.array( + [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]], + dtype=points.dtype) + else: + raise ValueError('axis should in range') + + return points @ rot_mat_T, rot_mat_T + + +def box3d_to_bbox(box3d, P2): + """Convert box3d in camera coordinates to bbox in image coordinates. + + Args: + box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. + P2 (np.array, shape=[4, 4]): Intrinsics of Camera2. + + Returns: + np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates. + """ + box_corners = center_to_corner_box3d(box3d[:, :3], + box3d[:, 3:6], + box3d[:, 6], [0.5, 1.0, 0.5], + axis=1) + box_corners_in_image = points_cam2img(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = np.min(box_corners_in_image, axis=1) + maxxy = np.max(box_corners_in_image, axis=1) + bbox = np.concatenate([minxy, maxxy], axis=1) + return bbox + + +def corner_to_surfaces_3d(corners): + """convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3D box corners with shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). 
+ """ + # box_corners: [N, 8, 3], must from corner functions in this module + surfaces = np.array([ + [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], + [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], + [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], + [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], + [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], + [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], + ]).transpose([2, 0, 1, 3]) + return surfaces + + +def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): + """Check points in rotated bbox and return indices. + + Note: + This function is for counterclockwise boxes. + + Args: + points (np.ndarray, shape=[N, 3+dim]): Points to query. + rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation. + z_axis (int, optional): Indicate which axis is height. + Defaults to 2. + origin (tuple[int], optional): Indicate the position of + box center. Defaults to (0.5, 0.5, 0). + + Returns: + np.ndarray, shape=[N, M]: Indices of points in each box. + """ + # TODO: this function is different from PointCloud3D, be careful + # when start to use nuscene, check the input + rbbox_corners = center_to_corner_box3d(rbbox[:, :3], + rbbox[:, 3:6], + rbbox[:, 6], + origin=origin, + axis=z_axis) + surfaces = corner_to_surfaces_3d(rbbox_corners) + indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return indices + + +def minmax_to_corner_2d(minmax_box): + """Convert minmax box to corners2d. + + Args: + minmax_box (np.ndarray, shape=[N, dims]): minmax boxes. + + Returns: + np.ndarray: 2d corners of boxes + """ + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box2d(center, dims, origin=0.0) + + +def create_anchors_3d_range(feature_size, + anchor_range, + sizes=((3.9, 1.6, 1.56), ), + rotations=(0, np.pi / 2), + dtype=np.float32): + """Create anchors 3d by range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list of a tuple of [D, H, W](in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to ((3.9, 1.6, 1.56), ). + rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to (0, np.pi / 2). + dtype (type, optional): Data type. Defaults to np.float32. + + Returns: + np.ndarray: Range based anchors with shape of + (*feature_size, num_sizes, num_rots, 7). 
+ """ + anchor_range = np.array(anchor_range, dtype) + z_centers = np.linspace(anchor_range[2], + anchor_range[5], + feature_size[0], + dtype=dtype) + y_centers = np.linspace(anchor_range[1], + anchor_range[4], + feature_size[1], + dtype=dtype) + x_centers = np.linspace(anchor_range[0], + anchor_range[3], + feature_size[2], + dtype=dtype) + sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) + rotations = np.array(rotations, dtype=dtype) + rets = np.meshgrid(x_centers, + y_centers, + z_centers, + rotations, + indexing='ij') + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) + rets[i] = rets[i][..., np.newaxis] # for concat + sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = np.tile(sizes, tile_size_shape) + rets.insert(3, sizes) + ret = np.concatenate(rets, axis=-1) + return np.transpose(ret, [2, 1, 0, 3, 4, 5]) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + """Center to minmax. + + Args: + centers (np.ndarray): Center points. + dims (np.ndarray): Dimensions. + origin (list or array or float, optional): Origin point relate + to smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Minmax points. + """ + if origin == 0.5: + return np.concatenate([centers - dims / 2, centers + dims / 2], + axis=-1) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. + + Args: + rbboxes (np.ndarray): Rotated bboxes with shape of + (N, 5(x, y, xdim, ydim, rad)). + + Returns: + np.ndarray: Bounding boxes with the shape of + (N, 4(xmin, ymin, xmax, ymax)). + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] + bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +@numba.jit(nopython=True) +def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): + """Calculate box iou. Note that jit version runs ~10x faster than the + box_overlaps function in mmdet3d.core.evaluation. + + Note: + This function is for counterclockwise boxes. + + Args: + boxes (np.ndarray): Input bounding boxes with shape of (N, 4). + query_boxes (np.ndarray): Query boxes with shape of (K, 4). + mode (str, optional): IoU mode. Defaults to 'iou'. + eps (float, optional): Value added to denominator. Defaults to 0. + + Returns: + np.ndarray: Overlap between boxes and query_boxes + with the shape of [N, K]. 
+ """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * + (query_boxes[k, 3] - query_boxes[k, 1] + eps)) + for n in range(N): + iw = (min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + eps) + if iw > 0: + ih = (min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + eps) + if ih > 0: + if mode == 'iou': + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps) + box_area - + iw * ih) + else: + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps)) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def projection_matrix_to_CRT_kitti(proj): + """Split projection matrix of KITTI. + + Note: + This function is for KITTI only. + + P = C @ [R|T] + C is upper triangular matrix, so we need to inverse CR and use QR + stable for all kitti camera projection matrix. + + Args: + proj (p.array, shape=[4, 4]): Intrinsics of camera. + + Returns: + tuple[np.ndarray]: Splited matrix of C, R and T. + """ + + CR = proj[0:3, 0:3] + CT = proj[0:3, 3] + RinvCinv = np.linalg.inv(CR) + Rinv, Cinv = np.linalg.qr(RinvCinv) + C = np.linalg.inv(Cinv) + R = np.linalg.inv(Rinv) + T = Cinv @ CT + return C, R, T + + +def remove_outside_points(points, rect, Trv2c, P2, image_shape): + """Remove points which are outside of image. + + Note: + This function is for KITTI only. + + Args: + points (np.ndarray, shape=[N, 3+dims]): Total points. + rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. + image_shape (list[int]): Shape of image. + + Returns: + np.ndarray, shape=[N, 3+dims]: Filtered points. + """ + # 5x faster than remove_outside_points_v1(2ms vs 10ms) + C, R, T = projection_matrix_to_CRT_kitti(P2) + image_bbox = [0, 0, image_shape[1], image_shape[0]] + frustum = get_frustum(image_bbox, C) + frustum -= T + frustum = np.linalg.inv(R) @ frustum.T + frustum = camera_to_lidar(frustum.T, rect, Trv2c) + frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) + indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) + points = points[indices.reshape([-1])] + return points + + +def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): + """Get frustum corners in camera coordinates. + + Args: + bbox_image (list[int]): box in image coordinates. + C (np.ndarray): Intrinsics. + near_clip (float, optional): Nearest distance of frustum. + Defaults to 0.001. + far_clip (float, optional): Farthest distance of frustum. + Defaults to 100. + + Returns: + np.ndarray, shape=[8, 3]: coordinates of frustum corners. 
+ """ + fku = C[0, 0] + fkv = -C[1, 1] + u0v0 = C[0:2, 2] + z_points = np.array([near_clip] * 4 + [far_clip] * 4, + dtype=C.dtype)[:, np.newaxis] + b = bbox_image + box_corners = np.array( + [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], + dtype=C.dtype) + near_box_corners = (box_corners - u0v0) / np.array( + [fku / near_clip, -fkv / near_clip], dtype=C.dtype) + far_box_corners = (box_corners - u0v0) / np.array( + [fku / far_clip, -fkv / far_clip], dtype=C.dtype) + ret_xy = np.concatenate([near_box_corners, far_box_corners], + axis=0) # [8, 2] + ret_xyz = np.concatenate([ret_xy, z_points], axis=1) + return ret_xyz + + +def surface_equ_3d(polygon_surfaces): + """ + + Args: + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + [num_polygon, max_num_surfaces, max_num_points_of_surface, 3]. + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + + Returns: + tuple: normal vector and its direction. + """ + # return [a, b, c], d in ax+by+cz+d=0 + # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] + surface_vec = polygon_surfaces[:, :, :2, :] - \ + polygon_surfaces[:, :, 1:3, :] + # normal_vec: [..., 3] + normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) + # print(normal_vec.shape, points[..., 0, :].shape) + # d = -np.inner(normal_vec, points[..., 0, :]) + d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) + return normal_vec, -d + + +@numba.njit +def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, + num_surfaces): + """ + Args: + points (np.ndarray): Input points with shape of (num_points, 3). + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + normal_vec (np.ndarray): Normal vector of polygon_surfaces. + d (int): Directions of normal vector. + num_surfaces (np.ndarray): Number of surfaces a polygon contains + shape of (num_polygon). + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = (points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) + if sign >= 0: + ret[i, j] = False + break + return ret + + +def points_in_convex_polygon_3d_jit(points, + polygon_surfaces, + num_surfaces=None): + """Check points is in 3d convex polygons. + + Args: + points (np.ndarray): Input points with shape of (num_points, 3). + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + num_surfaces (np.ndarray, optional): Number of surfaces a polygon + contains shape of (num_polygon). Defaults to None. + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. 
+ """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + # num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, + normal_vec, d, num_surfaces) + + +@numba.njit +def points_in_convex_polygon_jit(points, polygon, clockwise=False): + """Check points is in 2d convex polygons. True when point in polygon. + + Args: + points (np.ndarray): Input points with the shape of [num_points, 2]. + polygon (np.ndarray): Input polygon with the shape of + [num_polygon, num_points_of_polygon, 2]. + clockwise (bool, optional): Indicate polygon is clockwise. Defaults + to True. + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. + """ + # first convert polygon to directed lines + num_points_of_polygon = polygon.shape[1] + num_points = points.shape[0] + num_polygons = polygon.shape[0] + # vec for all the polygons + if clockwise: + vec1 = polygon - polygon[:, + np.array([num_points_of_polygon - 1] + + list(range(num_points_of_polygon - + 1))), :] + else: + vec1 = polygon[:, + np.array([num_points_of_polygon - 1] + + list(range(num_points_of_polygon - + 1))), :] - polygon + ret = np.zeros((num_points, num_polygons), dtype=np.bool_) + success = True + cross = 0.0 + for i in range(num_points): + for j in range(num_polygons): + success = True + for k in range(num_points_of_polygon): + vec = vec1[j, k] + cross = vec[1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec[0] * (polygon[j, k, 1] - points[i, 1]) + if cross >= 0: + success = False + break + ret[i, j] = success + return ret + + +def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): + """Convert kitti center boxes to corners. + + 7 -------- 4 + /| /| + 6 -------- 5 . + | | | | + . 3 -------- 0 + |/ |/ + 2 -------- 1 + + Note: + This function is for LiDAR boxes only. + + Args: + boxes3d (np.ndarray): Boxes with shape of (N, 7) + [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords, + see the definition of ry in KITTI dataset. + bottom_center (bool, optional): Whether z is on the bottom center + of object. Defaults to True. + + Returns: + np.ndarray: Box corners with the shape of [N, 8, 3]. + """ + boxes_num = boxes3d.shape[0] + x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] + x_corners = np.array([ + x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2., + -x_size / 2., -x_size / 2., x_size / 2. + ], + dtype=np.float32).T + y_corners = np.array([ + -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2., + -y_size / 2., y_size / 2., y_size / 2. + ], + dtype=np.float32).T + if bottom_center: + z_corners = np.zeros((boxes_num, 8), dtype=np.float32) + z_corners[:, 4:8] = z_size.reshape(boxes_num, + 1).repeat(4, axis=1) # (N, 8) + else: + z_corners = np.array([ + -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2., + z_size / 2., z_size / 2., z_size / 2., z_size / 2. 
+ ], + dtype=np.float32).T + + ry = boxes3d[:, 6] + zeros, ones = np.zeros(ry.size, + dtype=np.float32), np.ones(ry.size, + dtype=np.float32) + rot_list = np.array([[np.cos(ry), np.sin(ry), zeros], + [-np.sin(ry), np.cos(ry), zeros], + [zeros, zeros, ones]]) # (3, 3, N) + R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) + + temp_corners = np.concatenate((x_corners.reshape( + -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)), + axis=2) # (N, 8, 3) + rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3) + x_corners = rotated_corners[:, :, 0] + y_corners = rotated_corners[:, :, 1] + z_corners = rotated_corners[:, :, 2] + + x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2] + + x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8) + y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8) + z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8) + + corners = np.concatenate( + (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)), + axis=2) + + return corners.astype(np.float32) diff --git a/embodiedscan/structures/ops/iou3d_calculator.py b/embodiedscan/structures/ops/iou3d_calculator.py new file mode 100644 index 0000000..f2475eb --- /dev/null +++ b/embodiedscan/structures/ops/iou3d_calculator.py @@ -0,0 +1,330 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdet3d.registry import TASK_UTILS +from mmdet3d.structures.bbox_3d import get_box_type +from mmdet.structures.bbox import bbox_overlaps + + +@TASK_UTILS.register_module() +class BboxOverlapsNearest3D(object): + """Nearest 3D IoU Calculator. + + Note: + This IoU calculator first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + + Args: + coordinate (str): 'camera', 'lidar', or 'depth' coordinate system. + """ + + def __init__(self, coordinate='lidar'): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate nearest 3D IoU. + + Note: + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise it calculates the ious + between each aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): shape (N, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. + bboxes2 (torch.Tensor): shape (M, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned. + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is + ``False``, return shape is M. + """ + return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, + self.coordinate) + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +@TASK_UTILS.register_module() +class BboxOverlaps3D(object): + """3D IoU Calculator. + + Args: + coordinate (str): The coordinate system, valid options are + 'camera', 'lidar', and 'depth'. + """ + + def __init__(self, coordinate): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculate the IoU of 3D boxes based on their volumes. 
+ IoU calculator ``:class:BboxOverlaps3D`` uses this function to + calculate the actual 3D IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 + with shape (M, N) (aligned mode is not supported currently). + """ + return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate) + + def __repr__(self): + """str: return a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +def bbox_overlaps_nearest_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + coordinate='lidar'): + """Calculate nearest 3D IoU. + + Note: + This function first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + This IoU calculator :class:`BboxOverlapsNearest3D` uses this + function to calculate IoUs of boxes. + + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise the ious between each + aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is + ``False``, return shape is M. + """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + # Change the bboxes to bev + # box conversion and iou calculation in torch version on CUDA + # is 10x faster than that in numpy version + bboxes1_bev = bboxes1.nearest_bev + bboxes2_bev = bboxes2.nearest_bev + + ret = bbox_overlaps(bboxes1_bev, + bboxes2_bev, + mode=mode, + is_aligned=is_aligned) + return ret + + +def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculates the IoU of 3D boxes based on their volumes. + IoU calculator :class:`BboxOverlaps3D` uses this function to + calculate the actual IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + coordinate (str): 'camera' or 'lidar' coordinate system. + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 + with shape (M, N) (aligned mode is not supported currently). 
+ """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + return bboxes1.overlaps(bboxes1, bboxes2, mode=mode) + + +@TASK_UTILS.register_module() +class AxisAlignedBboxOverlaps3D(object): + """Axis-aligned 3D Overlaps (IoU) Calculator.""" + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Defaults to False. + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + assert bboxes1.size(-1) == bboxes2.size(-1) == 6 + return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode, + is_aligned) + + def __repr__(self): + """str: a string describing the module""" + repr_str = self.__class__.__name__ + '()' + return repr_str + + +def axis_aligned_bbox_overlaps_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + eps=1e-6): + """Calculate overlap between two set of axis aligned 3D bboxes. If + ``is_aligned`` is ``False``, then calculate the overlaps between each bbox + of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Defaults to False. + eps (float, optional): A value added to the denominator for numerical + stability. Defaults to 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 10, 10], + >>> [10, 10, 10, 20, 20, 20], + >>> [32, 32, 32, 38, 40, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 20, 20], + >>> [0, 10, 10, 10, 19, 20], + >>> [10, 10, 10, 20, 20, 20], + >>> ]) + >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + Example: + >>> empty = torch.empty(0, 6) + >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes's last dimension is 6 + assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... 
Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 3] - bboxes1[..., 0]) * ( + bboxes1[..., 4] - bboxes1[..., 1]) * (bboxes1[..., 5] - + bboxes1[..., 2]) + area2 = (bboxes2[..., 3] - bboxes2[..., 0]) * ( + bboxes2[..., 4] - bboxes2[..., 1]) * (bboxes2[..., 5] - + bboxes2[..., 2]) + + if is_aligned: + lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3] + rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, 2] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3]) + enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:]) + else: + lt = torch.max(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) # [B, rows, cols, 3] + rb = torch.min(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) # [B, rows, cols, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) + enclosed_rb = torch.max(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou']: + return ious + # calculate gious + enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/embodiedscan/structures/ops/transforms.py b/embodiedscan/structures/ops/transforms.py new file mode 100644 index 0000000..491b791 --- /dev/null +++ b/embodiedscan/structures/ops/transforms.py @@ -0,0 +1,76 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical): + """Map bboxes from testing scale to original image scale. + + Args: + bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. + scale_factor (float): Scale factor. + flip_horizontal (bool): Whether to flip horizontally. + flip_vertical (bool): Whether to flip vertically. + + Returns: + :obj:`BaseInstance3DBoxes`: Boxes mapped back. + """ + new_bboxes = bboxes.clone() + if flip_horizontal: + new_bboxes.flip('horizontal') + if flip_vertical: + new_bboxes.flip('vertical') + new_bboxes.scale(1 / scale_factor) + + return new_bboxes + + +def bbox3d2roi(bbox_list): + """Convert a list of bounding boxes to roi format. + + Args: + bbox_list (list[torch.Tensor]): A list of bounding boxes + corresponding to a batch of images. + + Returns: + torch.Tensor: Region of interests in shape (n, c), where + the channels are in order of [batch_ind, x, y ...]. 
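+
+    A minimal sketch (two random box tensors; only the concatenated shape is
+    checked):
+
+    Example:
+        >>> rois = bbox3d2roi([torch.rand(2, 7), torch.rand(3, 7)])
+        >>> rois.shape
+        torch.Size([5, 8])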
+ """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes], dim=-1) + else: + rois = torch.zeros_like(bboxes) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +# TODO delete this +def bbox3d2result(bboxes, scores, labels, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape (N, 5). + labels (torch.Tensor): Labels with shape (N, ). + scores (torch.Tensor): Scores with shape (N, ). + attrs (torch.Tensor, optional): Attributes with shape (N, ). + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict(bboxes_3d=bboxes.to('cpu'), + scores_3d=scores.cpu(), + labels_3d=labels.cpu()) + + if attrs is not None: + result_dict['attr_labels'] = attrs.cpu() + + return result_dict diff --git a/embodiedscan/structures/point_data.py b/embodiedscan/structures/point_data.py new file mode 100644 index 0000000..f12d4c8 --- /dev/null +++ b/embodiedscan/structures/point_data.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Sized +from typing import Union + +import numpy as np +import torch +from mmengine.structures import BaseDataElement + +IndexType = Union[str, slice, int, list, torch.LongTensor, + torch.cuda.LongTensor, torch.BoolTensor, + torch.cuda.BoolTensor, np.ndarray] + + +class PointData(BaseDataElement): + """Data structure for point-level annotations or predictions. + + All data items in ``data_fields`` of ``PointData`` meet the following + requirements: + + - They are all one dimension. + - They should have the same length. + + `PointData` is used to save point-level semantic and instance mask, + it also can save `instances_labels` and `instances_scores` temporarily. + In the future, we would consider to move the instance-level info into + `gt_instances_3d` and `pred_instances_3d`. + + Examples: + >>> metainfo = dict( + ... sample_idx=random.randint(0, 100)) + >>> points = np.random.randint(0, 255, (100, 3)) + >>> point_data = PointData(metainfo=metainfo, + ... points=points) + >>> print(len(point_data)) + 100 + + >>> # slice + >>> slice_data = point_data[10:60] + >>> assert len(slice_data) == 50 + + >>> # set + >>> point_data.pts_semantic_mask = torch.randint(0, 255, (100,)) + >>> point_data.pts_instance_mask = torch.randint(0, 255, (100,)) + >>> assert tuple(point_data.pts_semantic_mask.shape) == (100,) + >>> assert tuple(point_data.pts_instance_mask.shape) == (100,) + """ + + def __setattr__(self, name: str, value: Sized) -> None: + """setattr is only used to set data. + + The value must have the attribute of `__len__` and have the same length + of `PointData`. 
+ """ + if name in ('_metainfo_fields', '_data_fields'): + if not hasattr(self, name): + super().__setattr__(name, value) + else: + raise AttributeError(f'{name} has been used as a ' + 'private attribute, which is immutable.') + + else: + assert isinstance(value, + Sized), 'value must contain `__len__` attribute' + # TODO: make sure the input value share the same length + super().__setattr__(name, value) + + __setitem__ = __setattr__ + + def __getitem__(self, item: IndexType) -> 'PointData': + """ + Args: + item (str, int, list, :obj:`slice`, :obj:`numpy.ndarray`, + :obj:`torch.LongTensor`, :obj:`torch.BoolTensor`): + Get the corresponding values according to item. + + Returns: + :obj:`PointData`: Corresponding values. + """ + if isinstance(item, list): + item = np.array(item) + if isinstance(item, np.ndarray): + # The default int type of numpy is platform dependent, int32 for + # windows and int64 for linux. `torch.Tensor` requires the index + # should be int64, therefore we simply convert it to int64 here. + # Mode details in https://github.com/numpy/numpy/issues/9464 + item = item.astype(np.int64) if item.dtype == np.int32 else item + item = torch.from_numpy(item) + assert isinstance( + item, (str, slice, int, torch.LongTensor, torch.cuda.LongTensor, + torch.BoolTensor, torch.cuda.BoolTensor)) + + if isinstance(item, str): + return getattr(self, item) + + if isinstance(item, int): + if item >= len(self) or item < -len(self): # type: ignore + raise IndexError(f'Index {item} out of range!') + else: + # keep the dimension + item = slice(item, None, len(self)) + + new_data = self.__class__(metainfo=self.metainfo) + if isinstance(item, torch.Tensor): + assert item.dim() == 1, 'Only support to get the' \ + ' values along the first dimension.' + if isinstance(item, (torch.BoolTensor, torch.cuda.BoolTensor)): + assert len(item) == len(self), 'The shape of the ' \ + 'input(BoolTensor) ' \ + f'{len(item)} ' \ + 'does not match the shape ' \ + 'of the indexed tensor ' \ + 'in results_field ' \ + f'{len(self)} at ' \ + 'first dimension.' 
+ + for k, v in self.items(): + if isinstance(v, torch.Tensor): + new_data[k] = v[item] + elif isinstance(v, np.ndarray): + new_data[k] = v[item.cpu().numpy()] + elif isinstance( + v, (str, list, tuple)) or (hasattr(v, '__getitem__') + and hasattr(v, 'cat')): + # convert to indexes from BoolTensor + if isinstance(item, + (torch.BoolTensor, torch.cuda.BoolTensor)): + indexes = torch.nonzero(item).view( + -1).cpu().numpy().tolist() + else: + indexes = item.cpu().numpy().tolist() + slice_list = [] + if indexes: + for index in indexes: + slice_list.append(slice(index, None, len(v))) + else: + slice_list.append(slice(None, 0, None)) + r_list = [v[s] for s in slice_list] + if isinstance(v, (str, list, tuple)): + new_value = r_list[0] + for r in r_list[1:]: + new_value = new_value + r + else: + new_value = v.cat(r_list) + new_data[k] = new_value + else: + raise ValueError( + f'The type of `{k}` is `{type(v)}`, which has no ' + 'attribute of `cat`, so it does not ' + 'support slice with `bool`') + else: + # item is a slice + for k, v in self.items(): + new_data[k] = v[item] + return new_data # type: ignore + + def __len__(self) -> int: + """int: The length of `PointData`.""" + if len(self._data_fields) > 0: + return len(self.values()[0]) + else: + return 0 diff --git a/embodiedscan/structures/points/__init__.py b/embodiedscan/structures/points/__init__.py new file mode 100644 index 0000000..eedae14 --- /dev/null +++ b/embodiedscan/structures/points/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints +from .cam_points import CameraPoints +from .depth_points import DepthPoints +from .lidar_points import LiDARPoints + +__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints'] + + +def get_points_type(points_type: str) -> type: + """Get the class of points according to coordinate type. + + Args: + points_type (str): The type of points coordinate. The valid value are + "CAMERA", "LIDAR" and "DEPTH". + + Returns: + type: Points type. + """ + points_type_upper = points_type.upper() + if points_type_upper == 'CAMERA': + points_cls = CameraPoints + elif points_type_upper == 'LIDAR': + points_cls = LiDARPoints + elif points_type_upper == 'DEPTH': + points_cls = DepthPoints + else: + raise ValueError('Only "points_type" of "CAMERA", "LIDAR" and "DEPTH" ' + f'are supported, got {points_type}') + + return points_cls diff --git a/embodiedscan/structures/points/base_points.py b/embodiedscan/structures/points/base_points.py new file mode 100644 index 0000000..1fa2282 --- /dev/null +++ b/embodiedscan/structures/points/base_points.py @@ -0,0 +1,521 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import abstractmethod +from typing import Iterator, Optional, Sequence, Union + +import numpy as np +import torch +from torch import Tensor + +from ..bbox_3d.utils import rotation_3d_in_axis, rotation_3d_in_euler + + +class BasePoints: + """Base class for Points. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). 
+ attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + if isinstance(tensor, Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does + # not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((-1, points_dim)) + assert tensor.dim() == 2 and tensor.size(-1) == points_dim, \ + ('The points dimension must be 2 and the length of the last ' + f'dimension must be {points_dim}, but got points with shape ' + f'{tensor.shape}.') + + self.tensor = tensor.clone() + self.points_dim = points_dim + self.attribute_dims = attribute_dims + self.rotation_axis = 0 + + @property + def coord(self) -> Tensor: + """Tensor: Coordinates of each point in shape (N, 3).""" + return self.tensor[:, :3] + + @coord.setter + def coord(self, tensor: Union[Tensor, np.ndarray]) -> None: + """Set the coordinates of each point. + + Args: + tensor (Tensor or np.ndarray): Coordinates of each point with shape + (N, 3). + """ + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, Tensor): + tensor = self.tensor.new_tensor(tensor) + self.tensor[:, :3] = tensor + + @property + def height(self) -> Union[Tensor, None]: + """Tensor or None: Returns a vector with height of each point in shape + (N, ).""" + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['height']] + else: + return None + + @height.setter + def height(self, tensor: Union[Tensor, np.ndarray]) -> None: + """Set the height of each point. + + Args: + tensor (Tensor or np.ndarray): Height of each point with shape + (N, ). + """ + try: + tensor = tensor.reshape(self.shape[0]) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['height']] = tensor + else: + # add height attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) + self.attribute_dims.update(dict(height=attr_dim)) + self.points_dim += 1 + + @property + def color(self) -> Union[Tensor, None]: + """Tensor or None: Returns a vector with color of each point in shape + (N, 3).""" + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['color']] + else: + return None + + @color.setter + def color(self, tensor: Union[Tensor, np.ndarray]) -> None: + """Set the color of each point. + + Args: + tensor (Tensor or np.ndarray): Color of each point with shape + (N, 3). 
+ """ + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if tensor.max() >= 256 or tensor.min() < 0: + warnings.warn('point got color value beyond [0, 255]') + if not isinstance(tensor, Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['color']] = tensor + else: + # add color attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor], dim=1) + self.attribute_dims.update( + dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) + self.points_dim += 3 + + @property + def shape(self) -> torch.Size: + """torch.Size: Shape of points.""" + return self.tensor.shape + + def shuffle(self) -> Tensor: + """Shuffle the points. + + Returns: + Tensor: The shuffled index. + """ + idx = torch.randperm(self.__len__(), device=self.tensor.device) + self.tensor = self.tensor[idx] + return idx + + def rotate(self, + rotation: Union[Tensor, np.ndarray, float], + axis: Optional[int] = None) -> Tensor: + """Rotate points with the given rotation matrix or angle. + + Args: + rotation (Tensor or np.ndarray or float): Rotation matrix or angle. + axis (int, optional): Axis to rotate at. Defaults to None. + + Returns: + Tensor: Rotation matrix. + """ + if not isinstance(rotation, Tensor): + rotation = self.tensor.new_tensor(rotation) + assert rotation.shape == torch.Size([3, 3]) or rotation.numel() == 1, \ + f'invalid rotation shape {rotation.shape}' + + if axis is None: + axis = self.rotation_axis + + if rotation.numel() == 1: + rotated_points, rot_mat_T = rotation_3d_in_axis( + self.tensor[:, :3][None], rotation, axis=axis, return_mat=True) + self.tensor[:, :3] = rotated_points.squeeze(0) + rot_mat_T = rot_mat_T.squeeze(0) + elif rotation.numel() == 3: + rotated_points, rot_mat_T = rotation_3d_in_euler( + self.tensor[:, :3][None], rotation, return_mat=True) + self.tensor[:, :3] = rotated_points.squeeze(0) + rot_mat_T = rot_mat_T.squeeze(0) + else: + # rotation.numel() == 9 + self.tensor[:, :3] = self.tensor[:, :3] @ rotation + rot_mat_T = rotation + + return rot_mat_T + + @abstractmethod + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. + """ + pass + + def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None: + """Translate points with the given translation vector. + + Args: + trans_vector (Tensor or np.ndarray): Translation vector of size 3 + or nx3. + """ + if not isinstance(trans_vector, Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + trans_vector = trans_vector.squeeze(0) + if trans_vector.dim() == 1: + assert trans_vector.shape[0] == 3 + elif trans_vector.dim() == 2: + assert trans_vector.shape[0] == self.tensor.shape[0] and \ + trans_vector.shape[1] == 3 + else: + raise NotImplementedError( + f'Unsupported translation vector of shape {trans_vector.shape}' + ) + self.tensor[:, :3] += trans_vector + + def in_range_3d( + self, point_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the points are in the given range. + + Args: + point_range (Tensor or np.ndarray or Sequence[float]): The range of + point (x_min, y_min, z_min, x_max, y_max, z_max). 
+ + Note: + In the original implementation of SECOND, checking whether a box in + the range checks whether the points are in a convex polygon, we try + to reduce the burden for simpler cases. + + Returns: + Tensor: A binary vector indicating whether each point is inside the + reference range. + """ + in_range_flags = ((self.tensor[:, 0] > point_range[0]) + & (self.tensor[:, 1] > point_range[1]) + & (self.tensor[:, 2] > point_range[2]) + & (self.tensor[:, 0] < point_range[3]) + & (self.tensor[:, 1] < point_range[4]) + & (self.tensor[:, 2] < point_range[5])) + return in_range_flags + + @property + def bev(self) -> Tensor: + """Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 1]] + + def in_range_bev( + self, point_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the points are in the given range. + + Args: + point_range (Tensor or np.ndarray or Sequence[float]): The range of + point in order of (x_min, y_min, x_max, y_max). + + Returns: + Tensor: A binary vector indicating whether each point is inside the + reference range. + """ + in_range_flags = ((self.bev[:, 0] > point_range[0]) + & (self.bev[:, 1] > point_range[1]) + & (self.bev[:, 0] < point_range[2]) + & (self.bev[:, 1] < point_range[3])) + return in_range_flags + + @abstractmethod + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. + """ + pass + + def scale(self, scale_factor: float) -> None: + """Scale the points with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the points. + """ + self.tensor[:, :3] *= scale_factor + + def __getitem__( + self, item: Union[int, tuple, slice, np.ndarray, + Tensor]) -> 'BasePoints': + """ + Args: + item (int or tuple or slice or np.ndarray or Tensor): Index of + points. + + Note: + The following usage are allowed: + + 1. `new_points = points[3]`: Return a `Points` that contains only + one point. + 2. `new_points = points[2:10]`: Return a slice of points. + 3. `new_points = points[vector]`: Whether vector is a + torch.BoolTensor with `length = len(points)`. Nonzero elements + in the vector will be selected. + 4. `new_points = points[3:11, vector]`: Return a slice of points + and attribute dims. + 5. `new_points = points[4:12, 2]`: Return a slice of points with + single attribute. + + Note that the returned Points might share storage with this Points, + subject to PyTorch's indexing semantics. + + Returns: + :obj:`BasePoints`: A new object of :class:`BasePoints` after + indexing. 
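+
+        A small sketch using the ``DepthPoints`` subclass added later in this
+        diff (random coordinates; only shapes are checked):
+
+        Example:
+            >>> points = DepthPoints(torch.rand(4, 3))
+            >>> points[1:3].shape
+            torch.Size([2, 3])
+            >>> points[0].shape
+            torch.Size([1, 3])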
+ """ + original_type = type(self) + if isinstance(item, int): + return original_type(self.tensor[item].view(1, -1), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + elif isinstance(item, tuple) and len(item) == 2: + if isinstance(item[1], slice): + start = 0 if item[1].start is None else item[1].start + stop = self.tensor.shape[1] \ + if item[1].stop is None else item[1].stop + step = 1 if item[1].step is None else item[1].step + item = list(item) + item[1] = list(range(start, stop, step)) + item = tuple(item) + elif isinstance(item[1], int): + item = list(item) + item[1] = [item[1]] + item = tuple(item) + p = self.tensor[item[0], item[1]] + + keep_dims = list( + set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) + if self.attribute_dims is not None: + attribute_dims = self.attribute_dims.copy() + for key in self.attribute_dims.keys(): + cur_attribute_dims = attribute_dims[key] + if isinstance(cur_attribute_dims, int): + cur_attribute_dims = [cur_attribute_dims] + intersect_attr = list( + set(cur_attribute_dims).intersection(set(keep_dims))) + if len(intersect_attr) == 1: + attribute_dims[key] = intersect_attr[0] + elif len(intersect_attr) > 1: + attribute_dims[key] = intersect_attr + else: + attribute_dims.pop(key) + else: + attribute_dims = None + elif isinstance(item, (slice, np.ndarray, Tensor)): + p = self.tensor[item] + attribute_dims = self.attribute_dims + else: + raise NotImplementedError(f'Invalid slice {item}!') + + assert p.dim() == 2, \ + f'Indexing on Points with {item} failed to return a matrix!' + return original_type(p, + points_dim=p.shape[1], + attribute_dims=attribute_dims) + + def __len__(self) -> int: + """int: Number of points in the current object.""" + return self.tensor.shape[0] + + def __repr__(self) -> str: + """str: Return a string that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, points_list: Sequence['BasePoints']) -> 'BasePoints': + """Concatenate a list of Points into a single Points. + + Args: + points_list (Sequence[:obj:`BasePoints`]): List of points. + + Returns: + :obj:`BasePoints`: The concatenated points. + """ + assert isinstance(points_list, (list, tuple)) + if len(points_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(points, cls) for points in points_list) + + # use torch.cat (v.s. layers.cat) + # so the returned points never share storage with input + cat_points = cls(torch.cat([p.tensor for p in points_list], dim=0), + points_dim=points_list[0].points_dim, + attribute_dims=points_list[0].attribute_dims) + return cat_points + + def numpy(self) -> np.ndarray: + """Reload ``numpy`` from self.tensor.""" + return self.tensor.numpy() + + def to(self, device: Union[str, torch.device], *args, + **kwargs) -> 'BasePoints': + """Convert current points to a specific device. + + Args: + device (str or :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BasePoints`: A new points object on the specific device. + """ + original_type = type(self) + return original_type(self.tensor.to(device, *args, **kwargs), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def cpu(self) -> 'BasePoints': + """Convert current points to cpu device. + + Returns: + :obj:`BasePoints`: A new points object on the cpu device. 
+ """ + original_type = type(self) + return original_type(self.tensor.cpu(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def cuda(self, *args, **kwargs) -> 'BasePoints': + """Convert current points to cuda device. + + Returns: + :obj:`BasePoints`: A new points object on the cuda device. + """ + original_type = type(self) + return original_type(self.tensor.cuda(*args, **kwargs), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def clone(self) -> 'BasePoints': + """Clone the points. + + Returns: + :obj:`BasePoints`: Point object with the same properties as self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def detach(self) -> 'BasePoints': + """Detach the points. + + Returns: + :obj:`BasePoints`: Point object with the same properties as self. + """ + original_type = type(self) + return original_type(self.tensor.detach(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + @property + def device(self) -> torch.device: + """torch.device: The device of the points are on.""" + return self.tensor.device + + def __iter__(self) -> Iterator[Tensor]: + """Yield a point as a Tensor at a time. + + Returns: + Iterator[Tensor]: A point of shape (points_dim, ). + """ + yield from self.tensor + + def new_point( + self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]] + ) -> 'BasePoints': + """Create a new point object with data. + + The new point and its tensor has the similar properties as self and + self.tensor, respectively. + + Args: + data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to + be copied. + + Returns: + :obj:`BasePoints`: A new point object with ``data``, the object's + other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, Tensor) else data.to(self.device) + original_type = type(self) + return original_type(new_tensor, + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) diff --git a/embodiedscan/structures/points/cam_points.py b/embodiedscan/structures/points/cam_points.py new file mode 100644 index 0000000..4a835a1 --- /dev/null +++ b/embodiedscan/structures/points/cam_points.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +from torch import Tensor + +from .base_points import BasePoints + + +class CameraPoints(BasePoints): + """Points of instances in CAM coordinates. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. 
+ """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + super(CameraPoints, self).__init__(tensor, + points_dim=points_dim, + attribute_dims=attribute_dims) + self.rotation_axis = 1 + + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 2] = -self.tensor[:, 2] + + @property + def bev(self) -> Tensor: + """Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 2]] + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. + """ + from mmdet3d.structures.bbox_3d import Coord3DMode + return Coord3DMode.convert_point(point=self, + src=Coord3DMode.CAM, + dst=dst, + rt_mat=rt_mat) diff --git a/embodiedscan/structures/points/depth_points.py b/embodiedscan/structures/points/depth_points.py new file mode 100644 index 0000000..c3ff712 --- /dev/null +++ b/embodiedscan/structures/points/depth_points.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +from torch import Tensor + +from .base_points import BasePoints + + +class DepthPoints(BasePoints): + """Points of instances in DEPTH coordinates. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + super(DepthPoints, self).__init__(tensor, + points_dim=points_dim, + attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. 
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 1] = -self.tensor[:, 1] + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. + """ + from mmdet3d.structures.bbox_3d import Coord3DMode + return Coord3DMode.convert_point(point=self, + src=Coord3DMode.DEPTH, + dst=dst, + rt_mat=rt_mat) diff --git a/embodiedscan/structures/points/lidar_points.py b/embodiedscan/structures/points/lidar_points.py new file mode 100644 index 0000000..71ecb49 --- /dev/null +++ b/embodiedscan/structures/points/lidar_points.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +from torch import Tensor + +from .base_points import BasePoints + + +class LiDARPoints(BasePoints): + """Points of instances in LIDAR coordinates. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + super(LiDARPoints, self).__init__(tensor, + points_dim=points_dim, + attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1] = -self.tensor[:, 1] + elif bev_direction == 'vertical': + self.tensor[:, 0] = -self.tensor[:, 0] + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. 
+ """ + from mmdet3d.structures.bbox_3d import Coord3DMode + return Coord3DMode.convert_point(point=self, + src=Coord3DMode.LIDAR, + dst=dst, + rt_mat=rt_mat) diff --git a/embodiedscan/utils/__init__.py b/embodiedscan/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/embodiedscan/utils/color_selector.py b/embodiedscan/utils/color_selector.py new file mode 100644 index 0000000..e38fda9 --- /dev/null +++ b/embodiedscan/utils/color_selector.py @@ -0,0 +1,913 @@ +import random + +COCO_COLOR = [ + { + 'color': [220, 20, 60], + 'isthing': 1, + 'id': 1, + 'name': 'person' + }, + { + 'color': [119, 11, 32], + 'isthing': 1, + 'id': 2, + 'name': 'bicycle' + }, + { + 'color': [0, 0, 142], + 'isthing': 1, + 'id': 3, + 'name': 'car' + }, + { + 'color': [0, 0, 230], + 'isthing': 1, + 'id': 4, + 'name': 'motorcycle' + }, + { + 'color': [106, 0, 228], + 'isthing': 1, + 'id': 5, + 'name': 'airplane' + }, + { + 'color': [0, 60, 100], + 'isthing': 1, + 'id': 6, + 'name': 'bus' + }, + { + 'color': [0, 80, 100], + 'isthing': 1, + 'id': 7, + 'name': 'train' + }, + { + 'color': [0, 0, 70], + 'isthing': 1, + 'id': 8, + 'name': 'truck' + }, + { + 'color': [0, 0, 192], + 'isthing': 1, + 'id': 9, + 'name': 'boat' + }, + { + 'color': [250, 170, 30], + 'isthing': 1, + 'id': 10, + 'name': 'traffic light' + }, + { + 'color': [100, 170, 30], + 'isthing': 1, + 'id': 11, + 'name': 'fire hydrant' + }, + { + 'color': [220, 220, 0], + 'isthing': 1, + 'id': 13, + 'name': 'stop sign' + }, + { + 'color': [175, 116, 175], + 'isthing': 1, + 'id': 14, + 'name': 'parking meter' + }, + { + 'color': [250, 0, 30], + 'isthing': 1, + 'id': 15, + 'name': 'bench' + }, + { + 'color': [165, 42, 42], + 'isthing': 1, + 'id': 16, + 'name': 'bird' + }, + { + 'color': [255, 77, 255], + 'isthing': 1, + 'id': 17, + 'name': 'cat' + }, + { + 'color': [0, 226, 252], + 'isthing': 1, + 'id': 18, + 'name': 'dog' + }, + { + 'color': [182, 182, 255], + 'isthing': 1, + 'id': 19, + 'name': 'horse' + }, + { + 'color': [0, 82, 0], + 'isthing': 1, + 'id': 20, + 'name': 'sheep' + }, + { + 'color': [120, 166, 157], + 'isthing': 1, + 'id': 21, + 'name': 'cow' + }, + { + 'color': [110, 76, 0], + 'isthing': 1, + 'id': 22, + 'name': 'elephant' + }, + { + 'color': [174, 57, 255], + 'isthing': 1, + 'id': 23, + 'name': 'bear' + }, + { + 'color': [199, 100, 0], + 'isthing': 1, + 'id': 24, + 'name': 'zebra' + }, + { + 'color': [72, 0, 118], + 'isthing': 1, + 'id': 25, + 'name': 'giraffe' + }, + { + 'color': [255, 179, 240], + 'isthing': 1, + 'id': 27, + 'name': 'backpack' + }, + { + 'color': [0, 125, 92], + 'isthing': 1, + 'id': 28, + 'name': 'umbrella' + }, + { + 'color': [209, 0, 151], + 'isthing': 1, + 'id': 31, + 'name': 'handbag' + }, + { + 'color': [188, 208, 182], + 'isthing': 1, + 'id': 32, + 'name': 'tie' + }, + { + 'color': [0, 220, 176], + 'isthing': 1, + 'id': 33, + 'name': 'suitcase' + }, + { + 'color': [255, 99, 164], + 'isthing': 1, + 'id': 34, + 'name': 'frisbee' + }, + { + 'color': [92, 0, 73], + 'isthing': 1, + 'id': 35, + 'name': 'skis' + }, + { + 'color': [133, 129, 255], + 'isthing': 1, + 'id': 36, + 'name': 'snowboard' + }, + { + 'color': [78, 180, 255], + 'isthing': 1, + 'id': 37, + 'name': 'sports ball' + }, + { + 'color': [0, 228, 0], + 'isthing': 1, + 'id': 38, + 'name': 'kite' + }, + { + 'color': [174, 255, 243], + 'isthing': 1, + 'id': 39, + 'name': 'baseball bat' + }, + { + 'color': [45, 89, 255], + 'isthing': 1, + 'id': 40, + 'name': 'baseball glove' + }, + { + 'color': [134, 134, 103], + 'isthing': 1, + 'id': 41, + 
'name': 'skateboard' + }, + { + 'color': [145, 148, 174], + 'isthing': 1, + 'id': 42, + 'name': 'surfboard' + }, + { + 'color': [255, 208, 186], + 'isthing': 1, + 'id': 43, + 'name': 'tennis racket' + }, + { + 'color': [197, 226, 255], + 'isthing': 1, + 'id': 44, + 'name': 'bottle' + }, + { + 'color': [171, 134, 1], + 'isthing': 1, + 'id': 46, + 'name': 'wine glass' + }, + { + 'color': [109, 63, 54], + 'isthing': 1, + 'id': 47, + 'name': 'cup' + }, + { + 'color': [207, 138, 255], + 'isthing': 1, + 'id': 48, + 'name': 'fork' + }, + { + 'color': [151, 0, 95], + 'isthing': 1, + 'id': 49, + 'name': 'knife' + }, + { + 'color': [9, 80, 61], + 'isthing': 1, + 'id': 50, + 'name': 'spoon' + }, + { + 'color': [84, 105, 51], + 'isthing': 1, + 'id': 51, + 'name': 'bowl' + }, + { + 'color': [74, 65, 105], + 'isthing': 1, + 'id': 52, + 'name': 'banana' + }, + { + 'color': [166, 196, 102], + 'isthing': 1, + 'id': 53, + 'name': 'apple' + }, + { + 'color': [208, 195, 210], + 'isthing': 1, + 'id': 54, + 'name': 'sandwich' + }, + { + 'color': [255, 109, 65], + 'isthing': 1, + 'id': 55, + 'name': 'orange' + }, + { + 'color': [0, 143, 149], + 'isthing': 1, + 'id': 56, + 'name': 'broccoli' + }, + { + 'color': [179, 0, 194], + 'isthing': 1, + 'id': 57, + 'name': 'carrot' + }, + { + 'color': [209, 99, 106], + 'isthing': 1, + 'id': 58, + 'name': 'hot dog' + }, + { + 'color': [5, 121, 0], + 'isthing': 1, + 'id': 59, + 'name': 'pizza' + }, + { + 'color': [227, 255, 205], + 'isthing': 1, + 'id': 60, + 'name': 'donut' + }, + { + 'color': [147, 186, 208], + 'isthing': 1, + 'id': 61, + 'name': 'cake' + }, + { + 'color': [153, 69, 1], + 'isthing': 1, + 'id': 62, + 'name': 'chair' + }, + { + 'color': [3, 95, 161], + 'isthing': 1, + 'id': 63, + 'name': 'couch' + }, + { + 'color': [163, 255, 0], + 'isthing': 1, + 'id': 64, + 'name': 'potted plant' + }, + { + 'color': [123, 104, 238], + 'isthing': 1, + 'id': 65, + 'name': 'bed' + }, + { + 'color': [255., 187., 120.], + 'isthing': 1, + 'id': 67, + 'name': 'dining table' + }, + { + 'color': [0, 165, 120], + 'isthing': 1, + 'id': 70, + 'name': 'toilet' + }, + { + 'color': [183, 130, 88], + 'isthing': 1, + 'id': 72, + 'name': 'tv' + }, + { + 'color': [95, 32, 0], + 'isthing': 1, + 'id': 73, + 'name': 'laptop' + }, + { + 'color': [130, 114, 135], + 'isthing': 1, + 'id': 74, + 'name': 'mouse' + }, + { + 'color': [110, 129, 133], + 'isthing': 1, + 'id': 75, + 'name': 'remote' + }, + { + 'color': [166, 74, 118], + 'isthing': 1, + 'id': 76, + 'name': 'keyboard' + }, + { + 'color': [219, 142, 185], + 'isthing': 1, + 'id': 77, + 'name': 'cell phone' + }, + { + 'color': [79, 210, 114], + 'isthing': 1, + 'id': 78, + 'name': 'microwave' + }, + { + 'color': [178, 90, 62], + 'isthing': 1, + 'id': 79, + 'name': 'oven' + }, + { + 'color': [65, 70, 15], + 'isthing': 1, + 'id': 80, + 'name': 'toaster' + }, + { + 'color': [127, 167, 115], + 'isthing': 1, + 'id': 81, + 'name': 'sink' + }, + { + 'color': [59, 105, 106], + 'isthing': 1, + 'id': 82, + 'name': 'refrigerator' + }, + { + 'color': [142, 108, 45], + 'isthing': 1, + 'id': 84, + 'name': 'book' + }, + { + 'color': [196, 172, 0], + 'isthing': 1, + 'id': 85, + 'name': 'clock' + }, + { + 'color': [95, 54, 80], + 'isthing': 1, + 'id': 86, + 'name': 'vase' + }, + { + 'color': [128, 76, 255], + 'isthing': 1, + 'id': 87, + 'name': 'scissors' + }, + { + 'color': [201, 57, 1], + 'isthing': 1, + 'id': 88, + 'name': 'teddy bear' + }, + { + 'color': [246, 0, 122], + 'isthing': 1, + 'id': 89, + 'name': 'hair drier' + }, + { + 'color': [191, 162, 208], + 
'isthing': 1, + 'id': 90, + 'name': 'toothbrush' + }, + { + 'color': [255, 255, 128], + 'isthing': 0, + 'id': 92, + 'name': 'banner' + }, + { + 'color': [147, 211, 203], + 'isthing': 0, + 'id': 93, + 'name': 'blanket' + }, + { + 'color': [150, 100, 100], + 'isthing': 0, + 'id': 95, + 'name': 'bridge' + }, + { + 'color': [168, 171, 172], + 'isthing': 0, + 'id': 100, + 'name': 'cardboard' + }, + { + 'color': [146, 112, 198], + 'isthing': 0, + 'id': 107, + 'name': 'counter' + }, + { + 'color': [210, 170, 100], + 'isthing': 0, + 'id': 109, + 'name': 'curtain' + }, + { + 'color': [92, 136, 89], + 'isthing': 0, + 'id': 112, + 'name': 'door-stuff' + }, + { + 'color': [255, 193, 193], + 'isthing': 0, + 'id': 118, + 'name': 'floor-wood' + }, + { + 'color': [241, 129, 0], + 'isthing': 0, + 'id': 119, + 'name': 'flower' + }, + { + 'color': [217, 17, 255], + 'isthing': 0, + 'id': 122, + 'name': 'fruit' + }, + { + 'color': [124, 74, 181], + 'isthing': 0, + 'id': 125, + 'name': 'gravel' + }, + { + 'color': [70, 70, 70], + 'isthing': 0, + 'id': 128, + 'name': 'house' + }, + { + 'color': [255, 228, 255], + 'isthing': 0, + 'id': 130, + 'name': 'light' + }, + { + 'color': [154, 208, 0], + 'isthing': 0, + 'id': 133, + 'name': 'mirror-stuff' + }, + { + 'color': [193, 0, 92], + 'isthing': 0, + 'id': 138, + 'name': 'net' + }, + { + 'color': [76, 91, 113], + 'isthing': 0, + 'id': 141, + 'name': 'pillow' + }, + { + 'color': [255, 180, 195], + 'isthing': 0, + 'id': 144, + 'name': 'platform' + }, + { + 'color': [106, 154, 176], + 'isthing': 0, + 'id': 145, + 'name': 'playingfield' + }, + { + 'color': [230, 150, 140], + 'isthing': 0, + 'id': 147, + 'name': 'railroad' + }, + { + 'color': [60, 143, 255], + 'isthing': 0, + 'id': 148, + 'name': 'river' + }, + { + 'color': [128, 64, 128], + 'isthing': 0, + 'id': 149, + 'name': 'road' + }, + { + 'color': [92, 82, 55], + 'isthing': 0, + 'id': 151, + 'name': 'roof' + }, + { + 'color': [254, 212, 124], + 'isthing': 0, + 'id': 154, + 'name': 'sand' + }, + { + 'color': [73, 77, 174], + 'isthing': 0, + 'id': 155, + 'name': 'sea' + }, + { + 'color': [255, 160, 98], + 'isthing': 0, + 'id': 156, + 'name': 'shelf' + }, + { + 'color': [255, 255, 255], + 'isthing': 0, + 'id': 159, + 'name': 'snow' + }, + { + 'color': [104, 84, 109], + 'isthing': 0, + 'id': 161, + 'name': 'stairs' + }, + { + 'color': [169, 164, 131], + 'isthing': 0, + 'id': 166, + 'name': 'tent' + }, + { + 'color': [225, 199, 255], + 'isthing': 0, + 'id': 168, + 'name': 'towel' + }, + { + 'color': [137, 54, 74], + 'isthing': 0, + 'id': 171, + 'name': 'wall-brick' + }, + { + 'color': [135, 158, 223], + 'isthing': 0, + 'id': 175, + 'name': 'wall-stone' + }, + { + 'color': [7, 246, 231], + 'isthing': 0, + 'id': 176, + 'name': 'wall-tile' + }, + { + 'color': [107, 255, 200], + 'isthing': 0, + 'id': 177, + 'name': 'wall-wood' + }, + { + 'color': [58, 41, 149], + 'isthing': 0, + 'id': 178, + 'name': 'water-other' + }, + { + 'color': [183, 121, 142], + 'isthing': 0, + 'id': 180, + 'name': 'window-blind' + }, + { + 'color': [255, 73, 97], + 'isthing': 0, + 'id': 181, + 'name': 'window-other' + }, + { + 'color': [107, 142, 35], + 'isthing': 0, + 'id': 184, + 'name': 'tree-merged' + }, + { + 'color': [190, 153, 153], + 'isthing': 0, + 'id': 185, + 'name': 'fence-merged' + }, + { + 'color': [146, 139, 141], + 'isthing': 0, + 'id': 186, + 'name': 'ceiling-merged' + }, + { + 'color': [70, 130, 180], + 'isthing': 0, + 'id': 187, + 'name': 'sky-other-merged' + }, + { + 'color': [134, 199, 156], + 'isthing': 0, + 'id': 188, + 'name': 
'cabinet-merged' + }, + { + 'color': [209, 226, 140], + 'isthing': 0, + 'id': 189, + 'name': 'table-merged' + }, + { + 'color': [96, 36, 108], + 'isthing': 0, + 'id': 190, + 'name': 'floor-other-merged' + }, + { + 'color': [96, 96, 96], + 'isthing': 0, + 'id': 191, + 'name': 'pavement-merged' + }, + { + 'color': [64, 170, 64], + 'isthing': 0, + 'id': 192, + 'name': 'mountain-merged' + }, + { + 'color': [152, 251, 152], + 'isthing': 0, + 'id': 193, + 'name': 'grass-merged' + }, + { + 'color': [208, 229, 228], + 'isthing': 0, + 'id': 194, + 'name': 'dirt-merged' + }, + { + 'color': [206, 186, 171], + 'isthing': 0, + 'id': 195, + 'name': 'paper-merged' + }, + { + 'color': [152, 161, 64], + 'isthing': 0, + 'id': 196, + 'name': 'food-other-merged' + }, + { + 'color': [116, 112, 0], + 'isthing': 0, + 'id': 197, + 'name': 'building-other-merged' + }, + { + 'color': [0, 114, 143], + 'isthing': 0, + 'id': 198, + 'name': 'rock-merged' + }, + { + 'color': [102, 102, 156], + 'isthing': 0, + 'id': 199, + 'name': 'wall-other-merged' + }, + { + 'color': [250, 141, 255], + 'isthing': 0, + 'id': 200, + 'name': 'rug-merged' + }, +] + +EMBODIED_CATE = [ + 'adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', 'backpack', + 'bag', 'balcony', 'ball', 'banister', 'bar', 'barricade', 'baseboard', + 'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', 'bench', 'bicycle', + 'bidet', 'bin', 'blackboard', 'blanket', 'blinds', 'board', 'body loofah', + 'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 'broom', 'brush', + 'bucket', 'cabinet', 'calendar', 'camera', 'can', 'candle', 'candlestick', + 'cap', 'car', 'carpet', 'cart', 'case', 'chair', 'chandelier', 'cleanser', + 'clock', 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'coil', + 'column', 'commode', 'computer', 'conducting wire', 'container', 'control', + 'copier', 'cosmetics', 'couch', 'counter', 'countertop', 'crate', 'crib', + 'cube', 'cup', 'curtain', 'cushion', 'decoration', 'desk', 'detergent', + 'device', 'dish rack', 'dishwasher', 'dispenser', 'divider', 'door', + 'door knob', 'doorframe', 'doorway', 'drawer', 'dress', 'dresser', 'drum', + 'duct', 'dumbbell', 'dustpan', 'dvd', 'eraser', 'excercise equipment', + 'fan', 'faucet', 'fence', 'file', 'fire extinguisher', 'fireplace', + 'flowerpot', 'flush', 'folder', 'food', 'footstool', 'frame', 'fruit', + 'furniture', 'garage door', 'garbage', 'glass', 'globe', 'glove', + 'grab bar', 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 'hanger', + 'hat', 'headboard', 'headphones', 'heater', 'helmets', 'holder', 'hook', + 'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle', + 'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', 'ladder', + 'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', 'machine', + 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu', + 'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', 'napkins', + 'notebook', 'ottoman', 'oven', 'pack', 'package', 'pad', 'pan', 'panel', + 'paper', 'paper cutter', 'partition', 'pedestal', 'pen', 'person', 'piano', + 'picture', 'pillar', 'pillow', 'pipe', 'pitcher', 'plant', 'plate', + 'player', 'plug', 'plunger', 'pool', 'pool table', 'poster', 'pot', + 'price tag', 'printer', 'projector', 'purse', 'rack', 'radiator', 'radio', + 'rail', 'range hood', 'refrigerator', 'remote control', 'ridge', 'rod', + 'roll', 'roof', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', + 'seasoning', 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel', + 'shower', 'sign', 'sink', 'soap', 'soap 
dish', 'soap dispenser', 'socket', + 'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', 'stapler', + 'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', 'structure', + 'sunglasses', 'support', 'switch', 'table', 'tablet', 'teapot', + 'telephone', 'thermostat', 'tissue', 'tissue box', 'toaster', 'toilet', + 'toilet paper', 'toiletry', 'tool', 'toothbrush', 'toothpaste', 'towel', + 'toy', 'tray', 'treadmill', 'trophy', 'tube', 'tv', 'umbrella', 'urn', + 'utensil', 'vacuum cleaner', 'vanity', 'vase', 'vent', 'ventilation', + 'wardrobe', 'washbasin', 'washing machine', 'water cooler', 'water heater', + 'window', 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap' +] + + +class ColorMap(object): + + def __init__(self, + classes=EMBODIED_CATE, + init_file='./utils/full_color_map.txt'): + self.color_map = dict() + if init_file is not None: + with open(init_file, 'r') as f: + pre_data = f.readlines() + for ins in pre_data: + s = ins.strip() + cate = s.split('[')[0].strip() + color = eval(s[len(cate):]) + self.color_map[cate] = color + + self.classes = classes + self.color_pool = COCO_COLOR + + for label in classes: + if label not in self.color_map: + x = random.choice(self.color_pool) + self.color_map[label] = x['color'] + + self.inv_color_map = dict() + for key, value in self.color_map.items(): + color_idx = value[0] * 256 * 256 + value[1] * 256 + value[2] + if color_idx in self.inv_color_map: + self.inv_color_map[color_idx].append(key) + else: + self.inv_color_map[color_idx] = [key] + + self.visible_label = set() + + def save(self, out_file): + with open(out_file, 'w') as f: + for key, value in self.color_map.items(): + print(key, value, file=f) + + def get_color(self, label): + color = self.color_map[label] + if label in self.visible_label: + return color + color_idx = color[0] * 256 * 256 + color[1] * 256 + color[2] + bo = False + for value in self.inv_color_map[color_idx]: + if value in self.visible_label: + if not bo: + print('same color: ', end='') + bo = True + print(value, ' ', end='') + + if bo: + print(label) + + self.visible_label.add(label) + return color + + def clear_stat(self): + self.visible_label.clear() + + +if __name__ == '__main__': + a = ColorMap(init_file='occ_color_map.txt') + print(a.get_color('bed')) diff --git a/embodiedscan/utils/full_color_map.txt b/embodiedscan/utils/full_color_map.txt new file mode 100644 index 0000000..0c62ffe --- /dev/null +++ b/embodiedscan/utils/full_color_map.txt @@ -0,0 +1,287 @@ +floor [255, 193, 193] +wall [137, 54, 74] +chair [153, 69, 1] +cabinet [134, 199, 156] +door [92, 136, 89] +table [255.0, 187.0, 120.0] +couch [3, 95, 161] +shelf [255, 160, 98] +window [183, 121, 142] +bed [123, 104, 238] +curtain [210, 170, 100] +plant [163, 255, 0] +stairs [104, 84, 109] +pillow [76, 91, 113] +counter [146, 112, 198] +bench [250, 0, 30] +rail [230, 150, 140] +sink [135, 206, 250] +mirror [154, 208, 0] +toilet [0, 165, 120] +refrigerator [59, 105, 106] +book [142, 108, 45] +tv [183, 130, 88] +blanket [147, 211, 203] +rack [255, 208, 186] +towel [225, 199, 255] +backpack [255, 179, 240] +roof [92, 82, 55] +bag [209, 0, 151] +board [133, 129, 255] +bicycle [119, 11, 32] +oven [178, 90, 62] +microwave [79, 210, 114] +desk [109, 63, 54] +doorframe [199, 100, 0] +wardrobe [7, 246, 231] +picture [171, 134, 1] +bathtub [92, 0, 73] +box [188, 208, 182] +stand [146, 139, 141] +clothes [96, 96, 96] +lamp [107, 255, 200] +dresser [206, 186, 171] +stool [73, 77, 174] +fireplace [255, 77, 255] +commode [102, 102, 156] +washing machine [152, 
251, 152] +monitor [208, 195, 210] +window frame [227, 255, 205] +radiator [191, 162, 208] +mat [250, 141, 255] +shower [154, 255, 154] +ottoman [95, 32, 0] +column [60, 143, 255] +blinds [134, 134, 103] +stove [128, 64, 128] +bar [72, 0, 118] +pillar [220, 20, 60] +bin [187, 255, 255] +heater [209, 226, 140] +clothes dryer [100, 170, 30] +blackboard [0, 82, 0] +decoration [107, 142, 35] +steps [120, 166, 157] +windowsill [9, 80, 61] +cushion [0, 228, 0] +carpet [175, 116, 175] +copier [241, 129, 0] +countertop [207, 138, 255] +basket [0, 0, 70] +mailbox [150, 100, 100] +kitchen island [220, 220, 0] +washbasin [0, 80, 100] +drawer [0, 220, 176] +piano [78, 180, 255] +exercise equipment [151, 0, 95] +beam [255, 255, 128] +partition [168, 171, 172] +printer [179, 0, 194] +frame [255, 180, 195] +object [0, 0, 0] +adhesive tape [0, 220, 176] +air conditioner [109, 63, 54] +alarm [0, 114, 143] +album [147, 186, 208] +arch [135, 158, 223] +balcony [70, 70, 70] +ball [96, 96, 96] +banister [196, 172, 0] +barricade [45, 89, 255] +baseboard [153, 69, 1] +basin [255.0, 187.0, 120.0] +beanbag [190, 153, 153] +bidet [123, 104, 238] +body loofah [196, 172, 0] +boots [134, 199, 156] +bottle [241, 129, 0] +bowl [92, 136, 89] +bread [119, 11, 32] +broom [0, 226, 252] +brush [255, 255, 128] +bucket [255, 73, 97] +calendar [76, 91, 113] +camera [72, 0, 118] +can [109, 63, 54] +candle [78, 180, 255] +candlestick [104, 84, 109] +cap [128, 76, 255] +car [107, 142, 35] +cart [255, 255, 128] +case [0, 0, 230] +chandelier [169, 164, 131] +cleanser [0, 165, 120] +clock [190, 153, 153] +coat hanger [179, 0, 194] +coffee maker [0, 82, 0] +coil [255, 179, 240] +computer [225, 199, 255] +conducting wire [150, 100, 100] +container [0, 0, 70] +control [255, 77, 255] +cosmetics [142, 108, 45] +crate [0, 226, 252] +crib [169, 164, 131] +cube [116, 112, 0] +cup [175, 116, 175] +detergent [255, 208, 186] +device [146, 139, 141] +dish rack [0, 0, 142] +dishwasher [92, 82, 55] +dispenser [95, 32, 0] +divider [219, 142, 185] +door knob [166, 74, 118] +doorway [134, 134, 103] +dress [0, 114, 143] +drum [107, 142, 35] +duct [0, 80, 100] +dumbbell [0, 0, 192] +dustpan [78, 180, 255] +dvd [0, 143, 149] +eraser [0, 82, 0] +fan [0, 0, 70] +faucet [84, 105, 51] +fence [190, 153, 153] +file [255, 228, 255] +fire extinguisher [107, 255, 200] +flowerpot [9, 80, 61] +flush [227, 255, 205] +folder [208, 229, 228] +food [109, 63, 54] +footstool [133, 129, 255] +fruit [179, 0, 194] +furniture [220, 20, 60] +garage door [217, 17, 255] +garbage [0, 82, 0] +glass [255, 99, 164] +globe [255, 77, 255] +glove [166, 196, 102] +grab bar [145, 148, 174] +grass [0, 60, 100] +guitar [73, 77, 174] +hair dryer [169, 164, 131] +hamper [241, 129, 0] +handle [142, 108, 45] +hanger [150, 100, 100] +hat [154, 208, 0] +headboard [171, 134, 1] +headphones [124, 74, 181] +helmets [209, 226, 140] +holder [151, 0, 95] +hook [92, 136, 89] +humidifier [209, 99, 106] +ironware [127, 167, 115] +jacket [255, 73, 97] +jalousie [255, 179, 240] +jar [106, 154, 176] +kettle [196, 172, 0] +keyboard [0, 125, 92] +kitchenware [74, 65, 105] +knife [70, 130, 180] +label [0, 228, 0] +ladder [0, 114, 143] +laptop [255, 180, 195] +ledge [58, 41, 149] +letter [0, 0, 192] +light [78, 180, 255] +luggage [0, 226, 252] +machine [197, 226, 255] +magazine [199, 100, 0] +map [183, 121, 142] +mask [74, 65, 105] +mattress [255, 179, 240] +menu [255, 255, 128] +molding [104, 84, 109] +mop [199, 100, 0] +mouse [5, 121, 0] +napkins [165, 42, 42] +notebook [175, 116, 175] +pack [0, 143, 149] 
+package [166, 196, 102] +pad [208, 229, 228] +pan [209, 99, 106] +panel [201, 57, 1] +paper [255, 179, 240] +paper cutter [207, 138, 255] +pedestal [64, 170, 64] +pen [193, 0, 92] +person [7, 246, 231] +pipe [255, 180, 195] +pitcher [220, 20, 60] +plate [142, 108, 45] +player [0, 143, 149] +plug [255, 77, 255] +plunger [165, 42, 42] +pool [153, 69, 1] +pool table [0, 0, 230] +poster [130, 114, 135] +pot [96, 36, 108] +price tag [255, 77, 255] +projector [179, 0, 194] +purse [0, 228, 0] +radio [116, 112, 0] +range hood [199, 100, 0] +remote control [188, 208, 182] +ridge [59, 105, 106] +rod [207, 138, 255] +roll [123, 104, 238] +rope [110, 76, 0] +sack [190, 153, 153] +salt [250, 0, 30] +scale [58, 41, 149] +scissors [60, 143, 255] +screen [0, 82, 0] +seasoning [254, 212, 124] +shampoo [70, 130, 180] +sheet [151, 0, 95] +shirt [190, 153, 153] +shoe [199, 100, 0] +shovel [241, 129, 0] +sign [208, 195, 210] +soap [109, 63, 54] +soap dish [166, 74, 118] +soap dispenser [95, 32, 0] +socket [255, 255, 255] +speaker [65, 70, 15] +sponge [0, 220, 176] +spoon [134, 134, 103] +stall [0, 60, 100] +stapler [246, 0, 122] +statue [196, 172, 0] +stick [0, 165, 120] +stopcock [0, 60, 100] +structure [220, 20, 60] +sunglasses [142, 108, 45] +support [209, 226, 140] +switch [7, 246, 231] +tablet [137, 54, 74] +teapot [0, 80, 100] +telephone [220, 220, 0] +thermostat [128, 76, 255] +tissue [73, 77, 174] +tissue box [96, 96, 96] +toaster [106, 0, 228] +toilet paper [84, 105, 51] +toiletry [128, 64, 128] +tool [220, 20, 60] +toothbrush [130, 114, 135] +toothpaste [0, 143, 149] +toy [255.0, 187.0, 120.0] +tray [255, 179, 240] +treadmill [166, 74, 118] +trophy [0, 220, 176] +tube [255, 255, 128] +umbrella [250, 0, 30] +urn [152, 251, 152] +utensil [220, 220, 0] +vacuum cleaner [96, 36, 108] +vanity [5, 121, 0] +vase [255, 193, 193] +vent [209, 226, 140] +ventilation [123, 104, 238] +water cooler [255, 255, 128] +water heater [145, 148, 174] +wine [220, 220, 0] +wire [96, 36, 108] +wood [127, 167, 115] +wrap [175, 116, 175] diff --git a/embodiedscan/utils/img_drawer.py b/embodiedscan/utils/img_drawer.py new file mode 100644 index 0000000..94ffe4a --- /dev/null +++ b/embodiedscan/utils/img_drawer.py @@ -0,0 +1,127 @@ +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import open3d as o3d +from matplotlib import path + + +class ImageDrawer: + + def __init__(self, image, verbose=False): + self.verbose = verbose + if self.verbose: + print('Loading image', image) + img = cv2.imread(image) + if self.verbose: + print('Loading image Complete') + img = img[:, :, ::-1].astype(np.float32) # BGR to RGB + self.occupied = np.zeros((img.shape[0], img.shape[1]), dtype=bool) + self.img = img + self.EPS = 1e-4 + self.ALPHA = 0.75 + + def draw_text(self, + text, + font=cv2.FONT_HERSHEY_SIMPLEX, + pos=(0, 0), + size=(0, 0), + font_scale=1, + font_thickness=2, + text_color=(0, 255, 0), + text_color_bg=(0, 0, 0)): + + x, y = pos + w, h = size + text_size, _ = cv2.getTextSize(text, font, font_scale, font_thickness) + text_w, text_h = text_size + if y * 2 > h: + dy = -10 + else: + dy = 10 + + try: + while self.occupied[y, x] or self.occupied[ + y, x + + text_w] or self.occupied[y + text_h, + x] or self.occupied[y + text_h, + x + text_w]: + y += dy + except: # noqa: E722 + pass + # TODO + cv2.rectangle(self.img, (x, y), (x + text_w, y + text_h), + text_color_bg, -1) + cv2.putText(self.img, text, (x, y + text_h + font_scale - 1), font, + font_scale, text_color, font_thickness) + + self.occupied[y:y + text_h, x:x + text_w] 
= True
+
+    def draw_box3d(self, box, color, label, extrinsic, intrinsic):
+        """Draw a 3D box projected onto the image.
+
+        Args:
+            box (:obj:`open3d.geometry.OrientedBoundingBox`): Box to draw,
+                given in world coordinates.
+            color (Sequence[int]): RGB color with values in [0, 255].
+            label (str): Category label drawn next to the box.
+            extrinsic (np.ndarray): 4x4 camera-to-world transformation.
+            intrinsic (np.ndarray): 4x4 camera intrinsic matrix.
+        """
+        extrinsic_w2c = np.linalg.inv(extrinsic)
+        h, w, _ = self.img.shape
+        x, y = np.meshgrid(np.arange(w), np.arange(h))
+        x, y = x.flatten(), y.flatten()
+        pixel_points = np.vstack((x, y)).T
+
+        # Skip boxes that contain the camera center.
+        camera_pos_in_world = (
+            extrinsic @ np.array([0, 0, 0, 1]).reshape(4, 1)).transpose()
+        if self._inside_box(box, camera_pos_in_world):
+            return
+
+        # Project the 8 box corners into the image plane.
+        corners = np.asarray(box.get_box_points())
+        corners = corners[[0, 1, 7, 2, 3, 6, 4, 5]]
+        corners = np.concatenate(
+            [corners, np.ones((corners.shape[0], 1))], axis=1)
+        corners_img = intrinsic @ extrinsic_w2c @ corners.transpose()
+        corners_img = corners_img.transpose()
+        corners_pixel = np.zeros((corners_img.shape[0], 2))
+        for i in range(corners_img.shape[0]):
+            corners_pixel[i] = corners_img[i][:2] / np.abs(corners_img[i][2])
+        lines = [[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7],
+                 [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]]
+        faces = [[0, 1, 2, 3], [4, 5, 6, 7], [0, 1, 5, 4], [3, 2, 6, 7],
+                 [0, 3, 7, 4], [1, 2, 6, 5]]
+        # Draw the wireframe, skipping edges with a corner behind the camera.
+        for line in lines:
+            if (corners_img[line][:, 2] < self.EPS).any():
+                continue
+            px = corners_pixel[line[0]].astype(np.int32)
+            py = corners_pixel[line[1]].astype(np.int32)
+            cv2.line(self.img, (px[0], px[1]), (py[0], py[1]), color, 2)
+
+        # Alpha-blend the visible box faces onto the image.
+        all_mask = np.zeros((h, w), dtype=bool)
+        for face in faces:
+            if (corners_img[face][:, 2] < self.EPS).any():
+                continue
+            pts = corners_pixel[face]
+            p = path.Path(pts[:, :2])
+            mask = p.contains_points(pixel_points).reshape((h, w))
+            all_mask = np.logical_or(all_mask, mask)
+        self.img[all_mask] = self.img[all_mask] * self.ALPHA + (
+            1 - self.ALPHA) * np.array(color)
+
+        if all_mask.any():
+            textpos = np.min(corners_pixel, axis=0).astype(np.int32)
+            textpos[0] = np.clip(textpos[0], a_min=0, a_max=w)
+            textpos[1] = np.clip(textpos[1], a_min=0, a_max=h)
+            self.draw_text(label,
+                           pos=textpos,
+                           size=(w, h),
+                           text_color=(255, 255, 255),
+                           text_color_bg=color)
+
+    def show(self):
+        plt.imshow(self.img / 255.0)
+        plt.show()
+
+    @staticmethod
+    def _inside_box(box, point):
+        point_vec = o3d.utility.Vector3dVector(point[:, :3])
+        inside_idx = box.get_point_indices_within_bounding_box(point_vec)
+        if len(inside_idx) > 0:
+            return True
+        return False
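
For reference, a minimal usage sketch of the visualization utilities added above (ColorMap and ImageDrawer). The image path, box geometry and camera matrices below are placeholders, and the imports assume the package is importable as `embodiedscan`; this illustrates the intended call pattern and is not part of the patch itself.

import numpy as np
import open3d as o3d

from embodiedscan.utils.color_selector import ColorMap
from embodiedscan.utils.img_drawer import ImageDrawer

# Color lookup for EmbodiedScan categories (path relative to the repo root).
color_map = ColorMap(init_file='embodiedscan/utils/full_color_map.txt')

# A hypothetical 3D box in world coordinates: center, rotation, extent.
box = o3d.geometry.OrientedBoundingBox(
    np.array([0.5, 0.0, 3.0]),  # center
    np.eye(3),                  # rotation
    np.array([1.0, 1.0, 1.0]))  # extent (dx, dy, dz)

# 4x4 camera-to-world extrinsic and 4x4 intrinsic, as draw_box3d expects.
extrinsic = np.eye(4)
intrinsic = np.eye(4)
intrinsic[:3, :3] = np.array([[577.6, 0.0, 319.5],
                              [0.0, 577.6, 239.5],
                              [0.0, 0.0, 1.0]])

drawer = ImageDrawer('demo/00000.jpg', verbose=True)  # placeholder image path
drawer.draw_box3d(box,
                  color=color_map.get_color('bed'),
                  label='bed',
                  extrinsic=extrinsic,
                  intrinsic=intrinsic)
drawer.show()

Note that ImageDrawer converts the loaded image from BGR to RGB and displays it with matplotlib, so the RGB triples stored in the color map can be passed straight through without channel reordering.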