diff --git a/embodiedscan/converter/generate_image_3rscan.py b/embodiedscan/converter/generate_image_3rscan.py new file mode 100644 index 0000000..826b0c8 --- /dev/null +++ b/embodiedscan/converter/generate_image_3rscan.py @@ -0,0 +1,27 @@ +import os +import zipfile +from argparse import ArgumentParser +from functools import partial + +import mmengine + + +def process_scene(path, scene_name): + """Process single 3Rscan scene.""" + with zipfile.ZipFile(os.path.join(path, scene_name, 'sequence.zip'), + 'r') as zip_ref: + zip_ref.extractall(os.path.join(path, scene_name, 'sequence')) + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--dataset_folder', + required=True, + help='folder of the dataset.') + parser.add_argument('--nproc', type=int, default=8) + args = parser.parse_args() + + mmengine.track_parallel_progress(func=partial(process_scene, + args.dataset_folder), + tasks=os.listdir(args.dataset_folder), + nproc=args.nproc) diff --git a/embodiedscan/converter/generate_image_scannet.py b/embodiedscan/converter/generate_image_scannet.py new file mode 100644 index 0000000..77c52b2 --- /dev/null +++ b/embodiedscan/converter/generate_image_scannet.py @@ -0,0 +1,189 @@ +# Modified from https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py # noqa +import os +import struct +import zlib +from argparse import ArgumentParser +from functools import partial + +import imageio +import mmengine +import numpy as np + +COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'} + +COMPRESSION_TYPE_DEPTH = { + -1: 'unknown', + 0: 'raw_ushort', + 1: 'zlib_ushort', + 2: 'occi_ushort' +} + + +class RGBDFrame: + """Class for single ScanNet RGB-D image processing.""" + + def load(self, file_handle): + self.camera_to_world = np.asarray(struct.unpack( + 'f' * 16, file_handle.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0] + self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0] + self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.color_data = b''.join( + struct.unpack('c' * self.color_size_bytes, + file_handle.read(self.color_size_bytes))) + self.depth_data = b''.join( + struct.unpack('c' * self.depth_size_bytes, + file_handle.read(self.depth_size_bytes))) + + def decompress_depth(self, compression_type): + assert compression_type == 'zlib_ushort' + return zlib.decompress(self.depth_data) + + def decompress_color(self, compression_type): + assert compression_type == 'jpeg' + return imageio.imread(self.color_data) + + +class SensorData: + """Class for single ScanNet scene processing. + + Single scene file contains multiple RGB-D images. 
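+
+    Example (illustrative; the scene path below is hypothetical):
+        >>> data = SensorData('scans/scene0000_00/scene0000_00.sens', fast=True)
+        >>> data.export_color_images('posed_images/scene0000_00')
+        >>> data.export_poses('posed_images/scene0000_00')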
+ """ + + def __init__(self, filename, fast=False): + self.version = 4 + self.load(filename, fast) + + def load(self, filename, fast): + with open(filename, 'rb') as f: + version = struct.unpack('I', f.read(4))[0] + assert self.version == version + strlen = struct.unpack('Q', f.read(8))[0] + self.sensor_name = b''.join( + struct.unpack('c' * strlen, f.read(strlen))) + self.intrinsic_color = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.extrinsic_color = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.intrinsic_depth = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.extrinsic_depth = np.asarray(struct.unpack( + 'f' * 16, f.read(16 * 4)), + dtype=np.float32).reshape(4, 4) + self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack( + 'i', f.read(4))[0]] + self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack( + 'i', f.read(4))[0]] + self.color_width = struct.unpack('I', f.read(4))[0] + self.color_height = struct.unpack('I', f.read(4))[0] + self.depth_width = struct.unpack('I', f.read(4))[0] + self.depth_height = struct.unpack('I', f.read(4))[0] + self.depth_shift = struct.unpack('f', f.read(4))[0] + num_frames = struct.unpack('Q', f.read(8))[0] + self.num_frames = num_frames + self.frames = [] + if fast: + index = list(range(num_frames))[::10] + else: + index = list(range(num_frames)) + self.index = index + for i in range(num_frames): + frame = RGBDFrame() + frame.load(f) + if i in index: + self.frames.append(frame) + + def export_depth_images(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + for f in range(len(self.frames)): + depth_data = self.frames[f].decompress_depth( + self.depth_compression_type) + depth = np.fromstring(depth_data, dtype=np.uint16).reshape( + self.depth_height, self.depth_width) + imageio.imwrite( + os.path.join(output_path, + self.index_to_str(self.index[f]) + '.png'), depth) + + def export_color_images(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + for f in range(len(self.frames)): + color = self.frames[f].decompress_color( + self.color_compression_type) + imageio.imwrite( + os.path.join(output_path, + self.index_to_str(self.index[f]) + '.jpg'), color) + + @staticmethod + def index_to_str(index): + return str(index).zfill(5) + + @staticmethod + def save_mat_to_file(matrix, filename): + with open(filename, 'w') as f: + for line in matrix: + np.savetxt(f, line[np.newaxis], fmt='%f') + + def export_poses(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + for f in range(len(self.frames)): + self.save_mat_to_file( + self.frames[f].camera_to_world, + os.path.join(output_path, + self.index_to_str(self.index[f]) + '.txt')) + + def export_intrinsics(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + self.save_mat_to_file(self.intrinsic_color, + os.path.join(output_path, 'intrinsic.txt')) + + def export_depth_intrinsics(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + self.save_mat_to_file(self.intrinsic_depth, + os.path.join(output_path, 'depth_intrinsic.txt')) + + +def process_scene(path, fast, idx): + """Process single ScanNet scene. + + Extract RGB images, poses and camera intrinsics. 
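+
+    Example (illustrative; assumes ``./scans/scene0000_00/scene0000_00.sens``
+        exists and writes the outputs to ``posed_images/scene0000_00``):
+        >>> process_scene('scans', False, 'scene0000_00')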
+ """ + data = SensorData(os.path.join(path, idx, f'{idx}.sens'), fast) + output_path = os.path.join('posed_images', idx) + data.export_color_images(output_path) + data.export_intrinsics(output_path) + data.export_poses(output_path) + data.export_depth_images(output_path) + data.export_depth_intrinsics(output_path) + + +def process_directory(path, fast, nproc): + mmengine.track_parallel_progress(func=partial(process_scene, path, fast), + tasks=os.listdir(path), + nproc=nproc) + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--dataset_folder', + default=None, + help='folder of the dataset.') + parser.add_argument('--nproc', type=int, default=8) + parser.add_argument('--fast', action='store_true') + args = parser.parse_args() + + if args.dataset_folder is not None: + os.chdir(args.dataset_folder) + + # process train and val scenes + if os.path.exists('scans'): + process_directory('scans', args.fast, args.nproc) diff --git a/embodiedscan/embodied_dataset.py b/embodiedscan/embodied_dataset.py new file mode 100644 index 0000000..697a02e --- /dev/null +++ b/embodiedscan/embodied_dataset.py @@ -0,0 +1,322 @@ +import os +import warnings +from typing import Callable, List, Optional, Union + +import mmengine +import numpy as np +from mmdet3d.registry import DATASETS +from mmdet3d.structures import get_box_type +from mmengine.dataset import BaseDataset +from mmengine.fileio import load + + +@DATASETS.register_module() +class EmbodiedScanDataset(BaseDataset): + + def __init__(self, + data_root: str, + ann_file: str, + metainfo: Optional[dict] = None, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + load_eval_anns: bool = True, + filter_empty_gt: bool = True, + remove_dontcare: bool = False, + box_type_3d: str = 'Euler-Depth', + **kwargs) -> None: + + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + self.filter_empty_gt = filter_empty_gt + self.remove_dontcare = remove_dontcare + self.load_eval_anns = load_eval_anns + super().__init__(ann_file=ann_file, + metainfo=metainfo, + data_root=data_root, + pipeline=pipeline, + test_mode=test_mode, + **kwargs) + + def process_metainfo(self): + """This function will be processed after metainfos from ann_file and + config are combined.""" + assert 'categories' in self._metainfo + + if 'classes' not in self._metainfo: + self._metainfo.setdefault( + 'classes', list(self._metainfo['categories'].keys())) + + self.label_mapping = np.full( + max(list(self._metainfo['categories'].values())) + 1, + -1, + dtype=int) + for key, value in self._metainfo['categories'].items(): + if key in self._metainfo['classes']: + self.label_mapping[value] = self._metainfo['classes'].index( + key) + + self.occ_label_mapping = np.full( + max(list(self._metainfo['categories'].values())) + 1, + -1, + dtype=int) + if 'occ_classes' in self._metainfo: + for idx, label_name in enumerate(self._metainfo['occ_classes']): + self.occ_label_mapping[self.metainfo['categories'][ + label_name]] = idx + 1 # 1-based, 0 is empty + + def parse_data_info(self, info: dict) -> dict: + """Process the raw data info. + + The only difference with it in `Det3DDataset` + is the specific process for `axis_align_matrix'. + + Args: + info (dict): Raw info dict. + + Returns: + dict: Has `ann_info` in training stage. And + all path has been converted to absolute path. 
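+            Besides ``ann_info``, the dict gathers per-view ``img_path`` and
+            ``depth_img_path`` lists plus a ``depth2img`` dict holding the
+            per-view extrinsics, the intrinsics and an ``origin`` vector.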
+ """ + info['axis_align_matrix'] = self._get_axis_align_matrix(info) + # Because multi-view settings are different from original designs + # we temporarily follow the ori design in ImVoxelNet + info['img_path'] = [] + info['depth_img_path'] = [] + if 'cam2img' in info: + cam2img = info['cam2img'].astype(np.float32) + else: + cam2img = [] + + extrinsics = [] + for i in range(len(info['images'])): + img_path = os.path.join(self.data_prefix.get('img_path', ''), + info['images'][i]['img_path']) + depth_img_path = os.path.join(self.data_prefix.get('img_path', ''), + info['images'][i]['depth_path']) + + info['img_path'].append(img_path) + info['depth_img_path'].append(depth_img_path) + align_global2cam = np.linalg.inv( + info['axis_align_matrix'] @ info['images'][i]['cam2global']) + extrinsics.append(align_global2cam.astype(np.float32)) + if 'cam2img' not in info: + cam2img.append(info['images'][i]['cam2img'].astype(np.float32)) + + info['depth2img'] = dict(extrinsic=extrinsics, + intrinsic=cam2img, + origin=np.array([.0, .0, + .5]).astype(np.float32)) + + if 'depth_cam2img' not in info: + info['depth_cam2img'] = cam2img + + if not self.test_mode: + info['ann_info'] = self.parse_ann_info(info) + if self.test_mode and self.load_eval_anns: + eval_ann_info = self.parse_ann_info(info) + info['eval_ann_info'] = self._remove_dontcare(eval_ann_info) + + return info + + def parse_ann_info(self, info: dict) -> dict: + """Process the `instances` in data info to `ann_info`. + + Args: + info (dict): Info dict. + + Returns: + dict: Processed `ann_info`. + """ + + ann_info = None + if 'instances' in info and len(info['instances']) > 0: + ann_info = dict( + gt_bboxes_3d=np.zeros((len(info['instances']), 9), + dtype=np.float32), + gt_labels_3d=np.zeros((len(info['instances']), ), + dtype=np.int64), + ) + for idx, instance in enumerate(info['instances']): + ann_info['gt_bboxes_3d'][idx] = instance['bbox_3d'] + ann_info['gt_labels_3d'][idx] = self.label_mapping[ + instance['bbox_label_3d']] + + # pack ann_info for return + if ann_info is None: + ann_info = dict() + ann_info['gt_bboxes_3d'] = np.zeros((0, 9), dtype=np.float32) + ann_info['gt_labels_3d'] = np.zeros((0, ), dtype=np.int64) + + # post-processing/filtering ann_info if not empty gt + if 'visible_instance_ids' in info['images'][0]: + ids = [] + for i in range(len(info['images'])): + ids.append(info['images'][i]['visible_instance_ids']) + mask_length = ann_info['gt_labels_3d'].shape[0] + ann_info['visible_instance_masks'] = self._ids2masks( + ids, mask_length) + + if self.remove_dontcare: + ann_info = self._remove_dontcare(ann_info) + + ann_dataset = info['sample_idx'].split('/')[0] + if ann_dataset == 'scannet': + region = info['sample_idx'].split('/')[1] + occ_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, 'scans', region, + 'occupancy', 'occupancy.npy') + mask_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, 'scans', region, + 'occupancy', 'visible_occupancy.pkl') + elif ann_dataset == '3rscan': + region = info['sample_idx'].split('/')[1] + occ_filename = os.path.join(self.data_prefix.get('img_path', + ''), ann_dataset, + region, 'occupancy', 'occupancy.npy') + mask_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, region, 'occupancy', + 'visible_occupancy.pkl') + elif ann_dataset == 'matterport3d': + building = info['sample_idx'].split('/')[1] + region = info['sample_idx'].split('/')[2] + occ_filename = os.path.join(self.data_prefix.get('img_path', ''), + 
ann_dataset, building, 'occupancy', + f'occupancy_{region}.npy') + mask_filename = os.path.join(self.data_prefix.get('img_path', ''), + ann_dataset, building, 'occupancy', + f'visible_occupancy_{region}.pkl') + else: + raise NotImplementedError + + gt_occ = np.load(occ_filename) + for i in range(gt_occ.shape[0]): + cls_id = self.occ_label_mapping[gt_occ[i][3]] + if cls_id < 0: + cls_id = 255 + gt_occ[i][3] = cls_id + ann_info['gt_occupancy'] = gt_occ + + ann_info['visible_occupancy_masks'] = [] + occ_masks = mmengine.load(mask_filename) + for i in range(len(info['images'])): + ann_info['visible_occupancy_masks'].append( + occ_masks[i]['visible_occupancy']) + + ann_info['gt_bboxes_3d'] = self.box_type_3d( + ann_info['gt_bboxes_3d'], + box_dim=ann_info['gt_bboxes_3d'].shape[-1], + with_yaw=True, + origin=(0.5, 0.5, 0.5)) + + return ann_info + + @staticmethod + def _get_axis_align_matrix(info: dict) -> np.ndarray: + """Get axis_align_matrix from info. If not exist, return identity mat. + + Args: + info (dict): Info of a single sample data. + + Returns: + np.ndarray: 4x4 transformation matrix. + """ + if 'axis_align_matrix' in info: + return np.array(info['axis_align_matrix']) + else: + warnings.warn( + 'axis_align_matrix is not found in ScanNet data info, please ' + 'use new pre-process scripts to re-generate ScanNet data') + return np.eye(4).astype(np.float32) + + def _ids2masks(self, ids, mask_length): + """Change visible_instance_ids to visible_instance_masks.""" + masks = [] + for idx in range(len(ids)): + mask = np.zeros((mask_length, ), dtype=bool) + mask[ids[idx]] = 1 + masks.append(mask) + return masks + + def _remove_dontcare(self, ann_info: dict) -> dict: + """Remove annotations that do not need to be cared. + + -1 indicates dontcare in MMDet3d. + + Args: + ann_info (dict): Dict of annotation infos. The + instance with label `-1` will be removed. + + Returns: + dict: Annotations after filtering. + """ + img_filtered_annotations = {} + filter_mask = ann_info['gt_labels_3d'] > -1 + for key in ann_info.keys(): + if key == 'instances': + img_filtered_annotations[key] = ann_info[key] + elif key == 'visible_instance_masks': + img_filtered_annotations[key] = [] + for idx in range(len(ann_info[key])): + img_filtered_annotations[key].append( + ann_info[key][idx][filter_mask]) + elif key in ['gt_occupancy', 'visible_occupancy_masks']: + pass + else: + img_filtered_annotations[key] = (ann_info[key][filter_mask]) + return img_filtered_annotations + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + If the annotation file does not follow `OpenMMLab 2.0 format dataset + `_ . + The subclass must override this method for load annotations. The meta + information of annotation file will be overwritten :attr:`METAINFO` + and ``metainfo`` argument of constructor. + + Returns: + list[dict]: A list of annotation. + """ # noqa: E501 + # `self.ann_file` denotes the absolute annotation file path if + # `self.root=None` or relative path if `self.root=/path/to/data/`. 
+ annotations = load(self.ann_file) + if not isinstance(annotations, dict): + raise TypeError(f'The annotations loaded from annotation file ' + f'should be a dict, but got {type(annotations)}!') + if 'data_list' not in annotations or 'metainfo' not in annotations: + raise ValueError('Annotation must have data_list and metainfo ' + 'keys') + metainfo = annotations['metainfo'] + raw_data_list = annotations['data_list'] + + # Meta information load from annotation file will not influence the + # existed meta information load from `BaseDataset.METAINFO` and + # `metainfo` arguments defined in constructor. + for k, v in metainfo.items(): + self._metainfo.setdefault(k, v) + + self.process_metainfo() + + # load and parse data_infos. + data_list = [] + for raw_data_info in raw_data_list: + # parse raw data information to target format + data_info = self.parse_data_info(raw_data_info) + if isinstance(data_info, dict): + # For image tasks, `data_info` should information if single + # image, such as dict(img_path='xxx', width=360, ...) + data_list.append(data_info) + elif isinstance(data_info, list): + # For video tasks, `data_info` could contain image + # information of multiple frames, such as + # [dict(video_path='xxx', timestamps=...), + # dict(video_path='xxx', timestamps=...)] + for item in data_info: + if not isinstance(item, dict): + raise TypeError('data_info must be list of dict, but ' + f'got {type(item)}') + data_list.extend(data_info) + else: + raise TypeError('data_info should be a dict or list of dict, ' + f'but got {type(data_info)}') + + return data_list diff --git a/embodiedscan/eval/__init__.py b/embodiedscan/eval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/embodiedscan/eval/det_metric.py b/embodiedscan/eval/det_metric.py new file mode 100644 index 0000000..7774dee --- /dev/null +++ b/embodiedscan/eval/det_metric.py @@ -0,0 +1,237 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +from mmdet3d.evaluation import indoor_eval +from mmdet3d.registry import METRICS +from mmdet3d.structures import get_box_type +from mmdet.evaluation import eval_map +from mmengine.dist import (broadcast_object_list, collect_results, + is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.evaluator.metric import _to_cpu +from mmengine.logging import MMLogger, print_log + + +@METRICS.register_module() +class IndoorDetMetric(BaseMetric): + """Indoor scene evaluation metric. + + Args: + iou_thr (float or List[float]): List of iou threshold when calculate + the metric. Defaults to [0.25, 0.5]. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix will + be used instead. Defaults to None. 
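+
+    Example (illustrative config snippet; the metric is registered in the
+        mmdet3d ``METRICS`` registry by this module):
+        >>> val_evaluator = dict(type='IndoorDetMetric', iou_thr=[0.25, 0.5])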
+ """ + + def __init__(self, + iou_thr: List[float] = [0.25, 0.5], + collect_device: str = 'cpu', + prefix: Optional[str] = None, + batchwise_anns: bool = False, + **kwargs) -> None: + super(IndoorDetMetric, self).__init__(prefix=prefix, + collect_device=collect_device) + self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr + self.batchwise_anns = batchwise_anns + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for data_sample in data_samples: + pred_3d = data_sample['pred_instances_3d'] + eval_ann_info = data_sample['eval_ann_info'] + cpu_pred_3d = dict() + for k, v in pred_3d.items(): + if hasattr(v, 'to'): + cpu_pred_3d[k] = v.to('cpu') + else: + cpu_pred_3d[k] = v + self.results.append((eval_ann_info, cpu_pred_3d)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + ann_infos = [] + pred_results = [] + + for eval_ann, sinlge_pred_results in results: + ann_infos.append(eval_ann) + pred_results.append(sinlge_pred_results) + + # some checkpoints may not record the key "box_type_3d" + box_type_3d, box_mode_3d = get_box_type( + self.dataset_meta.get('box_type_3d', 'depth')) + + ret_dict = indoor_eval(ann_infos, + pred_results, + self.iou_thr, + self.dataset_meta['classes'], + logger=logger, + box_mode_3d=box_mode_3d, + classes_split=self.dataset_meta.get( + 'classes_split', None)) + + return ret_dict + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. When batch + size > 1, the dataloader may pad some data samples to make + sure all ranks have the same length of dataset slice. The + ``collect_results`` function will drop the padded data based on + this size. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + print_log( + f'{self.__class__.__name__} got empty `self.results`. 
Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.', + logger='current', + level=logging.WARNING) + + if self.batchwise_anns: + # the actual dataset length/size is the len(self.results) + if self.collect_device == 'cpu': + results = collect_results(self.results, + len(self.results), + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, len(self.results), + self.collect_device) + else: + if self.collect_device == 'cpu': + results = collect_results(self.results, + size, + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, size, + self.collect_device) + + if is_main_process(): + # cast all tensors in results list to cpu + results = _to_cpu(results) + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] + + +@METRICS.register_module() +class Indoor2DMetric(BaseMetric): + """indoor 2d predictions evaluation metric. + + Args: + iou_thr (float or List[float]): List of iou threshold when calculate + the metric. Defaults to [0.5]. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix will + be used instead. Defaults to None. + """ + + def __init__(self, + iou_thr: Union[float, List[float]] = [0.5], + collect_device: str = 'cpu', + prefix: Optional[str] = None): + super(Indoor2DMetric, self).__init__(prefix=prefix, + collect_device=collect_device) + self.iou_thr = [iou_thr] if isinstance(iou_thr, float) else iou_thr + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for data_sample in data_samples: + pred = data_sample['pred_instances'] + eval_ann_info = data_sample['eval_ann_info'] + ann = dict(labels=eval_ann_info['gt_bboxes_labels'], + bboxes=eval_ann_info['gt_bboxes']) + + pred_bboxes = pred['bboxes'].cpu().numpy() + pred_scores = pred['scores'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + + dets = [] + for label in range(len(self.dataset_meta['classes'])): + index = np.where(pred_labels == label)[0] + pred_bbox_scores = np.hstack( + [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]) + dets.append(pred_bbox_scores) + + self.results.append((ann, dets)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
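+
+            For instance, with ``iou_thr=[0.5]`` the result takes the form
+            ``{'mAP_0.5': 0.62}`` (the value is purely illustrative).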
+ """ + logger: MMLogger = MMLogger.get_current_instance() + annotations, preds = zip(*results) + eval_results = OrderedDict() + for iou_thr_2d_single in self.iou_thr: + mean_ap, _ = eval_map(preds, + annotations, + scale_ranges=None, + iou_thr=iou_thr_2d_single, + dataset=self.dataset_meta['classes'], + logger=logger) + eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap + return eval_results diff --git a/embodiedscan/eval/indoor_eval.py b/embodiedscan/eval/indoor_eval.py new file mode 100644 index 0000000..9b57810 --- /dev/null +++ b/embodiedscan/eval/indoor_eval.py @@ -0,0 +1,377 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmengine.logging import print_log +from terminaltables import AsciiTable + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) + or (num_dets, ). + precisions (np.ndarray): Precisions with shape of + (num_scales, num_dets) or (num_dets, ). + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or np.ndarray: Calculated average precision. + """ + if recalls.ndim == 1: + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + + assert recalls.shape == precisions.shape + assert recalls.ndim == 2 + + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + return ap + + +def eval_det_cls(pred, gt, iou_thr=None): + """Generic functions to compute precision/recall for object detection for a + single class. + + Args: + pred (dict): Predictions mapping from image id to bounding boxes + and scores. + gt (dict): Ground truths mapping from image id to bounding boxes. + iou_thr (list[float]): A list of iou thresholds. + + Return: + tuple (np.ndarray, np.ndarray, float): Recalls, precisions and + average precision. 
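+
+    Example of the expected layout (illustrative; ``box_a``/``box_b`` stand for
+        3D box structures such as ``EulerDepthInstance3DBoxes``):
+        >>> pred = {0: [(box_a, 0.9), (box_b, 0.4)]}  # img_id -> [(box, score)]
+        >>> gt = {0: [box_a]}                         # img_id -> [box, ...]
+        >>> recall, precision, ap = eval_det_cls(pred, gt, iou_thr=[0.25])[0]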
+ """ + + # {img_id: {'bbox': box structure, 'det': matched list}} + class_recs = {} + npos = 0 + # figure out the bbox code size first + gt_bbox_code_size = 9 + pred_bbox_code_size = 9 + for img_id in gt.keys(): + if len(gt[img_id]) != 0: + gt_bbox_code_size = gt[img_id][0].tensor.shape[1] + break + for img_id in pred.keys(): + if len(pred[img_id][0]) != 0: + pred_bbox_code_size = pred[img_id][0][0].tensor.shape[1] + break + assert gt_bbox_code_size == pred_bbox_code_size + for img_id in gt.keys(): + cur_gt_num = len(gt[img_id]) + if cur_gt_num != 0: + gt_cur = torch.zeros([cur_gt_num, gt_bbox_code_size], + dtype=torch.float32) + for i in range(cur_gt_num): + gt_cur[i] = gt[img_id][i].tensor + bbox = gt[img_id][0].new_box(gt_cur) + else: + bbox = gt[img_id] + det = [[False] * len(bbox) for i in iou_thr] + npos += len(bbox) + class_recs[img_id] = {'bbox': bbox, 'det': det} + + # construct dets + image_ids = [] + confidence = [] + ious = [] + for img_id in pred.keys(): + cur_num = len(pred[img_id]) + if cur_num == 0: + continue + pred_cur = torch.zeros((cur_num, pred_bbox_code_size), + dtype=torch.float32) + box_idx = 0 + for box, score in pred[img_id]: + image_ids.append(img_id) + confidence.append(score) + # handle outlier (too thin) predicted boxes + w, l, h = box.tensor[0, 3:6] + faces = [w * l, w * h, h * l] + if torch.any(box.tensor.new_tensor(faces) < 2e-4): + print('Find small predicted boxes,', + 'and clamp short edges to 2e-2 meters.') + box.tensor[:, 3:6] = torch.clamp(box.tensor[:, 3:6], min=2e-2) + pred_cur[box_idx] = box.tensor + box_idx += 1 + pred_cur = box.new_box(pred_cur) + gt_cur = class_recs[img_id]['bbox'] + if len(gt_cur) > 0: + # calculate iou in each image + iou_cur = pred_cur.overlaps(pred_cur, gt_cur) + for i in range(cur_num): + ious.append(iou_cur[i]) + else: + for i in range(cur_num): + ious.append(np.zeros(1)) + + confidence = np.array(confidence) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + image_ids = [image_ids[x] for x in sorted_ind] + ious = [ious[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + num_images = len(image_ids) + tp_thr = [np.zeros(num_images) for i in iou_thr] + fp_thr = [np.zeros(num_images) for i in iou_thr] + for d in range(num_images): + R = class_recs[image_ids[d]] + iou_max = -np.inf + BBGT = R['bbox'] + cur_iou = ious[d] + + if len(BBGT) > 0: + # compute overlaps + for j in range(len(BBGT)): + # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...])) + iou = cur_iou[j] + if iou > iou_max: + iou_max = iou + jmax = j + + for iou_idx, thresh in enumerate(iou_thr): + if iou_max > thresh: + if not R['det'][iou_idx][jmax]: + tp_thr[iou_idx][d] = 1. + R['det'][iou_idx][jmax] = 1 + else: + fp_thr[iou_idx][d] = 1. + else: + fp_thr[iou_idx][d] = 1. + + ret = [] + for iou_idx, thresh in enumerate(iou_thr): + # compute precision recall + fp = np.cumsum(fp_thr[iou_idx]) + tp = np.cumsum(tp_thr[iou_idx]) + recall = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = average_precision(recall, precision) + ret.append((recall, precision, ap)) + + return ret + + +def eval_map_recall(pred, gt, ovthresh=None): + """Evaluate mAP and recall. + + Generic functions to compute precision/recall for object detection + for multiple classes. + + Args: + pred (dict): Information of detection results, + which maps class_id and predictions. 
+ gt (dict): Information of ground truths, which maps class_id and + ground truths. + ovthresh (list[float], optional): iou threshold. Default: None. + + Return: + tuple[dict]: dict results of recall, AP, and precision for all classes. + """ + + ret_values = {} + for classname in gt.keys(): + if classname in pred: + ret_values[classname] = eval_det_cls(pred[classname], + gt[classname], ovthresh) + recall = [{} for i in ovthresh] + precision = [{} for i in ovthresh] + ap = [{} for i in ovthresh] + + for label in gt.keys(): + for iou_idx, thresh in enumerate(ovthresh): + if label in pred: + recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][ + label] = ret_values[label][iou_idx] + else: + recall[iou_idx][label] = np.zeros(1) + precision[iou_idx][label] = np.zeros(1) + ap[iou_idx][label] = np.zeros(1) + + return recall, precision, ap + + +def indoor_eval(gt_annos, + dt_annos, + metric, + label2cat, + logger=None, + box_mode_3d=None, + classes_split=None): + """Indoor Evaluation. + + Evaluate the result of the detection. + + Args: + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection annotations. the dict + includes the following keys + + - labels_3d (torch.Tensor): Labels of boxes. + - bboxes_3d (:obj:`BaseInstance3DBoxes`): + 3D bounding boxes in Depth coordinate. + - scores_3d (torch.Tensor): Scores of boxes. + metric (list[float]): IoU thresholds for computing average precisions. + label2cat (tuple): Map from label to category. + logger (logging.Logger | str, optional): The way to print the mAP + summary. See `mmdet.utils.print_log()` for details. Default: None. + + Return: + dict[str, float]: Dict of results. + """ + assert len(dt_annos) == len(gt_annos) + pred = {} # map {class_id: pred} + gt = {} # map {class_id: gt} + for img_id in range(len(dt_annos)): + # parse detected annotations + det_anno = dt_annos[img_id] + for i in range(len(det_anno['labels_3d'])): + label = det_anno['labels_3d'].numpy()[i] + bbox = det_anno['bboxes_3d'].convert_to(box_mode_3d)[i] + score = det_anno['scores_3d'].numpy()[i] + if label not in pred: + pred[int(label)] = {} + if img_id not in pred[label]: + pred[int(label)][img_id] = [] + if label not in gt: + gt[int(label)] = {} + if img_id not in gt[label]: + gt[int(label)][img_id] = [] + pred[int(label)][img_id].append((bbox, score)) + + # parse gt annotations + gt_anno = gt_annos[img_id] + + gt_boxes = gt_anno['gt_bboxes_3d'] + labels_3d = gt_anno['gt_labels_3d'] + + for i in range(len(labels_3d)): + label = labels_3d[i] + bbox = gt_boxes[i] + if label not in gt: + gt[label] = {} + if img_id not in gt[label]: + gt[label][img_id] = [] + gt[label][img_id].append(bbox) + + rec, prec, ap = eval_map_recall(pred, gt, metric) + + # filter nan results + ori_keys = list(ap[0].keys()) + for key in ori_keys: + if np.isnan(ap[0][key][0]): + for r in rec: + del r[key] + for p in prec: + del p[key] + for a in ap: + del a[key] + + ret_dict = dict() + header = ['classes'] + table_columns = [[label2cat[label] + for label in ap[0].keys()] + ['Overall']] + + for i, iou_thresh in enumerate(metric): + header.append(f'AP_{iou_thresh:.2f}') + header.append(f'AR_{iou_thresh:.2f}') + rec_list = [] + for label in ap[i].keys(): + ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float( + ap[i][label][0]) + ret_dict[f'mAP_{iou_thresh:.2f}'] = float(np.mean(list( + ap[i].values()))) + + table_columns.append(list(map(float, list(ap[i].values())))) + table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' 
for x in table_columns[-1]] + + for label in rec[i].keys(): + ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float( + rec[i][label][-1]) + rec_list.append(rec[i][label][-1]) + ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list)) + + table_columns.append(list(map(float, rec_list))) + table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + if classes_split is not None: + splits = ['head', 'common', 'tail'] + for idx in range(len(splits)): + header = [f'{splits[idx]}_classes'] + # init the category list/column + cat_list = [] + for label in classes_split[idx]: + if label in ap[0]: + cat_list.append(label2cat[label]) + table_columns = [cat_list + ['Overall']] + + for i, iou_thresh in enumerate(metric): + header.append(f'AP_{iou_thresh:.2f}') + header.append(f'AR_{iou_thresh:.2f}') + ap_list = [] + for label in classes_split[idx]: + if label in ap[i]: + ap_list.append(float(ap[i][label][0])) + mean_ap = float(np.mean(ap_list)) + + table_columns.append(list(map(float, ap_list))) + table_columns[-1] += [mean_ap] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + rec_list = [] + for label in classes_split[idx]: + if label in rec[i]: + rec_list.append(rec[i][label][-1]) + mean_rec = float(np.mean(rec_list)) + + table_columns.append(list(map(float, rec_list))) + table_columns[-1] += [mean_rec] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + return ret_dict diff --git a/embodiedscan/eval/occupancy_metric.py b/embodiedscan/eval/occupancy_metric.py new file mode 100644 index 0000000..3366479 --- /dev/null +++ b/embodiedscan/eval/occupancy_metric.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from typing import Dict, Optional, Sequence + +import numpy as np +import torch +from mmdet3d.registry import METRICS +from mmengine.dist import (broadcast_object_list, collect_results, + is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.evaluator.metric import _to_cpu +from mmengine.logging import MMLogger, print_log +from terminaltables import AsciiTable + + +@METRICS.register_module() +class OccupancyMetric(BaseMetric): + """Indoor scene evaluation metric. + + Args: + iou_thr (list[float]): List of iou threshold when calculate the + metric. Defaults to [0.25, 0.5]. + collect_device (str, optional): Device name used for collecting + results from different ranks during distributed training. + Must be 'cpu' or 'gpu'. Defaults to 'cpu'. + prefix (str): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. 
Default: None + """ + + def __init__(self, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + batchwise_anns: bool = False, + **kwargs): + super(OccupancyMetric, self).__init__(prefix=prefix, + collect_device=collect_device) + self.batchwise_anns = batchwise_anns + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, + which will be used to compute the metrics when all batches + have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from + the model. + """ + for data_sample in data_samples: + pred_occ = data_sample['pred_occupancy'] + gt_4 = data_sample['gt_occupancy'] + gt_occ = torch.zeros_like(pred_occ) + gt_occ[gt_4[:, 0], gt_4[:, 1], gt_4[:, 2]] = gt_4[:, 3] + if 'gt_occupancy_masks' in data_sample: + gt_occ_mask = data_sample['gt_occupancy_masks'] + gt_occ[~gt_occ_mask] = 255 + self.results.append((gt_occ, pred_occ)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + num_class = len(self.dataset_meta['classes']) + 1 + score = np.zeros((num_class, 3)) + + for gt_occ, sinlge_pred_results in results: + mask = (gt_occ != 255) + for j in range(num_class): + if j == 0: # class 0 (empty) for geometry IoU + score[j][0] += ((gt_occ[mask] != 0) * + (sinlge_pred_results[mask] != 0)).sum() + score[j][1] += (gt_occ[mask] != 0).sum() + score[j][2] += (sinlge_pred_results[mask] != 0).sum() + else: + score[j][0] += ((gt_occ[mask] == j) * + (sinlge_pred_results[mask] == j)).sum() + score[j][1] += (gt_occ[mask] == j).sum() + score[j][2] += (sinlge_pred_results[mask] == j).sum() + + ret_dict = dict() + table_data = [['classes', 'IoU']] + res = [] + for i in range(num_class): + name = 'empty' + if i > 0: + name = self.dataset_meta['classes'][i - 1] + + tp = score[i, 0] + p = score[i, 1] + g = score[i, 2] + union = p + g - tp + # do not save the accuracy result if nan + if np.isnan(tp / union): + continue + ret_dict[name] = tp / union + res.append(tp / union) + table_data.append([name, f'{ret_dict[name]:.5f}']) + table_data.append(['mean', f'{sum(res)/len(res):.5f}']) + + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + return ret_dict + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. When batch + size > 1, the dataloader may pad some data samples to make + sure all ranks have the same length of dataset slice. The + ``collect_results`` function will drop the padded data based on + this size. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + print_log( + f'{self.__class__.__name__} got empty `self.results`. 
Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.', + logger='current', + level=logging.WARNING) + + if self.batchwise_anns: + # the actual dataset length/size is the len(self.results) + if self.collect_device == 'cpu': + results = collect_results(self.results, + len(self.results), + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, len(self.results), + self.collect_device) + else: + if self.collect_device == 'cpu': + results = collect_results(self.results, + size, + self.collect_device, + tmpdir=self.collect_dir) + else: + results = collect_results(self.results, size, + self.collect_device) + + if is_main_process(): + # cast all tensors in results list to cpu + results = _to_cpu(results) + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] diff --git a/embodiedscan/explorer.py b/embodiedscan/explorer.py new file mode 100644 index 0000000..7b6e462 --- /dev/null +++ b/embodiedscan/explorer.py @@ -0,0 +1,251 @@ +import os +from typing import List, Union + +import mmengine +import numpy as np +import open3d as o3d +from utils.color_selector import ColorMap +from utils.img_drawer import ImageDrawer + +DATASETS = ['scannet', '3rscan', 'matterport3d'] + + +class EmbodiedScanExplorer: + + def __init__( + self, + dataroot: Union[dict, List], + ann_file: Union[dict, List, str], + verbose: bool = False, + color_setting: str = None, + ): + + if isinstance(ann_file, dict): + ann_file = list(ann_file.values()) + elif isinstance(ann_file, str): + ann_file = [ann_file] + self.ann_files = ann_file + + if isinstance(dataroot, str): + dataroot = [dataroot] + if isinstance(dataroot, list): + self.dataroot = dict() + for dataset in DATASETS: + self.dataroot[dataset] = None + for root in dataroot: + for dataset in DATASETS: + if dataset.lower() in root.lower(): + self.dataroot[dataset] = root + break + if isinstance(dataroot, dict): + self.dataroot = dataroot + + self.verbose = verbose + + if self.verbose: + print('Dataset root') + for dataset in DATASETS: + print(dataset, ':', self.dataroot[dataset]) + + if self.verbose: + print('Loading') + self.metainfo = None + data_list = [] + for file in self.ann_files: + data = mmengine.load(file) + if self.metainfo is None: + self.metainfo = data['metainfo'] + else: + assert self.metainfo == data['metainfo'] + data_list += data['data_list'] + + self.classes = list(self.metainfo['categories'].keys()) + self.color_selector = ColorMap(classes=self.classes, + init_file=color_setting) + self.data = [] + for data in data_list: + splits = data['sample_idx'].split('/') + data['dataset'] = splits[0] + if self.dataroot[splits[0]] is not None: + self.data.append(data) + + if self.verbose: + print('Loading complete') + + def count_scenes(self): + return len(self.data) + + def list_scenes(self): + res = [] + for scene in self.data: + res.append(scene['sample_idx']) + return res + + def scene_info(self, scene_name): + for scene in self.data: + if scene['sample_idx'] == scene_name: + if self.verbose: + print('Info of', scene_name) + print(len(scene['images']), 'images') + print(len(scene['instances']), 'boxes') + return dict(num_images=len(scene['images']), + 
num_boxes=len(scene['instances'])) + + if self.verbose: + print('No such scene') + return None + + def render_scene(self, scene_name, render_box=False): + s = scene_name.split('/') + if len(s) == 2: + dataset, region = s + else: + dataset, building, region = s + select = None + for scene in self.data: + if scene['sample_idx'] == scene_name: + select = scene + break + axis_align_matrix = select['axis_align_matrix'] + if dataset == 'scannet': + filepath = os.path.join(self.dataroot['scannet'], 'scans', region, + f'{region}_vh_clean.ply') + elif dataset == '3rscan': + filepath = os.path.join(self.dataroot['3rscan'], region, + 'mesh.refined.v2.obj') + elif dataset == 'matterport3d': + filepath = os.path.join(self.dataroot['matterport3d'], building, + 'region_segmentations', f'{region}.ply') + else: + raise NotImplementedError + + mesh = o3d.io.read_triangle_mesh(filepath, True) + mesh.transform(axis_align_matrix) + frame = o3d.geometry.TriangleMesh.create_coordinate_frame() + boxes = [] + if render_box: + for instance in select['instances']: + box = self._9dof_to_box(instance['bbox_3d'], + instance['bbox_label_3d']) + boxes.append(box) + o3d.visualization.draw_geometries([mesh, frame] + boxes) + + def render_occupancy(self, scene_name): + s = scene_name.split('/') + if len(s) == 2: + dataset, region = s + else: + dataset, building, region = s + + if dataset == 'scannet': + filepath = os.path.join(self.dataroot['scannet'], 'scans', region, + 'occupancy', 'occupancy.npy') + elif dataset == '3rscan': + filepath = os.path.join(self.dataroot['3rscan'], region, + 'occupancy', 'occupancy.npy') + elif dataset == 'matterport3d': + filepath = os.path.join(self.dataroot['matterport3d'], building, + 'occupancy', f'occupancy_{region}.npy') + else: + raise NotImplementedError + + gt_occ = np.load(filepath) + point_cloud_range = [-3.2, -3.2, -1.28 + 0.5, 3.2, 3.2, 1.28 + 0.5] + # occ_size = [40, 40, 16] + grid_size = [0.16, 0.16, 0.16] + points = np.zeros((gt_occ.shape[0], 6), dtype=float) + for i in range(gt_occ.shape[0]): + x, y, z, label_id = gt_occ[i] + label_id = int(label_id) + label = 'object' + if label_id == 0: + label = 'object' + else: + label = self.classes[label_id - 1] + color = self.color_selector.get_color(label) + color = [x / 255.0 for x in color] + points[i][:3] = [ + x * grid_size[0] + point_cloud_range[0] + grid_size[0] / 2, + y * grid_size[1] + point_cloud_range[1] + grid_size[1] / 2, + z * grid_size[2] + point_cloud_range[2] + grid_size[2] / 2 + ] + points[i][3:] = color + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + pcd.colors = o3d.utility.Vector3dVector(points[:, 3:]) + voxel_grid = o3d.geometry.VoxelGrid.create_from_point_cloud( + pcd, voxel_size=grid_size[0]) + frame = o3d.geometry.TriangleMesh.create_coordinate_frame() + o3d.visualization.draw_geometries([frame, voxel_grid]) + + def render_image(self, scene_name, camera_name): + dataset = scene_name.split('/')[0] + select = None + for scene in self.data: + if scene['sample_idx'] == scene_name: + select = scene + for camera in select['images']: + img_path = camera['img_path'] + img_path = os.path.join(self.dataroot[dataset], + img_path[img_path.find('/') + 1:]) + if dataset == 'scannet': + cam_name = img_path.split('/')[-1][:-4] + elif dataset == '3rscan': + cam_name = img_path.split('/')[-1][:-10] + elif dataset == 'matterport3d': + cam_name = img_path.split('/')[-1][:-8] + img_path.split( + '/')[-1][-7:-4] + if cam_name == camera_name: + axis_align_matrix = 
select['axis_align_matrix'] + extrinsic = axis_align_matrix @ camera['cam2global'] + if 'cam2img' in camera: + intrinsic = camera['cam2img'] + else: + intrinsic = select['cam2img'] + img_drawer = ImageDrawer(img_path, verbose=self.verbose) + for i in camera['visible_instance_ids']: + instance = select['instances'][i] + box = self._9dof_to_box(instance['bbox_3d'], + instance['bbox_label_3d']) + label = self.classes[instance['bbox_label_3d'] - 1] + color = self.color_selector.get_color(label) + img_drawer.draw_box3d(box, + color, + label, + extrinsic=extrinsic, + intrinsic=intrinsic) + + img_drawer.show() + return + + print('No such camera') + return + + def _9dof_to_box(self, box, label_id): + if isinstance(box, list): + box = np.array(box) + center = box[:3].reshape(3, 1) + scale = box[3:6].reshape(3, 1) + rot = box[6:].reshape(3, 1) + rot_mat = \ + o3d.geometry.OrientedBoundingBox.get_rotation_matrix_from_zxy(rot) + geo = o3d.geometry.OrientedBoundingBox(center, rot_mat, scale) + + label = self.classes[label_id - 1] + color = self.color_selector.get_color(label) + color = [x / 255.0 for x in color] + geo.color = color + return geo + + +if __name__ == '__main__': + a = EmbodiedScanExplorer( + dataroot=['data/scannet', 'data/3rscan/', 'data/matterport3d/'], + ann_file=[ + 'data/full_10_visible/embodiedscan_infos_train_full.pkl', + 'data/full_10_visible/embodiedscan_infos_val_full.pkl' + ], + verbose=True) + print(a.list_scenes()) + print(a.count_scenes()) + a.render_image('scannet/scene0000_00', '00000') diff --git a/embodiedscan/refine_pickle.py b/embodiedscan/refine_pickle.py new file mode 100644 index 0000000..d77bf87 --- /dev/null +++ b/embodiedscan/refine_pickle.py @@ -0,0 +1,116 @@ +import json +import os +import pickle + +from tqdm import tqdm + + +def path_split(path): + s = path.split('/') + return s[0], s[2], s[3] + + +with open( + '/mnt/petrelfs/share_data/maoxiaohan/3rscan/meta_data/' + + '3rscan_mapping.json', 'r') as f: + map_3rscan = json.load(f) +back_3rscan = {v: k for k, v in map_3rscan.items()} + +with open( + '/mnt/petrelfs/share_data/maoxiaohan/matterport3d/meta_data/' + + 'scene_mapping.json', 'r') as f: + map_mp3d = json.load(f) +back_mp3d = {v: k for k, v in map_mp3d.items()} +buildings = os.listdir( + '/mnt/petrelfs/share_data/maoxiaohan/matterport3d/rename') +assert len(buildings) == len(list(back_mp3d.keys())) +max_cam = 0 +back_mp3d_cam = dict() +for building in buildings: + assert building[-5:] == '.json' + building_name = building[:-5] + with open( + os.path.join( + '/mnt/petrelfs/share_data/maoxiaohan/matterport3d/rename', + building), 'r') as f: + tmp = json.load(f) + max_cam = max(max_cam, len(list(tmp.keys()))) + back_mp3d_cam[building_name] = {v: k for k, v in tmp.items()} + +print(max_cam) + + +def mp3d_split(region, camera): + global back_mp3d + global back_mp3d_cam + x = region.find('_region') + building = region[:x] + raw_building = back_mp3d[building] + raw_region = region[x + 1:] + assert camera[-4] == '_' + raw_camera = back_mp3d_cam[raw_building][camera[:-4]] + cam_pos = camera[-3:] + return raw_building, raw_region, raw_camera, cam_pos + + +def generate(in_dir, out_dir, filename): + with open(os.path.join(in_dir, filename), 'rb') as f: + data = pickle.load(f) + + for scene in tqdm(data['data_list']): + bo = False + for img in scene['images']: + path = img['img_path'] + dataset, region, camera = path_split(path) + assert camera[-4:] == '.jpg' + camera = camera[:-4] + + if dataset == 'scannet': + img_path = path + depth_path = 
f'{dataset}/posed_images/{region}/{camera}.png' + img['depth_path'] = depth_path + if not bo: + scene['depth_cam2img'] = scene['depth2img'] + scene.pop('depth2img', None) + scene['sample_idx'] = f'scannet/{region}' + bo = True + elif dataset == '3rscan': + raw_region = back_3rscan[region] + img_path = f'{dataset}/{raw_region}/sequence/' + \ + 'frame-{camera}.color.jpg' + depth_path = f'{dataset}/{raw_region}/sequence/' + \ + 'frame-{camera}.depth.pgm' + img['img_path'] = img_path + img['depth_path'] = depth_path + if not bo: + scene['depth_cam2img'] = scene['cam2depth'] + scene.pop('cam2depth', None) + scene['sample_idx'] = f'3rscan/{raw_region}' + bo = True + elif dataset == 'matterport3d': + raw_building, raw_region, raw_camera, cam_pos = mp3d_split( + region, camera) + img_path = f'{dataset}/{raw_building}/' + \ + 'matterport_color_images/{raw_camera}_i{cam_pos}.jpg' + depth_path = f'{dataset}/{raw_building}/' + \ + 'matterport_depth_images/{raw_camera}_d{cam_pos}.png' + img['img_path'] = img_path + img['depth_path'] = depth_path + img.pop('cam2depth', None) + if not bo: + scene['sample_idx'] = \ + f'matterport3d/{raw_building}/{raw_region}' + bo = True + else: + raise NotImplementedError + + with open(os.path.join(out_dir, filename), 'wb') as f: + pickle.dump(data, f) + + +generate(in_dir='/mnt/petrelfs/share_data/wangtai/data/full_10_visible', + out_dir='./data', + filename='embodiedscan_infos_train_full.pkl') +generate(in_dir='/mnt/petrelfs/share_data/wangtai/data/full_10_visible', + out_dir='./data', + filename='embodiedscan_infos_val_full.pkl') diff --git a/embodiedscan/structures/__init__.py b/embodiedscan/structures/__init__.py new file mode 100644 index 0000000..103d611 --- /dev/null +++ b/embodiedscan/structures/__init__.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bbox_3d import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes, + Coord3DMode, DepthInstance3DBoxes, + EulerCameraInstance3DBoxes, EulerDepthInstance3DBoxes, + LiDARInstance3DBoxes, get_box_type, + get_proj_mat_by_coord_type, limit_period, + mono_cam_box2vis, points_cam2img, points_img2cam, + rotation_3d_in_axis, rotation_3d_in_euler, xywhr2xyxyr) +from .det3d_data_sample import Det3DDataSample +# yapf: disable +from .ops import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, + BboxOverlapsNearest3D, axis_aligned_bbox_overlaps_3d, + bbox3d2result, bbox3d2roi, bbox3d_mapping_back, + bbox_overlaps_3d, bbox_overlaps_nearest_3d, + box2d_to_corner_jit, box3d_to_bbox, box_camera_to_lidar, + boxes3d_to_corners3d_lidar, camera_to_lidar, + center_to_corner_box2d, center_to_corner_box3d, + center_to_minmax_2d, corner_to_standup_nd_jit, + corner_to_surfaces_3d, corner_to_surfaces_3d_jit, corners_nd, + create_anchors_3d_range, depth_to_lidar_points, + depth_to_points, get_frustum, iou_jit, minmax_to_corner_2d, + points_in_convex_polygon_3d_jit, + points_in_convex_polygon_jit, points_in_rbbox, + projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox, + remove_outside_points, rotation_points_single_angle, + surface_equ_3d) +# yapf: enable +from .point_data import PointData +from .points import BasePoints, CameraPoints, DepthPoints, LiDARPoints + +__all__ = [ + 'BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints', + 'Det3DDataSample', 'PointData', 'Box3DMode', 'BaseInstance3DBoxes', + 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', + 'EulerCameraInstance3DBoxes', 'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', + 'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img', + 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis', + 'get_proj_mat_by_coord_type', 'box2d_to_corner_jit', 'box3d_to_bbox', + 'box_camera_to_lidar', 'boxes3d_to_corners3d_lidar', 'camera_to_lidar', + 'center_to_corner_box2d', 'center_to_corner_box3d', 'center_to_minmax_2d', + 'corner_to_standup_nd_jit', 'corner_to_surfaces_3d', + 'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range', + 'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit', + 'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit', + 'points_in_convex_polygon_jit', 'points_in_rbbox', + 'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox', + 'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d', + 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', + 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', + 'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi', + 'bbox3d2result', 'rotation_3d_in_euler' +] diff --git a/embodiedscan/structures/bbox_3d/__init__.py b/embodiedscan/structures/bbox_3d/__init__.py new file mode 100644 index 0000000..a1515f8 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode +from .cam_box3d import CameraInstance3DBoxes +from .coord_3d_mode import Coord3DMode +from .depth_box3d import DepthInstance3DBoxes +from .euler_cam_box3d import EulerCameraInstance3DBoxes +from .euler_depth_box3d import EulerDepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import (batch_points_cam2img, get_box_type, + get_proj_mat_by_coord_type, limit_period, mono_cam_box2vis, + points_cam2img, points_img2cam, rotation_3d_in_axis, + rotation_3d_in_euler, xywhr2xyxyr) + +__all__ = [ + 'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes', + 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', + 'EulerCameraInstance3DBoxes', 'EulerDepthInstance3DBoxes', 'xywhr2xyxyr', + 'get_box_type', 'rotation_3d_in_axis', 'rotation_3d_in_euler', + 'limit_period', 'points_cam2img', 'points_img2cam', 'Coord3DMode', + 'mono_cam_box2vis', 'batch_points_cam2img', 'get_proj_mat_by_coord_type' +] diff --git a/embodiedscan/structures/bbox_3d/base_box3d.py b/embodiedscan/structures/bbox_3d/base_box3d.py new file mode 100644 index 0000000..85c83ed --- /dev/null +++ b/embodiedscan/structures/bbox_3d/base_box3d.py @@ -0,0 +1,698 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import abstractmethod +from typing import Iterator, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmcv.ops import box_iou_rotated, points_in_boxes_all, points_in_boxes_part +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .utils import limit_period + + +class BaseInstance3DBoxes: + """Base class for 3D Boxes. + + Note: + The box is bottom centered, i.e. the relative position of origin in the + box is (0.5, 0.5, 0). + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes + data with shape (N, box_dim). + box_dim (int): Number of the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7. + with_yaw (bool): Whether the box is with yaw rotation. If False, the + value of yaw will be set to 0 as minmax boxes. Defaults to True. + origin (Tuple[float]): Relative position of the box origin. + Defaults to (0.5, 0.5, 0). This will guide the box be converted to + (0.5, 0.5, 0) mode. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. 
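+
+    Example:
+        >>> import torch
+        >>> # a minimal sketch with one (x, y, z, x_size, y_size, z_size, yaw)
+        >>> # box; concrete subclasses (e.g. DepthInstance3DBoxes) are
+        >>> # normally used instead of the base class
+        >>> boxes = BaseInstance3DBoxes(
+        ...     torch.tensor([[0., 0., 0., 1., 2., 1., 0.]]))
+        >>> boxes.volume
+        tensor([2.])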
+ """ + + YAW_AXIS: int = 0 + + def __init__( + self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + box_dim: int = 7, + with_yaw: bool = True, + origin: Tuple[float, float, float] = (0.5, 0.5, 0) + ) -> None: + if isinstance(tensor, Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does + # not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((-1, box_dim)) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \ + ('The box dimension must be 2 and the length of the last ' + f'dimension must be {box_dim}, but got boxes with shape ' + f'{tensor.shape}.') + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding 0 as + # a fake yaw and set with_yaw to False + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 0.5, 0): + dst = self.tensor.new_tensor((0.5, 0.5, 0)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def shape(self) -> torch.Size: + """torch.Size: Shape of boxes.""" + return self.tensor.shape + + @property + def volume(self) -> Tensor: + """Tensor: A vector with volume of each box in shape (N, ).""" + return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] + + @property + def dims(self) -> Tensor: + """Tensor: Size dimensions of each box in shape (N, 3).""" + return self.tensor[:, 3:6] + + @property + def yaw(self) -> Tensor: + """Tensor: A vector with yaw of each box in shape (N, ).""" + return self.tensor[:, 6] + + @property + def height(self) -> Tensor: + """Tensor: A vector with height of each box in shape (N, ).""" + return self.tensor[:, 5] + + @property + def top_height(self) -> Tensor: + """Tensor: A vector with top height of each box in shape (N, ).""" + return self.bottom_height + self.height + + @property + def bottom_height(self) -> Tensor: + """Tensor: A vector with bottom height of each box in shape (N, ).""" + return self.tensor[:, 2] + + @property + def center(self) -> Tensor: + """Calculate the center of all the boxes. + + Note: + In MMDetection3D's convention, the bottom center is usually taken + as the default center. + + The relative position of the centers in different kinds of boxes + are different, e.g., the relative center of a boxes is + (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is + recommended to use ``bottom_center`` or ``gravity_center`` for + clearer usage. + + Returns: + Tensor: A tensor with center of each box in shape (N, 3). 
+ """ + return self.bottom_center + + @property + def bottom_center(self) -> Tensor: + """Tensor: A tensor with center of each box in shape (N, 3).""" + return self.tensor[:, :3] + + @property + def gravity_center(self) -> Tensor: + """Tensor: A tensor with center of each box in shape (N, 3).""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self) -> Tensor: + """Tensor: A tensor with 8 corners of each box in shape (N, 8, 3).""" + pass + + @property + def bev(self) -> Tensor: + """Tensor: 2D BEV box of each box with rotation in XYWHR format, in + shape (N, 5).""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self) -> Tensor: + """Tensor: A tensor of 2D BEV box of each box without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def in_range_bev( + self, box_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the boxes are in the given range. + + Args: + box_range (Tensor or np.ndarray or Sequence[float]): The range of + box in order of (x_min, y_min, x_max, y_max). + + Note: + The original implementation of SECOND checks whether boxes in a + range by checking whether the points are in a convex polygon, we + reduce the burden for simpler cases. + + Returns: + Tensor: A binary vector indicating whether each box is inside the + reference range. + """ + in_range_flags = ((self.bev[:, 0] > box_range[0]) + & (self.bev[:, 1] > box_range[1]) + & (self.bev[:, 0] < box_range[2]) + & (self.bev[:, 1] < box_range[3])) + return in_range_flags + + @abstractmethod + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. + """ + pass + + @abstractmethod + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. 
+ + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. + """ + pass + + def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None: + """Translate boxes with the given translation vector. + + Args: + trans_vector (Tensor or np.ndarray): Translation vector of size + 1x3. + """ + if not isinstance(trans_vector, Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + self.tensor[:, :3] += trans_vector + + def in_range_3d( + self, box_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the boxes are in the given range. + + Args: + box_range (Tensor or np.ndarray or Sequence[float]): The range of + box (x_min, y_min, z_min, x_max, y_max, z_max). + + Note: + In the original implementation of SECOND, checking whether a box in + the range checks whether the points are in a convex polygon, we try + to reduce the burden for simpler cases. + + Returns: + Tensor: A binary vector indicating whether each point is inside the + reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 1] > box_range[1]) + & (self.tensor[:, 2] > box_range[2]) + & (self.tensor[:, 0] < box_range[3]) + & (self.tensor[:, 1] < box_range[4]) + & (self.tensor[:, 2] < box_range[5])) + return in_range_flags + + @abstractmethod + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + pass + + def scale(self, scale_factor: float) -> None: + """Scale the box with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the boxes. + """ + self.tensor[:, :6] *= scale_factor + self.tensor[:, 7:] *= scale_factor # velocity + + def limit_yaw(self, offset: float = 0.5, period: float = np.pi) -> None: + """Limit the yaw to a given period and offset. + + Args: + offset (float): The offset of the yaw. Defaults to 0.5. + period (float): The expected period. Defaults to np.pi. + """ + self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) + + def nonempty(self, threshold: float = 0.0) -> Tensor: + """Find boxes that are non-empty. + + A box is considered empty if either of its side is no larger than + threshold. + + Args: + threshold (float): The threshold of minimal sizes. Defaults to 0.0. + + Returns: + Tensor: A binary vector which represents whether each box is empty + (False) or non-empty (True). + """ + box = self.tensor + size_x = box[..., 3] + size_y = box[..., 4] + size_z = box[..., 5] + keep = ((size_x > threshold) + & (size_y > threshold) & (size_z > threshold)) + return keep + + def __getitem__( + self, item: Union[int, slice, np.ndarray, + Tensor]) -> 'BaseInstance3DBoxes': + """ + Args: + item (int or slice or np.ndarray or Tensor): Index of boxes. + + Note: + The following usage are allowed: + + 1. 
`new_boxes = boxes[3]`: Return a `Boxes` that contains only one + box. + 2. `new_boxes = boxes[2:10]`: Return a slice of boxes. + 3. `new_boxes = boxes[vector]`: Where vector is a + torch.BoolTensor with `length = len(boxes)`. Nonzero elements in + the vector will be selected. + + Note that the returned Boxes might share storage with this Boxes, + subject to PyTorch's indexing semantics. + + Returns: + :obj:`BaseInstance3DBoxes`: A new object of + :class:`BaseInstance3DBoxes` after indexing. + """ + original_type = type(self) + if isinstance(item, int): + return original_type(self.tensor[item].view(1, -1), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + b = self.tensor[item] + assert b.dim() == 2, \ + f'Indexing on Boxes with {item} failed to return a matrix!' + return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def __len__(self) -> int: + """int: Number of boxes in the current object.""" + return self.tensor.shape[0] + + def __repr__(self) -> str: + """str: Return a string that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, boxes_list: Sequence['BaseInstance3DBoxes'] + ) -> 'BaseInstance3DBoxes': + """Concatenate a list of Boxes into a single Boxes. + + Args: + boxes_list (Sequence[:obj:`BaseInstance3DBoxes`]): List of boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: The concatenated boxes. + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(box, cls) for box in boxes_list) + + # use torch.cat (v.s. layers.cat) + # so the returned boxes never share storage with input + cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0), + box_dim=boxes_list[0].box_dim, + with_yaw=boxes_list[0].with_yaw) + return cat_boxes + + def numpy(self) -> np.ndarray: + """Reload ``numpy`` from self.tensor.""" + return self.tensor.numpy() + + def to(self, device: Union[str, torch.device], *args, + **kwargs) -> 'BaseInstance3DBoxes': + """Convert current boxes to a specific device. + + Args: + device (str or :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the specific + device. + """ + original_type = type(self) + return original_type(self.tensor.to(device, *args, **kwargs), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def cpu(self) -> 'BaseInstance3DBoxes': + """Convert current boxes to cpu device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the cpu device. + """ + original_type = type(self) + return original_type(self.tensor.cpu(), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def cuda(self, *args, **kwargs) -> 'BaseInstance3DBoxes': + """Convert current boxes to cuda device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the cuda device. + """ + original_type = type(self) + return original_type(self.tensor.cuda(*args, **kwargs), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def clone(self) -> 'BaseInstance3DBoxes': + """Clone the boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties as + self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def detach(self) -> 'BaseInstance3DBoxes': + """Detach the boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties as + self. 
+ """ + original_type = type(self) + return original_type(self.tensor.detach(), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + @property + def device(self) -> torch.device: + """torch.device: The device of the boxes are on.""" + return self.tensor.device + + def __iter__(self) -> Iterator[Tensor]: + """Yield a box as a Tensor at a time. + + Returns: + Iterator[Tensor]: A box of shape (box_dim, ). + """ + yield from self.tensor + + @classmethod + def height_overlaps(cls, boxes1: 'BaseInstance3DBoxes', + boxes2: 'BaseInstance3DBoxes') -> Tensor: + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + + Returns: + Tensor: Calculated height overlap of the boxes. + """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), \ + '"boxes1" and "boxes2" should be in the same type, ' \ + f'but got {type(boxes1)} and {type(boxes2)}.' + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + heighest_of_bottom = torch.max(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) + return overlaps_h + + @classmethod + def overlaps(cls, + boxes1: 'BaseInstance3DBoxes', + boxes2: 'BaseInstance3DBoxes', + mode: str = 'iou') -> Tensor: + """Calculate 3D overlaps of two boxes. + + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str): Mode of iou calculation. Defaults to 'iou'. + + Returns: + Tensor: Calculated 3D overlap of the boxes. + """ + raise NotImplementedError + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), \ + '"boxes1" and "boxes2" should be in the same type, ' \ + f'but got {type(boxes1)} and {type(boxes2)}.' + + assert mode in ['iou', 'iof'] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + # height overlap + overlaps_h = cls.height_overlaps(boxes1, boxes2) + + # Restrict the min values of W and H to avoid memory overflow in + # ``box_iou_rotated``. 
+ boxes1_bev, boxes2_bev = boxes1.bev, boxes2.bev + boxes1_bev[:, 2:4] = boxes1_bev[:, 2:4].clamp(min=1e-4) + boxes2_bev[:, 2:4] = boxes2_bev[:, 2:4].clamp(min=1e-4) + + # bev overlap + iou2d = box_iou_rotated(boxes1_bev, boxes2_bev) + areas1 = (boxes1_bev[:, 2] * boxes1_bev[:, 3]).unsqueeze(1).expand( + rows, cols) + areas2 = (boxes2_bev[:, 2] * boxes2_bev[:, 3]).unsqueeze(0).expand( + rows, cols) + overlaps_bev = iou2d * (areas1 + areas2) / (1 + iou2d) + + # 3d overlaps + overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h + + volume1 = boxes1.volume.view(-1, 1) + volume2 = boxes2.volume.view(1, -1) + + if mode == 'iou': + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp(volume1 + volume2 - overlaps_3d, + min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) + + return iou3d + + def new_box( + self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]] + ) -> 'BaseInstance3DBoxes': + """Create a new box object with data. + + The new box and its tensor has the similar properties as self and + self.tensor, respectively. + + Args: + data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to + be copied. + + Returns: + :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the + object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, Tensor) else data.to(self.device) + original_type = type(self) + return original_type(new_tensor, + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def points_in_boxes_part( + self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find the box in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Note: + If a point is enclosed by multiple boxes, the index of the first + box will be returned. + + Returns: + Tensor: The index of the first box that each point is in with shape + (M, ). Default value is -1 (if the point is not enclosed by any + box). + """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + + points_clone = points.clone()[..., :3] + if points_clone.dim() == 2: + points_clone = points_clone.unsqueeze(0) + else: + assert points_clone.dim() == 3 and points_clone.shape[0] == 1 + + boxes = boxes.to(points_clone.device).unsqueeze(0) + box_idx = points_in_boxes_part(points_clone, boxes) + + return box_idx.squeeze(0) + + def points_in_boxes_all(self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find all boxes in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Returns: + Tensor: A tensor indicating whether a point is in a box with shape + (M, T). T is the number of boxes. Denote this tensor as A, it the + m^th point is in the t^th box, then `A[m, t] == 1`, otherwise + `A[m, t] == 0`. 
+ """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + + points_clone = points.clone()[..., :3] + if points_clone.dim() == 2: + points_clone = points_clone.unsqueeze(0) + else: + assert points_clone.dim() == 3 and points_clone.shape[0] == 1 + + boxes = boxes.to(points_clone.device).unsqueeze(0) + box_idxs_of_pts = points_in_boxes_all(points_clone, boxes) + + return box_idxs_of_pts.squeeze(0) + + def points_in_boxes(self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + warnings.warn('DeprecationWarning: points_in_boxes is a deprecated ' + 'method, please consider using points_in_boxes_part.') + return self.points_in_boxes_part(points, boxes_override) + + def points_in_boxes_batch( + self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + warnings.warn('DeprecationWarning: points_in_boxes_batch is a ' + 'deprecated method, please consider using ' + 'points_in_boxes_all.') + return self.points_in_boxes_all(points, boxes_override) diff --git a/embodiedscan/structures/bbox_3d/box_3d_mode.py b/embodiedscan/structures/bbox_3d/box_3d_mode.py new file mode 100644 index 0000000..1c70e30 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/box_3d_mode.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import IntEnum, unique +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import limit_period + + +@unique +class Box3DMode(IntEnum): + """Enum of different ways to represent a box. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in Camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + EULER_CAM = 3 + EULER_DEPTH = 4 + + @staticmethod + def convert( + box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes], + src: 'Box3DMode', + dst: 'Box3DMode', + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + with_yaw: bool = True, + correct_yaw: bool = False + ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]: + """Convert boxes from ``src`` mode to ``dst`` mode. + + Args: + box (Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk + array/tensor. + src (:obj:`Box3DMode`): The source box mode. + dst (:obj:`Box3DMode`): The target box mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. 
+ with_yaw (bool): If ``box`` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + correct_yaw (bool): If the yaw is rotated by rt_mat. + Defaults to False. + + Returns: + Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`: The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + 'Box3DMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 7') + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + if is_Instance3DBoxes: + with_yaw = box.with_yaw + + # convert box from `src` mode to `dst` mode. + x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if with_yaw: + yaw = arr[..., 6:7] + if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(-yaw), + torch.zeros_like(yaw), + torch.sin(-yaw) + ], + dim=1) + else: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = -yaw + elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(-yaw), + torch.zeros_like(yaw), + torch.sin(-yaw) + ], + dim=1) + else: + yaw = -yaw + elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = yaw + np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + if correct_yaw: + yaw_vector = torch.cat([ + torch.cos(yaw), + torch.sin(yaw), + torch.zeros_like(yaw) + ], + dim=1) + else: + yaw = yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + else: # TODO: add transformation between euler boxes + raise NotImplementedError( + f'Conversion from Box3DMode {src} to {dst} ' 
+ 'is not supported yet') + + if not isinstance(rt_mat, Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[..., :3] @ rt_mat.t() + + # Note: we only use rotation in rt_mat + # so don't need to extend yaw_vector + if with_yaw and correct_yaw: + rot_yaw_vector = yaw_vector @ rt_mat[:3, :3].t() + if dst == Box3DMode.CAM: + yaw = torch.atan2(-rot_yaw_vector[:, [2]], rot_yaw_vector[:, + [0]]) + elif dst in [Box3DMode.LIDAR, Box3DMode.DEPTH]: + yaw = torch.atan2(rot_yaw_vector[:, [1]], rot_yaw_vector[:, + [0]]) + yaw = limit_period(yaw, period=np.pi * 2) + + if with_yaw: + remains = arr[..., 7:] + arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1) + else: + remains = arr[..., 6:] + arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Box3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Box3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Box3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type} ' + 'is not supported yet') + return target_type(arr, box_dim=arr.size(-1), with_yaw=with_yaw) + else: + return arr diff --git a/embodiedscan/structures/bbox_3d/cam_box3d.py b/embodiedscan/structures/bbox_3d/cam_box3d.py new file mode 100644 index 0000000..106f3f8 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/cam_box3d.py @@ -0,0 +1,403 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis, yaw2local + + +class CameraInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in CAM coordinates. + + Coordinates in Camera: + + .. code-block:: none + + z front (yaw=-0.5*pi) + / + / + 0 ------> x right (yaw=0) + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. The yaw is 0 at + the positive direction of x axis, and decreases from the positive direction + of x to the positive direction of z. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The boxes + data with shape (N, box_dim). + box_dim (int): Number of the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7. + with_yaw (bool): Whether the box is with yaw rotation. If False, the + value of yaw will be set to 0 as minmax boxes. Defaults to True. + origin (Tuple[float]): Relative position of the box origin. + Defaults to (0.5, 1.0, 0.5). This will guide the box be converted + to (0.5, 1.0, 0.5) mode. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. 
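+
+    Example:
+        >>> import torch
+        >>> # a minimal sketch: a gravity-centered input box is shifted by
+        >>> # the constructor to the default (0.5, 1.0, 0.5) relative origin
+        >>> boxes = CameraInstance3DBoxes(
+        ...     torch.tensor([[0., 0., 5., 1., 2., 1., 0.]]),
+        ...     origin=(0.5, 0.5, 0.5))
+        >>> boxes.bottom_center
+        tensor([[0., 1., 5.]])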
+ """ + YAW_AXIS = 1 + + def __init__( + self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + box_dim: int = 7, + with_yaw: bool = True, + origin: Tuple[float, float, float] = (0.5, 1.0, 0.5) + ) -> None: + if isinstance(tensor, Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does + # not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((-1, box_dim)) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, \ + ('The box dimension must be 2 and the length of the last ' + f'dimension must be {box_dim}, but got boxes with shape ' + f'{tensor.shape}.') + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding 0 as + # a fake yaw and set with_yaw to False + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 1.0, 0.5): + dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def height(self) -> Tensor: + """Tensor: A vector with height of each box in shape (N, ).""" + return self.tensor[:, 4] + + @property + def top_height(self) -> Tensor: + """Tensor: A vector with top height of each box in shape (N, ).""" + # the positive direction is down rather than up + return self.bottom_height - self.height + + @property + def bottom_height(self) -> Tensor: + """Tensor: A vector with bottom height of each box in shape (N, ).""" + return self.tensor[:, 1] + + @property + def local_yaw(self) -> Tensor: + """Tensor: A vector with local yaw of each box in shape (N, ). + local_yaw equals to alpha in kitti, which is commonly used in monocular + 3D object detection task, so only :obj:`CameraInstance3DBoxes` has the + property.""" + yaw = self.yaw + loc = self.gravity_center + local_yaw = yaw2local(yaw, loc) + + return local_yaw + + @property + def gravity_center(self) -> Tensor: + """Tensor: A tensor with center of each box in shape (N, 3).""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] + gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 + return gravity_center + + @property + def corners(self) -> Tensor: + """Convert boxes to corners in clockwise order, in the form of (x0y0z0, + x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0). + + .. code-block:: none + + front z + / + / + (x0, y0, z1) + ----------- + (x1, y0, z1) + /| / | + / | / | + (x0, y0, z0) + ----------- + + (x1, y1, z1) + | / . | / + | / origin | / + (x0, y1, z0) + ----------- + -------> right x + | (x1, y1, z0) + | + v + down y + + Returns: + Tensor: A tensor with 8 corners of each box in shape (N, 8, 3). 
+ """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 1, 0.5) + corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + corners = rotation_3d_in_axis(corners, + self.tensor[:, 6], + axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self) -> Tensor: + """Tensor: 2D BEV box of each box with rotation in XYWHR format, in + shape (N, 5).""" + bev = self.tensor[:, [0, 2, 3, 5, 6]].clone() + # positive direction of the gravity axis + # in cam coord system points to the earth + # so the bev yaw angle needs to be reversed + bev[:, -1] = -bev[:, -1] + return bev + + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. + """ + if not isinstance(angle, Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[2, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + self.tensor[:, 6] += angle + + if points is not None: + if isinstance(points, Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. 
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 2::7] = -self.tensor[:, 2::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (Tensor, np.ndarray, BasePoints)) + if isinstance(points, (Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 0] = -points[:, 0] + elif bev_direction == 'vertical': + points[:, 2] = -points[:, 2] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + @classmethod + def height_overlaps(cls, boxes1: 'CameraInstance3DBoxes', + boxes2: 'CameraInstance3DBoxes') -> Tensor: + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. + + Returns: + Tensor: Calculated height overlap of the boxes. + """ + assert isinstance(boxes1, CameraInstance3DBoxes) + assert isinstance(boxes2, CameraInstance3DBoxes) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + # positive direction of the gravity axis + # in cam coord system points to the earth + heighest_of_bottom = torch.min(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) + return overlaps_h + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + + # TODO: always set correct_yaw=True + return Box3DMode.convert(box=self, + src=Box3DMode.CAM, + dst=dst, + rt_mat=rt_mat, + correct_yaw=correct_yaw) + + def points_in_boxes_part( + self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find the box in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Returns: + Tensor: The index of the first box that each point is in with shape + (M, ). Default value is -1 (if the point is not enclosed by any + box). 
+ """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, + Coord3DMode.CAM, + Coord3DMode.LIDAR, + is_point=False) + + box_idx = super().points_in_boxes_part(points_lidar, boxes_lidar) + return box_idx + + def points_in_boxes_all(self, + points: Tensor, + boxes_override: Optional[Tensor] = None) -> Tensor: + """Find all boxes in which each point is. + + Args: + points (Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions + are (x, y, z) in LiDAR or depth coordinate. + boxes_override (Tensor, optional): Boxes to override `self.tensor`. + Defaults to None. + + Returns: + Tensor: The index of all boxes in which each point is with shape + (M, T). + """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, + Coord3DMode.CAM, + Coord3DMode.LIDAR, + is_point=False) + + box_idx = super().points_in_boxes_all(points_lidar, boxes_lidar) + return box_idx diff --git a/embodiedscan/structures/bbox_3d/coord_3d_mode.py b/embodiedscan/structures/bbox_3d/coord_3d_mode.py new file mode 100644 index 0000000..f10bb1a --- /dev/null +++ b/embodiedscan/structures/bbox_3d/coord_3d_mode.py @@ -0,0 +1,271 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import IntEnum, unique +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from mmdet3d.structures.points import (BasePoints, CameraPoints, DepthPoints, + LiDARPoints) +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode + + +@unique +class Coord3DMode(IntEnum): + """Enum of different ways to represent a box and point cloud. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in Camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(input: Union[Sequence[float], np.ndarray, Tensor, + BaseInstance3DBoxes, BasePoints], + src: Union[Box3DMode, 'Coord3DMode'], + dst: Union[Box3DMode, 'Coord3DMode'], + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + with_yaw: bool = True, + correct_yaw: bool = False, + is_point: bool = True): + """Convert boxes or points from ``src`` mode to ``dst`` mode. + + Args: + input (Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`): Can be a + k-tuple, k-list or an Nxk array/tensor. + src (:obj:`Box3DMode` or :obj:`Coord3DMode`): The source mode. + dst (:obj:`Box3DMode` or :obj:`Coord3DMode`): The target mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. 
+ Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + with_yaw (bool): If ``box`` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + correct_yaw (bool): If the yaw is rotated by rt_mat. + Defaults to False. + is_point (bool): If ``input`` is neither an instance of + :obj:`BaseInstance3DBoxes` nor an instance of + :obj:`BasePoints`, whether or not it is point data. + Defaults to True. + + Returns: + Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes` or :obj:`BasePoints`: The converted box + or points of the same type. + """ + if isinstance(input, BaseInstance3DBoxes): + return Coord3DMode.convert_box(input, + src, + dst, + rt_mat=rt_mat, + with_yaw=with_yaw, + correct_yaw=correct_yaw) + elif isinstance(input, BasePoints): + return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) + elif isinstance(input, (tuple, list, np.ndarray, Tensor)): + if is_point: + return Coord3DMode.convert_point(input, + src, + dst, + rt_mat=rt_mat) + else: + return Coord3DMode.convert_box(input, + src, + dst, + rt_mat=rt_mat, + with_yaw=with_yaw, + correct_yaw=correct_yaw) + else: + raise NotImplementedError + + @staticmethod + def convert_box( + box: Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes], + src: Box3DMode, + dst: Box3DMode, + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + with_yaw: bool = True, + correct_yaw: bool = False + ) -> Union[Sequence[float], np.ndarray, Tensor, BaseInstance3DBoxes]: + """Convert boxes from ``src`` mode to ``dst`` mode. + + Args: + box (Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk + array/tensor. + src (:obj:`Box3DMode`): The source box mode. + dst (:obj:`Box3DMode`): The target box mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + with_yaw (bool): If ``box`` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + correct_yaw (bool): If the yaw is rotated by rt_mat. + Defaults to False. + + Returns: + Sequence[float] or np.ndarray or Tensor or + :obj:`BaseInstance3DBoxes`: The converted box of the same type. + """ + return Box3DMode.convert(box, + src, + dst, + rt_mat=rt_mat, + with_yaw=with_yaw, + correct_yaw=correct_yaw) + + @staticmethod + def convert_point( + point: Union[Sequence[float], np.ndarray, Tensor, BasePoints], + src: 'Coord3DMode', + dst: 'Coord3DMode', + rt_mat: Optional[Union[np.ndarray, Tensor]] = None, + ) -> Union[Sequence[float], np.ndarray, Tensor, BasePoints]: + """Convert points from ``src`` mode to ``dst`` mode. + + Args: + box (Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`): + Can be a k-tuple, k-list or an Nxk array/tensor. + src (:obj:`Coord3DMode`): The source point mode. + dst (:obj:`Coord3DMode`): The target point mode. + rt_mat (np.ndarray or Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. 
This requires a transformation + matrix. + + Returns: + Sequence[float] or np.ndarray or Tensor or :obj:`BasePoints`: The + converted point of the same type. + """ + if src == dst: + return point + + is_numpy = isinstance(point, np.ndarray) + is_InstancePoints = isinstance(point, BasePoints) + single_point = isinstance(point, (list, tuple)) + if single_point: + assert len(point) >= 3, ( + 'Coord3DMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 3') + arr = torch.tensor(point)[None, :] + else: + # avoid modifying the input point + if is_numpy: + arr = torch.from_numpy(np.asarray(point)).clone() + elif is_InstancePoints: + arr = point.tensor.clone() + else: + arr = point.clone() + + # convert point from `src` mode to `dst` mode. + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + else: + raise NotImplementedError( + f'Conversion from Coord3DMode {src} to {dst} ' + 'is not supported yet') + + if not isinstance(rt_mat, Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[..., :3] @ rt_mat.t() + + remains = arr[..., 3:] + arr = torch.cat([xyz[..., :3], remains], dim=-1) + + # convert arr to the original type + original_type = type(point) + if single_point: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_InstancePoints: + if dst == Coord3DMode.CAM: + target_type = CameraPoints + elif dst == Coord3DMode.LIDAR: + target_type = LiDARPoints + elif dst == Coord3DMode.DEPTH: + target_type = DepthPoints + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type} ' + 'is not supported yet') + return target_type(arr, + points_dim=arr.size(-1), + attribute_dims=point.attribute_dims) + else: + return arr diff --git a/embodiedscan/structures/bbox_3d/depth_box3d.py b/embodiedscan/structures/bbox_3d/depth_box3d.py new file mode 100644 index 0000000..bf2f4c1 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/depth_box3d.py @@ -0,0 +1,282 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class DepthInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in DEPTH coordinates. + + Coordinates in Depth: + + .. 
code-block:: none + + up z y front (yaw=0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (yaw=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at + the positive direction of x axis, and increases from the positive direction + of x to the positive direction of y. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + YAW_AXIS = 2 + + @property + def corners(self) -> Tensor: + """Convert boxes to corners in clockwise order, in the form of (x0y0z0, + x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0). + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + + Returns: + Tensor: A tensor with 8 corners of each box in shape (N, 8, 3). + """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, + self.tensor[:, 6], + axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. 
+ """ + if not isinstance(angle, Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + if self.with_yaw: + self.tensor[:, 6] += angle + else: + # for axis-aligned boxes, we take the new + # enclosing axis-aligned boxes after rotation + corners_rot = self.corners @ rot_mat_T + new_x_size = corners_rot[..., 0].max( + dim=1, keepdim=True)[0] - corners_rot[..., 0].min( + dim=1, keepdim=True)[0] + new_y_size = corners_rot[..., 1].max( + dim=1, keepdim=True)[0] - corners_rot[..., 1].min( + dim=1, keepdim=True)[0] + self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) + + if points is not None: + if isinstance(points, Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips the x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (Tensor, np.ndarray, BasePoints)) + if isinstance(points, (Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 0] = -points[:, 0] + elif bev_direction == 'vertical': + points[:, 1] = -points[:, 1] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. 
+ + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert(box=self, + src=Box3DMode.DEPTH, + dst=dst, + rt_mat=rt_mat, + correct_yaw=correct_yaw) + + def enlarged_box( + self, extra_width: Union[float, Tensor]) -> 'DepthInstance3DBoxes': + """Enlarge the length, width and height of boxes. + + Args: + extra_width (float or Tensor): Extra width to enlarge the box. + + Returns: + :obj:`DepthInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def get_surface_line_center(self) -> Tuple[Tensor, Tensor]: + """Compute surface and line center of bounding boxes. + + Returns: + Tuple[Tensor, Tensor]: Surface and line center of bounding boxes. + """ + obj_size = self.dims + center = self.gravity_center.view(-1, 1, 3) + batch_size = center.shape[0] + + rot_sin = torch.sin(-self.yaw) + rot_cos = torch.cos(-self.yaw) + rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) + rot_mat_T[..., 0, 0] = rot_cos + rot_mat_T[..., 0, 1] = -rot_sin + rot_mat_T[..., 1, 0] = rot_sin + rot_mat_T[..., 1, 1] = rot_cos + rot_mat_T[..., 2, 2] = 1 + + # Get the object surface center + offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], + [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) + offset = offset.view(1, 6, 3) / 2 + surface_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( + -1, 3) + + # Get the object line center + offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], + [0, -1, 1], [1, 0, -1], [-1, 0, -1], + [0, 1, -1], [0, -1, -1], [1, 1, 0], + [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) + offset = offset.view(1, 12, 3) / 2 + + line_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( + -1, 3) + + surface_rot = rot_mat_T.repeat(6, 1, 1) + surface_3d = torch.matmul(surface_3d.unsqueeze(-2), + surface_rot).squeeze(-2) + surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d + + line_rot = rot_mat_T.repeat(12, 1, 1) + line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) + line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d + + return surface_center, line_center diff --git a/embodiedscan/structures/bbox_3d/euler_box3d.py b/embodiedscan/structures/bbox_3d/euler_box3d.py new file mode 100644 index 0000000..f328ff2 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/euler_box3d.py @@ -0,0 +1,410 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from pytorch3d.ops import box3d_overlap +from pytorch3d.transforms import euler_angles_to_matrix, matrix_to_euler_angles + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_euler + + +class EulerInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in Depth coordinates. + + Coordinates in Depth: + + .. code-block:: none + + up z y front (alpha=0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (alpha=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of y. 
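Because the stored center of a Depth box is the bottom center, ``enlarged_box`` above grows each size by ``2 * extra_width`` and lowers z by ``extra_width`` so the box expands symmetrically around its gravity center. A tiny numeric sketch with hypothetical values:

import torch

box = torch.tensor([[0., 0., 0., 1., 1., 1., 0.]])  # bottom center (0, 0, 0), unit sizes
extra = 0.1
enlarged = box.clone()
enlarged[:, 3:6] += extra * 2   # each dimension grows by 0.1 on both sides
enlarged[:, 2] -= extra         # bottom center drops, gravity center stays at z = 0.5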
+ Also note that rotation of DepthInstance3DBoxes is counterclockwise, + which is reverse to the definition of the yaw angle (clockwise). + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, alpha, beta, gamma). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, + tensor, + box_dim=9, + with_yaw=True, + origin=(0.5, 0.5, 0.5)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32, + device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 3) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 3 + self.with_yaw = True # TODO + elif tensor.shape[-1] == 7: + assert box_dim == 7 + fake_euler = tensor.new_zeros(tensor.shape[0], 2) + tensor = torch.cat((tensor, fake_euler), dim=-1) + self.box_dim = box_dim + 2 + self.with_yaw = True + else: + assert tensor.shape[-1] == 9 + self.box_dim = box_dim + self.with_yaw = True # TODO + self.tensor = tensor.clone() + + self.origin = origin + if origin != (0.5, 0.5, 0.5): + dst = self.tensor.new_tensor((0.5, 0.5, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + def get_corners(self, tensor1): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + if tensor1.numel() == 0: + return torch.empty([0, 8, 3], device=tensor1.device) + + dims = tensor1[:, 3:6] + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin + assert self.origin == (0.5, 0.5, 0.5), \ + 'self.origin != (0.5, 0.5, 0.5) needs to be checked!' + corners_norm = corners_norm - dims.new_tensor(self.origin) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate + corners = rotation_3d_in_euler(corners, tensor1[:, 6:]) + + corners += tensor1[:, :3].view(-1, 1, 3) + return corners + + @classmethod + def overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate 3D overlaps of two boxes. + + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. 
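``EulerInstance3DBoxes`` stores gravity centers with origin (0.5, 0.5, 0.5); when a different origin is passed, ``__init__`` shifts the centers by ``size * (dst - src)``. A standalone sketch with a hypothetical bottom-center input:

import torch

# hypothetical 9-DoF box given in bottom-center convention, origin = (0.5, 0.5, 0)
raw = torch.tensor([[0., 0., 0., 2., 2., 2., 0., 0., 0.]])
src = raw.new_tensor((0.5, 0.5, 0.0))
dst = raw.new_tensor((0.5, 0.5, 0.5))
stored = raw.clone()
stored[:, :3] += stored[:, 3:6] * (dst - src)  # z center becomes 1.0, the gravity center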
+ boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated 3D overlaps of the boxes. + """ + assert isinstance(boxes1, EulerInstance3DBoxes) + assert isinstance(boxes2, EulerInstance3DBoxes) + assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ + f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' + + assert mode in ['iou'] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + corners1 = boxes1.corners + corners2 = boxes2.corners + _, iou3d = box3d_overlap(corners1, corners2, eps=1e-4) + return iou3d + + @property + def bottom_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + raise NotImplementedError('Not support') + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + return self.tensor[:, :3] + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin + assert self.origin == (0.5, 0.5, 0.5), \ + 'self.origin != (0.5, 0.5, 0.5) needs to be checked!' + corners_norm = corners_norm - dims.new_tensor(self.origin) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate + corners = rotation_3d_in_euler(corners, self.tensor[:, 6:]) + + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def transform(self, matrix): + if self.tensor.shape[0] == 0: + return + if not isinstance(matrix, torch.Tensor): + matrix = self.tensor.new_tensor(matrix) + points = self.tensor[:, :3] + constant = points.new_ones(points.shape[0], 1) + points_extend = torch.concat([points, constant], dim=-1) + points_trans = torch.matmul(points_extend, matrix.transpose(-2, + -1))[:, :3] + + size = self.tensor[:, 3:6] + + # angle_delta = matrix_to_euler_angles(matrix[:3,:3], 'ZXY') + # angle = self.tensor[:,6:] + angle_delta + ori_matrix = euler_angles_to_matrix(self.tensor[:, 6:], 'ZXY') + rot_matrix = matrix[:3, :3].expand_as(ori_matrix) + final = torch.bmm(rot_matrix, ori_matrix) + angle = matrix_to_euler_angles(final, 'ZXY') + + self.tensor = torch.cat([points_trans, size, angle], dim=-1) + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the + rotation matrix ``rot_mat_T``. 
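``transform()`` composes an extra rotation with the stored ZXY Euler angles by converting both to matrices, multiplying, and converting back. A minimal sketch of that composition for a pure yaw, assuming pytorch3d is available (it is already imported by this module) and using hypothetical angles:

import torch
from pytorch3d.transforms import euler_angles_to_matrix, matrix_to_euler_angles

angles = torch.tensor([[0.2, 0.0, 0.0]])                      # hypothetical box orientation
extra = euler_angles_to_matrix(torch.tensor([0.3, 0.0, 0.0]), 'ZXY')
ori = euler_angles_to_matrix(angles, 'ZXY')
combined = torch.bmm(extra.expand_as(ori), ori)
print(matrix_to_euler_angles(combined, 'ZXY'))                # ~ [[0.5, 0.0, 0.0]]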
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + + if angle.numel() == 1: # only given yaw angle for rotation + angle = self.tensor.new_tensor([0., 0., angle]) + rot_matrix = euler_angles_to_matrix(angle, 'ZXY') + elif angle.numel() == 3: + rot_matrix = euler_angles_to_matrix(angle, 'ZXY') + elif angle.shape == torch.Size([3, 3]): + rot_matrix = angle + else: + raise NotImplementedError + + rot_mat_T = rot_matrix.T + transform_matrix = torch.eye(4) + transform_matrix[:3, :3] = rot_matrix + self.transform(transform_matrix) + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip(self, direction='X', points=None): + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str, optional): Flip direction + (horizontal or vertical). Defaults to 'horizontal'. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert direction in ['X', 'Y', 'Z'] + if direction == 'X': + self.tensor[:, 0] = -self.tensor[:, 0] + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + self.tensor[:, 8] = -self.tensor[:, 8] + elif direction == 'Y': + self.tensor[:, 1] = -self.tensor[:, 1] + self.tensor[:, 6] = -self.tensor[:, 6] + self.tensor[:, 7] = -self.tensor[:, 7] + np.pi + elif direction == 'Z': + self.tensor[:, 2] = -self.tensor[:, 2] + self.tensor[:, 7] = -self.tensor[:, 7] + self.tensor[:, 8] = -self.tensor[:, 8] + np.pi + + if points is not None: + # assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if direction == 'X': + points[:, 0] = -points[:, 0] + elif direction == 'Y': + points[:, 1] = -points[:, 1] + elif direction == 'Z': + points[:, 2] = -points[:, 2] + else: + points.flip(direction) + return points + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`DepthInstance3DBoxes`: + The converted box of the same type in the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + assert dst == Box3DMode.EULER_DEPTH + return self + + def enlarged_box(self, extra_width): + """Enlarge the length, width and height boxes. + + Args: + extra_width (float | torch.Tensor): Extra width to enlarge the box. + + Returns: + :obj:`DepthInstance3DBoxes`: Enlarged boxes. + """ + raise NotImplementedError('enlarged box') + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def get_surface_line_center(self): + """Compute surface and line center of bounding boxes. 
+ + Returns: + torch.Tensor: Surface and line center of bounding boxes. + """ + raise NotImplementedError('surface line center') + obj_size = self.dims + center = self.gravity_center.view(-1, 1, 3) + batch_size = center.shape[0] + + rot_sin = torch.sin(-self.yaw) + rot_cos = torch.cos(-self.yaw) + rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) + rot_mat_T[..., 0, 0] = rot_cos + rot_mat_T[..., 0, 1] = -rot_sin + rot_mat_T[..., 1, 0] = rot_sin + rot_mat_T[..., 1, 1] = rot_cos + rot_mat_T[..., 2, 2] = 1 + + # Get the object surface center + offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], + [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) + offset = offset.view(1, 6, 3) / 2 + surface_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( + -1, 3) + + # Get the object line center + offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], + [0, -1, 1], [1, 0, -1], [-1, 0, -1], + [0, 1, -1], [0, -1, -1], [1, 1, 0], + [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) + offset = offset.view(1, 12, 3) / 2 + + line_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( + -1, 3) + + surface_rot = rot_mat_T.repeat(6, 1, 1) + surface_3d = torch.matmul(surface_3d.unsqueeze(-2), + surface_rot).squeeze(-2) + surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d + + line_rot = rot_mat_T.repeat(12, 1, 1) + line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) + line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d + + return surface_center, line_center diff --git a/embodiedscan/structures/bbox_3d/lidar_box3d.py b/embodiedscan/structures/bbox_3d/lidar_box3d.py new file mode 100644 index 0000000..438a200 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/lidar_box3d.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmdet3d.structures.points import BasePoints +from torch import Tensor + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class LiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=0) + ^ ^ + | / + | / + (yaw=0.5*pi) left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at + the positive direction of x axis, and increases from the positive direction + of x to the positive direction of y. + + Attributes: + tensor (Tensor): Float matrix with shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box. Each row is + (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + YAW_AXIS = 2 + + @property + def corners(self) -> Tensor: + """Convert boxes to corners in clockwise order, in the form of (x0y0z0, + x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0). + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y <------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + + Returns: + Tensor: A tensor with 8 corners of each box in shape (N, 8, 3). 
+ """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), + axis=1)).to(device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, + self.tensor[:, 6], + axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def rotate( + self, + angle: Union[Tensor, np.ndarray, float], + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tuple[Tensor, Tensor], Tuple[np.ndarray, np.ndarray], Tuple[ + BasePoints, Tensor], None]: + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (Tensor or np.ndarray or float): Rotation angle or rotation + matrix. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns None, + otherwise it returns the rotated points and the rotation matrix + ``rot_mat_T``. + """ + if not isinstance(angle, Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + else: + return rot_mat_T + + def flip( + self, + bev_direction: str = 'horizontal', + points: Optional[Union[Tensor, np.ndarray, BasePoints]] = None + ) -> Union[Tensor, np.ndarray, BasePoints, None]: + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Direction by which to flip. Can be chosen from + 'horizontal' and 'vertical'. Defaults to 'horizontal'. + points (Tensor or np.ndarray or :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + Tensor or np.ndarray or :obj:`BasePoints` or None: When ``points`` + is None, the function returns None, otherwise it returns the + flipped points. 
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + elif bev_direction == 'vertical': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + + if points is not None: + assert isinstance(points, (Tensor, np.ndarray, BasePoints)) + if isinstance(points, (Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 1] = -points[:, 1] + elif bev_direction == 'vertical': + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, np.ndarray]] = None, + correct_yaw: bool = False) -> 'BaseInstance3DBoxes': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Box mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + correct_yaw (bool): Whether to convert the yaw angle to the target + coordinate. Defaults to False. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type in + the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert(box=self, + src=Box3DMode.LIDAR, + dst=dst, + rt_mat=rt_mat, + correct_yaw=correct_yaw) + + def enlarged_box( + self, extra_width: Union[float, Tensor]) -> 'LiDARInstance3DBoxes': + """Enlarge the length, width and height of boxes. + + Args: + extra_width (float or Tensor): Extra width to enlarge the box. + + Returns: + :obj:`LiDARInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) diff --git a/embodiedscan/structures/bbox_3d/utils.py b/embodiedscan/structures/bbox_3d/utils.py new file mode 100644 index 0000000..3810051 --- /dev/null +++ b/embodiedscan/structures/bbox_3d/utils.py @@ -0,0 +1,482 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from logging import warning +from typing import Tuple, Union + +import numpy as np +import torch +from mmdet3d.utils.array_converter import array_converter +from pytorch3d.transforms import euler_angles_to_matrix +from torch import Tensor + + +@array_converter(apply_to=('val', )) +def limit_period(val: Union[np.ndarray, Tensor], + offset: float = 0.5, + period: float = np.pi) -> Union[np.ndarray, Tensor]: + """Limit the value into a period for periodic function. + + Args: + val (np.ndarray or Tensor): The value to be converted. + offset (float): Offset to set the value range. Defaults to 0.5. + period (float): Period of the value. Defaults to np.pi. + + Returns: + np.ndarray or Tensor: Value in the range of + [-offset * period, (1-offset) * period]. + """ + limited_val = val - torch.floor(val / period + offset) * period + return limited_val + + +@array_converter(apply_to=('points', 'angles')) +def rotation_3d_in_euler(points, angles, return_mat=False, clockwise=False): + """Rotate points by angles according to axis. + + Args: + points (np.ndarray | torch.Tensor | list | tuple ): + Points of shape (N, M, 3). 
+ angles (np.ndarray | torch.Tensor | list | tuple): + Vector of angles in shape (N, 3) + return_mat: Whether or not return the rotation matrix (transposed). + Defaults to False. + clockwise: Whether the rotation is clockwise. Defaults to False. + + Raises: + ValueError: when the axis is not in range [0, 1, 2], it will + raise value error. + + Returns: + (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3). + """ + batch_free = len(points.shape) == 2 + if batch_free: + points = points[None] + + if len(angles.shape) == 1: + angles = angles.expand(points.shape[:1] + (3, )) + # angles = torch.full(points.shape[:1], angles) + + assert len(points.shape) == 3 and len(angles.shape) == 2 \ + and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \ + f'angles: {points.shape}, {angles.shape}' + + assert points.shape[-1] in [2, 3], \ + f'Points size should be 2 or 3 instead of {points.shape[-1]}' + + rot_mat_T = euler_angles_to_matrix(angles, 'ZXY') # N, 3,3 + rot_mat_T = rot_mat_T.transpose(-2, -1) + + if clockwise: + raise NotImplementedError('clockwise') + + if points.shape[0] == 0: + points_new = points + else: + points_new = torch.bmm(points, rot_mat_T) + + if batch_free: + points_new = points_new.squeeze(0) + + if return_mat: + if batch_free: + rot_mat_T = rot_mat_T.squeeze(0) + return points_new, rot_mat_T + else: + return points_new + + +@array_converter(apply_to=('points', 'angles')) +def rotation_3d_in_axis( + points: Union[np.ndarray, Tensor], + angles: Union[np.ndarray, Tensor, float], + axis: int = 0, + return_mat: bool = False, + clockwise: bool = False +) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[Tensor, Tensor], np.ndarray, + Tensor]: + """Rotate points by angles according to axis. + + Args: + points (np.ndarray or Tensor): Points with shape (N, M, 3). + angles (np.ndarray or Tensor or float): Vector of angles with shape + (N, ). + axis (int): The axis to be rotated. Defaults to 0. + return_mat (bool): Whether or not to return the rotation matrix + (transposed). Defaults to False. + clockwise (bool): Whether the rotation is clockwise. Defaults to False. + + Raises: + ValueError: When the axis is not in range [-3, -2, -1, 0, 1, 2], it + will raise ValueError. + + Returns: + Tuple[np.ndarray, np.ndarray] or Tuple[Tensor, Tensor] or np.ndarray or + Tensor: Rotated points with shape (N, M, 3) and rotation matrix with + shape (N, 3, 3). 
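The core of ``rotation_3d_in_euler`` above is a batched multiplication with the transposed ZXY rotation matrix; for a pure yaw it reduces to a counterclockwise rotation about z. A sketch assuming pytorch3d is available, with hypothetical inputs:

import math
import torch
from pytorch3d.transforms import euler_angles_to_matrix

points = torch.tensor([[[1.0, 0.0, 0.0]]])              # shape (N=1, M=1, 3)
angles = torch.tensor([[math.pi / 2, 0.0, 0.0]])         # yaw only
rot_mat_T = euler_angles_to_matrix(angles, 'ZXY').transpose(-2, -1)
print(torch.bmm(points, rot_mat_T))                      # ~ [[[0., 1., 0.]]]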
+ """ + batch_free = len(points.shape) == 2 + if batch_free: + points = points[None] + + if isinstance(angles, float) or len(angles.shape) == 0: + angles = torch.full(points.shape[:1], angles) + + assert len(points.shape) == 3 and len(angles.shape) == 1 and \ + points.shape[0] == angles.shape[0], 'Incorrect shape of points ' \ + f'angles: {points.shape}, {angles.shape}' + + assert points.shape[-1] in [2, 3], \ + f'Points size should be 2 or 3 instead of {points.shape[-1]}' + + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + + if points.shape[-1] == 3: + if axis == 1 or axis == -2: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]) + ]) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, rot_sin, zeros]), + torch.stack([-rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]) + ]) + elif axis == 0 or axis == -3: + rot_mat_T = torch.stack([ + torch.stack([ones, zeros, zeros]), + torch.stack([zeros, rot_cos, rot_sin]), + torch.stack([zeros, -rot_sin, rot_cos]) + ]) + else: + raise ValueError( + f'axis should in range [-3, -2, -1, 0, 1, 2], got {axis}') + else: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, rot_sin]), + torch.stack([-rot_sin, rot_cos]) + ]) + + if clockwise: + rot_mat_T = rot_mat_T.transpose(0, 1) + + if points.shape[0] == 0: + points_new = points + else: + points_new = torch.einsum('aij,jka->aik', points, rot_mat_T) + + if batch_free: + points_new = points_new.squeeze(0) + + if return_mat: + rot_mat_T = torch.einsum('jka->ajk', rot_mat_T) + if batch_free: + rot_mat_T = rot_mat_T.squeeze(0) + return points_new, rot_mat_T + else: + return points_new + + +@array_converter(apply_to=('boxes_xywhr', )) +def xywhr2xyxyr( + boxes_xywhr: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]: + """Convert a rotated boxes in XYWHR format to XYXYR format. + + Args: + boxes_xywhr (Tensor or np.ndarray): Rotated boxes in XYWHR format. + + Returns: + Tensor or np.ndarray: Converted boxes in XYXYR format. + """ + boxes = torch.zeros_like(boxes_xywhr) + half_w = boxes_xywhr[..., 2] / 2 + half_h = boxes_xywhr[..., 3] / 2 + + boxes[..., 0] = boxes_xywhr[..., 0] - half_w + boxes[..., 1] = boxes_xywhr[..., 1] - half_h + boxes[..., 2] = boxes_xywhr[..., 0] + half_w + boxes[..., 3] = boxes_xywhr[..., 1] + half_h + boxes[..., 4] = boxes_xywhr[..., 4] + return boxes + + +def get_box_type(box_type: str) -> Tuple[type, int]: + """Get the type and mode of box structure. + + Args: + box_type (str): The type of box structure. The valid value are "LiDAR", + "Camera" and "Depth". + + Raises: + ValueError: A ValueError is raised when ``box_type`` does not belong to + the three valid types. + + Returns: + tuple: Box type and box mode. 
+ """ + from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes, + DepthInstance3DBoxes, EulerCameraInstance3DBoxes, + EulerDepthInstance3DBoxes, LiDARInstance3DBoxes) + box_type_lower = box_type.lower() + if box_type_lower == 'lidar': + box_type_3d = LiDARInstance3DBoxes + box_mode_3d = Box3DMode.LIDAR + elif box_type_lower == 'camera': + box_type_3d = CameraInstance3DBoxes + box_mode_3d = Box3DMode.CAM + elif box_type_lower == 'depth': + box_type_3d = DepthInstance3DBoxes + box_mode_3d = Box3DMode.DEPTH + elif box_type_lower == 'euler-depth': + box_type_3d = EulerDepthInstance3DBoxes + box_mode_3d = Box3DMode.EULER_DEPTH + elif box_type_lower == 'euler-camera': + box_type_3d = EulerCameraInstance3DBoxes + box_mode_3d = Box3DMode.EULER_CAM + else: + raise ValueError( + 'Only "box_type" of "camera", "lidar", "depth", "euler"' + f' are supported, got {box_type}') + + return box_type_3d, box_mode_3d + + +@array_converter(apply_to=('points_3d', 'proj_mat')) +def points_cam2img(points_3d: Union[Tensor, np.ndarray], + proj_mat: Union[Tensor, np.ndarray], + with_depth: bool = False) -> Union[Tensor, np.ndarray]: + """Project points in camera coordinates to image coordinates. + + Args: + points_3d (Tensor or np.ndarray): Points in shape (N, 3). + proj_mat (Tensor or np.ndarray): Transformation matrix between + coordinates. + with_depth (bool): Whether to keep depth in the output. + Defaults to False. + + Returns: + Tensor or np.ndarray: Points in image coordinates with shape [N, 2] if + ``with_depth=False``, else [N, 3]. + """ + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + + assert len(proj_mat.shape) == 2, \ + 'The dimension of the projection matrix should be 2 ' \ + f'instead of {len(proj_mat.shape)}.' + d1, d2 = proj_mat.shape[:2] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or \ + (d1 == 4 and d2 == 4), 'The shape of the projection matrix ' \ + f'({d1}*{d2}) is not supported.' + if d1 == 3: + proj_mat_expanded = torch.eye(4, + device=proj_mat.device, + dtype=proj_mat.dtype) + proj_mat_expanded[:d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + # previous implementation use new_zeros, new_one yields better results + points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) + + point_2d = points_4 @ proj_mat.T + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + + if with_depth: + point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + + return point_2d_res + + +@array_converter(apply_to=('points_3d', 'proj_mat')) +def batch_points_cam2img(points_3d, proj_mat, with_depth=False): + """Project points in camera coordinates to image coordinates. + + Args: + points_3d (torch.Tensor | np.ndarray): Points in shape (N, D, 3) + proj_mat (torch.Tensor | np.ndarray): + Transformation matrix between coordinates. + with_depth (bool, optional): Whether to keep depth in the output. + Defaults to False. + + Returns: + (torch.Tensor | np.ndarray): Points in image coordinates, + with shape [N, D, 2] if `with_depth=False`, else [N, D, 3]. + """ + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + + assert len(proj_mat.shape) == 3, 'The dimension of the projection'\ + f' matrix should be 2 instead of {len(proj_mat.shape)}.' + d0, d1, d2 = proj_mat.shape[:3] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( + d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ + f' ({d1}*{d2}) is not supported.' 
+ if d1 == 3: + proj_mat_expanded = torch.eye(4, + device=proj_mat.device, + dtype=proj_mat.dtype) + proj_mat_expanded = proj_mat_expanded[None, :, :].expand(d0, -1, -1) + proj_mat_expanded[:, :d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + # previous implementation use new_zeros, new_one yields better results + points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) + # do the batch wise operation + point_2d = torch.bmm(points_4, proj_mat.permute(0, 2, 1)) + # point_2d = points_4 @ proj_mat.T + + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3].clamp(min=1e-3) + + if with_depth: + point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + + return point_2d_res + + +@array_converter(apply_to=('points', 'cam2img')) +def points_img2cam( + points: Union[Tensor, np.ndarray], + cam2img: Union[Tensor, np.ndarray]) -> Union[Tensor, np.ndarray]: + """Project points in image coordinates to camera coordinates. + + Args: + points (Tensor or np.ndarray): 2.5D points in 2D images with shape + [N, 3], 3 corresponds with x, y in the image and depth. + cam2img (Tensor or np.ndarray): Camera intrinsic matrix. The shape can + be [3, 3], [3, 4] or [4, 4]. + + Returns: + Tensor or np.ndarray: Points in 3D space with shape [N, 3], 3 + corresponds with x, y, z in 3D space. + """ + assert cam2img.shape[0] <= 4 + assert cam2img.shape[1] <= 4 + assert points.shape[1] == 3 + + xys = points[:, :2] + depths = points[:, 2].view(-1, 1) + unnormed_xys = torch.cat([xys * depths, depths], dim=1) + + pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device) + pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img + inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1) + + # Do operation in homogeneous coordinates. + num_points = unnormed_xys.shape[0] + homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1) + points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3] + + return points3D + + +def mono_cam_box2vis(cam_box): + """This is a post-processing function on the bboxes from Mono-3D task. If + we want to perform projection visualization, we need to: + + 1. rotate the box along x-axis for np.pi / 2 (roll) + 2. change orientation from local yaw to global yaw + 3. convert yaw by (np.pi / 2 - yaw) + + After applying this function, we can project and draw it on 2D images. + + Args: + cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate + system before conversion. Could be gt bbox loaded from dataset or + network prediction output. + + Returns: + :obj:`CameraInstance3DBoxes`: Box after conversion. + """ + warning.warn('DeprecationWarning: The hack of yaw and dimension in the ' + 'monocular 3D detection on nuScenes has been removed. The ' + 'function mono_cam_box2vis will be deprecated.') + from .cam_box3d import CameraInstance3DBoxes + assert isinstance(cam_box, CameraInstance3DBoxes), \ + 'input bbox should be CameraInstance3DBoxes!' 
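``points_img2cam`` above inverts that projection: the pixel coordinates are un-normalized by depth and multiplied by the transposed inverse of the padded intrinsic. Back-projecting the point from the previous sketch recovers the original camera-frame coordinates:

import torch

cam2img = torch.tensor([[500., 0., 320.], [0., 500., 240.], [0., 0., 1.]])
uvd = torch.tensor([[370.0, 340.0, 10.0]])             # (u, v, depth)
unnormed = torch.cat([uvd[:, :2] * uvd[:, 2:3], uvd[:, 2:3]], dim=1)
pad = torch.eye(4)
pad[:3, :3] = cam2img
homo = torch.cat([unnormed, uvd.new_ones(1, 1)], dim=1)
print((homo @ torch.inverse(pad).transpose(0, 1))[:, :3])   # ~ [[1., 2., 10.]]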
+ loc = cam_box.gravity_center + dim = cam_box.dims + yaw = cam_box.yaw + feats = cam_box.tensor[:, 7:] + # rotate along x-axis for np.pi / 2 + # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa + dim[:, [1, 2]] = dim[:, [2, 1]] + # change local yaw to global yaw for visualization + # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa + yaw += torch.atan2(loc[:, 0], loc[:, 2]) + # convert yaw by (-yaw - np.pi / 2) + # this is because mono 3D box class such as `NuScenesBox` has different + # definition of rotation with our `CameraInstance3DBoxes` + yaw = -yaw - np.pi / 2 + cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) + cam_box = CameraInstance3DBoxes(cam_box, + box_dim=cam_box.shape[-1], + origin=(0.5, 0.5, 0.5)) + + return cam_box + + +def get_proj_mat_by_coord_type(img_meta: dict, coord_type: str) -> Tensor: + """Obtain image features using points. + + Args: + img_meta (dict): Meta information. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Can be case- + insensitive. + + Returns: + Tensor: Transformation matrix. + """ + coord_type = coord_type.upper() + mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'} + assert coord_type in mapping.keys() + return img_meta[mapping[coord_type]] + + +def yaw2local(yaw: Tensor, loc: Tensor) -> Tensor: + """Transform global yaw to local yaw (alpha in kitti) in camera + coordinates, ranges from -pi to pi. + + Args: + yaw (Tensor): A vector with local yaw of each box in shape (N, ). + loc (Tensor): Gravity center of each box in shape (N, 3). + + Returns: + Tensor: Local yaw (alpha in kitti). + """ + local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2]) + larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False) + small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + local_yaw[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + local_yaw[small_idx] += 2 * np.pi + + return local_yaw + + +def get_lidar2img(cam2img: Tensor, lidar2cam: Tensor) -> Tensor: + """Get the projection matrix of lidar2img. + + Args: + cam2img (torch.Tensor): A 3x3 or 4x4 projection matrix. + lidar2cam (torch.Tensor): A 3x3 or 4x4 projection matrix. + + Returns: + Tensor: Transformation matrix with shape 4x4. + """ + if cam2img.shape == (3, 3): + temp = cam2img.new_zeros(4, 4) + temp[:3, :3] = cam2img + temp[3, 3] = 1 + cam2img = temp + + if lidar2cam.shape == (3, 3): + temp = lidar2cam.new_zeros(4, 4) + temp[:3, :3] = lidar2cam + temp[3, 3] = 1 + lidar2cam = temp + return torch.matmul(cam2img, lidar2cam) diff --git a/embodiedscan/structures/det3d_data_sample.py b/embodiedscan/structures/det3d_data_sample.py new file mode 100644 index 0000000..1081fc3 --- /dev/null +++ b/embodiedscan/structures/det3d_data_sample.py @@ -0,0 +1,237 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import torch +from mmdet.structures import DetDataSample +from mmengine.structures import InstanceData, PixelData + +from .point_data import PointData + + +class Det3DDataSample(DetDataSample): + """A data structure interface of MMDetection3D. They are used as interfaces + between different components. + + The attributes in ``Det3DDataSample`` are divided into several parts: + + - ``proposals`` (InstanceData): Region proposals used in two-stage + detectors. 
+ - ``ignored_instances`` (InstanceData): Instances to be ignored during + training/testing. + - ``gt_instances_3d`` (InstanceData): Ground truth of 3D instance + annotations. + - ``gt_instances`` (InstanceData): Ground truth of 2D instance + annotations. + - ``pred_instances_3d`` (InstanceData): 3D instances of model + predictions. + - For point-cloud 3D object detection task whose input modality is + `use_lidar=True, use_camera=False`, the 3D predictions results are + saved in `pred_instances_3d`. + - For vision-only (monocular/multi-view) 3D object detection task + whose input modality is `use_lidar=False, use_camera=True`, the 3D + predictions are saved in `pred_instances_3d`. + - ``pred_instances`` (InstanceData): 2D instances of model predictions. + - For multi-modality 3D detection task whose input modality is + `use_lidar=True, use_camera=True`, the 2D predictions are saved in + `pred_instances`. + - ``pts_pred_instances_3d`` (InstanceData): 3D instances of model + predictions based on point cloud. + - For multi-modality 3D detection task whose input modality is + `use_lidar=True, use_camera=True`, the 3D predictions based on + point cloud are saved in `pts_pred_instances_3d` to distinguish + with `img_pred_instances_3d` which based on image. + - ``img_pred_instances_3d`` (InstanceData): 3D instances of model + predictions based on image. + - For multi-modality 3D detection task whose input modality is + `use_lidar=True, use_camera=True`, the 3D predictions based on + image are saved in `img_pred_instances_3d` to distinguish with + `pts_pred_instances_3d` which based on point cloud. + - ``gt_pts_seg`` (PointData): Ground truth of point cloud segmentation. + - ``pred_pts_seg`` (PointData): Prediction of point cloud segmentation. + - ``eval_ann_info`` (dict or None): Raw annotation, which will be + passed to evaluator and do the online evaluation. + + Examples: + >>> import torch + >>> from mmengine.structures import InstanceData + + >>> from mmdet3d.structures import Det3DDataSample + >>> from mmdet3d.structures.bbox_3d import BaseInstance3DBoxes + + >>> data_sample = Det3DDataSample() + >>> meta_info = dict( + ... img_shape=(800, 1196, 3), + ... pad_shape=(800, 1216, 3)) + >>> gt_instances_3d = InstanceData(metainfo=meta_info) + >>> gt_instances_3d.bboxes_3d = BaseInstance3DBoxes(torch.rand((5, 7))) + >>> gt_instances_3d.labels_3d = torch.randint(0, 3, (5,)) + >>> data_sample.gt_instances_3d = gt_instances_3d + >>> assert 'img_shape' in data_sample.gt_instances_3d.metainfo_keys() + >>> len(data_sample.gt_instances_3d) + 5 + >>> print(data_sample) + + ) at 0x7f7e2a0e8640> + >>> pred_instances = InstanceData(metainfo=meta_info) + >>> pred_instances.bboxes = torch.rand((5, 4)) + >>> pred_instances.scores = torch.rand((5, )) + >>> data_sample = Det3DDataSample(pred_instances=pred_instances) + >>> assert 'pred_instances' in data_sample + + >>> pred_instances_3d = InstanceData(metainfo=meta_info) + >>> pred_instances_3d.bboxes_3d = BaseInstance3DBoxes( + ... torch.rand((5, 7))) + >>> pred_instances_3d.scores_3d = torch.rand((5, )) + >>> pred_instances_3d.labels_3d = torch.rand((5, )) + >>> data_sample = Det3DDataSample(pred_instances_3d=pred_instances_3d) + >>> assert 'pred_instances_3d' in data_sample + + >>> data_sample = Det3DDataSample() + >>> gt_instances_3d_data = dict( + ... bboxes_3d=BaseInstance3DBoxes(torch.rand((2, 7))), + ... 
labels_3d=torch.rand(2)) + >>> gt_instances_3d = InstanceData(**gt_instances_3d_data) + >>> data_sample.gt_instances_3d = gt_instances_3d + >>> assert 'gt_instances_3d' in data_sample + >>> assert 'bboxes_3d' in data_sample.gt_instances_3d + + >>> from mmdet3d.structures import PointData + >>> data_sample = Det3DDataSample() + >>> gt_pts_seg_data = dict( + ... pts_instance_mask=torch.rand(2), + ... pts_semantic_mask=torch.rand(2)) + >>> data_sample.gt_pts_seg = PointData(**gt_pts_seg_data) + >>> print(data_sample) + + ) at 0x7f7e29ff0d60> + """ # noqa: E501 + + @property + def gt_instances_3d(self) -> InstanceData: + return self._gt_instances_3d + + @gt_instances_3d.setter + def gt_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_gt_instances_3d', dtype=InstanceData) + + @gt_instances_3d.deleter + def gt_instances_3d(self) -> None: + del self._gt_instances_3d + + @property + def pred_instances_3d(self) -> InstanceData: + return self._pred_instances_3d + + @pred_instances_3d.setter + def pred_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_pred_instances_3d', dtype=InstanceData) + + @pred_instances_3d.deleter + def pred_instances_3d(self) -> None: + del self._pred_instances_3d + + @property + def pts_pred_instances_3d(self) -> InstanceData: + return self._pts_pred_instances_3d + + @pts_pred_instances_3d.setter + def pts_pred_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_pts_pred_instances_3d', dtype=InstanceData) + + @pts_pred_instances_3d.deleter + def pts_pred_instances_3d(self) -> None: + del self._pts_pred_instances_3d + + @property + def img_pred_instances_3d(self) -> InstanceData: + return self._img_pred_instances_3d + + @img_pred_instances_3d.setter + def img_pred_instances_3d(self, value: InstanceData) -> None: + self.set_field(value, '_img_pred_instances_3d', dtype=InstanceData) + + @img_pred_instances_3d.deleter + def img_pred_instances_3d(self) -> None: + del self._img_pred_instances_3d + + @property + def gt_pts_seg(self) -> PointData: + return self._gt_pts_seg + + @gt_pts_seg.setter + def gt_pts_seg(self, value: PointData) -> None: + self.set_field(value, '_gt_pts_seg', dtype=PointData) + + @gt_pts_seg.deleter + def gt_pts_seg(self) -> None: + del self._gt_pts_seg + + @property + def pred_pts_seg(self) -> PointData: + return self._pred_pts_seg + + @pred_pts_seg.setter + def pred_pts_seg(self, value: PointData) -> None: + self.set_field(value, '_pred_pts_seg', dtype=PointData) + + @pred_pts_seg.deleter + def pred_pts_seg(self) -> None: + del self._pred_pts_seg + + @property + def gt_depth_map(self) -> PixelData: + return self._gt_depth_map + + @gt_depth_map.setter + def gt_depth_map(self, value: PixelData) -> None: + self.set_field(value, '_gt_depth_map', dtype=PixelData) + + @gt_depth_map.deleter + def gt_depth_map(self) -> None: + del self._gt_depth_map + + @property + def pred_depth_map(self) -> PixelData: + return self._pred_depth_map + + @pred_depth_map.setter + def pred_depth_map(self, value: PixelData) -> None: + self.set_field(value, '_pred_depth_map', dtype=PixelData) + + @pred_depth_map.deleter + def pred_depth_map(self) -> None: + del self._pred_depth_map + + +SampleList = List[Det3DDataSample] +OptSampleList = Optional[SampleList] +ForwardResults = Union[Dict[str, torch.Tensor], List[Det3DDataSample], + Tuple[torch.Tensor], torch.Tensor] diff --git a/embodiedscan/structures/ops/__init__.py b/embodiedscan/structures/ops/__init__.py new file mode 100644 index 0000000..d71ec30 --- 
/dev/null +++ b/embodiedscan/structures/ops/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# yapf:disable +from .box_np_ops import (box2d_to_corner_jit, box3d_to_bbox, + box_camera_to_lidar, boxes3d_to_corners3d_lidar, + camera_to_lidar, center_to_corner_box2d, + center_to_corner_box3d, center_to_minmax_2d, + corner_to_standup_nd_jit, corner_to_surfaces_3d, + corner_to_surfaces_3d_jit, corners_nd, + create_anchors_3d_range, depth_to_lidar_points, + depth_to_points, get_frustum, iou_jit, + minmax_to_corner_2d, points_in_convex_polygon_3d_jit, + points_in_convex_polygon_jit, points_in_rbbox, + projection_matrix_to_CRT_kitti, rbbox2d_to_near_bbox, + remove_outside_points, rotation_points_single_angle, + surface_equ_3d) +# yapf:enable +from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, + BboxOverlapsNearest3D, + axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) +from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back + +__all__ = [ + 'box2d_to_corner_jit', 'box3d_to_bbox', 'box_camera_to_lidar', + 'boxes3d_to_corners3d_lidar', 'camera_to_lidar', 'center_to_corner_box2d', + 'center_to_corner_box3d', 'center_to_minmax_2d', + 'corner_to_standup_nd_jit', 'corner_to_surfaces_3d', + 'corner_to_surfaces_3d_jit', 'corners_nd', 'create_anchors_3d_range', + 'depth_to_lidar_points', 'depth_to_points', 'get_frustum', 'iou_jit', + 'minmax_to_corner_2d', 'points_in_convex_polygon_3d_jit', + 'points_in_convex_polygon_jit', 'points_in_rbbox', + 'projection_matrix_to_CRT_kitti', 'rbbox2d_to_near_bbox', + 'remove_outside_points', 'rotation_points_single_angle', 'surface_equ_3d', + 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', + 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', + 'axis_aligned_bbox_overlaps_3d', 'bbox3d_mapping_back', 'bbox3d2roi', + 'bbox3d2result' +] diff --git a/embodiedscan/structures/ops/box_np_ops.py b/embodiedscan/structures/ops/box_np_ops.py new file mode 100644 index 0000000..9189103 --- /dev/null +++ b/embodiedscan/structures/ops/box_np_ops.py @@ -0,0 +1,838 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# TODO: clean the functions in this file and move the APIs into box bbox_3d +# in the future +# NOTICE: All functions in this file are valid for LiDAR or depth boxes only +# if we use default parameters. + +import numba +import numpy as np +from mmdet3d.structures.bbox_3d import (limit_period, points_cam2img, + rotation_3d_in_axis) + + +def camera_to_lidar(points, r_rect, velo2cam): + """Convert points in camera coordinate to lidar coordinate. + + Note: + This function is for KITTI only. + + Args: + points (np.ndarray, shape=[N, 3]): Points in camera coordinate. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Points in lidar coordinate. + """ + points_shape = list(points.shape[0:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) + return lidar_points[..., :3] + + +def box_camera_to_lidar(data, r_rect, velo2cam): + """Convert boxes in camera coordinate to lidar coordinate. + + Note: + This function is for KITTI only. + + Args: + data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. 
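``camera_to_lidar`` above applies the inverse of the rectified extrinsic to homogeneous row vectors. A sketch with a hypothetical axis-permutation extrinsic (camera x right / y down / z forward mapped to lidar x forward / y left / z up) and no rectification:

import numpy as np

velo2cam = np.array([[0., -1., 0., 0.],
                     [0., 0., -1., 0.],
                     [1., 0., 0., 0.],
                     [0., 0., 0., 1.]])
r_rect = np.eye(4)
pts_cam = np.array([[0.0, -1.7, 5.0, 1.0]])            # homogeneous camera-frame point
print(pts_cam @ np.linalg.inv((r_rect @ velo2cam).T))  # ~ [[5., 0., 1.7, 1.]]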
+ r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. + """ + xyz = data[:, 0:3] + x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) + # yaw and dims also needs to be converted + r_new = -r - np.pi / 2 + r_new = limit_period(r_new, period=np.pi * 2) + return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1) + + +def corners_nd(dims, origin=0.5): + """Generate relative box corners based on length per dim and origin point. + + Args: + dims (np.ndarray, shape=[N, ndim]): Array of length per dim + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5 + + Returns: + np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1. + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack(np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. + if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """Convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(counterclockwise when positive) + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 2). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 2). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). Defaults to None. + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Corners with the shape of (N, 4, 2). + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +@numba.jit(nopython=True) +def depth_to_points(depth, trunc_pixel): + """Convert depth map to points. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + + Returns: + np.ndarray: Points in camera coordinates. 
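In 2D, ``corners_nd`` above yields the four clockwise corners relative to the chosen origin. A hand-checked sketch for one hypothetical 4 x 2 box around its center:

import numpy as np

dims = np.array([[4.0, 2.0]])
ndim = dims.shape[1]
corners_norm = np.stack(np.unravel_index(np.arange(2**ndim), [2] * ndim),
                        axis=1).astype(dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2]] - 0.5        # clockwise, origin = 0.5 (center)
corners = dims.reshape(-1, 1, ndim) * corners_norm.reshape(1, 2**ndim, ndim)
print(corners[0])   # [[-2., -1.], [-2., 1.], [2., 1.], [2., -1.]]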
+ """ + num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) + points = np.zeros((num_pts, 3), dtype=depth.dtype) + x = np.array([0, 0, 1], dtype=depth.dtype) + k = 0 + for i in range(trunc_pixel, depth.shape[0]): + for j in range(depth.shape[1]): + if depth[i, j] > 0.1: + x = np.array([j, i, 1], dtype=depth.dtype) + points[k] = x * depth[i, j] + k += 1 + return points + + +def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): + """Convert depth map to points in lidar coordinate. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray: Points in lidar coordinates. + """ + pts = depth_to_points(depth, trunc_pixel) + points_shape = list(pts.shape[0:-1]) + points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) + points = points @ np.linalg.inv(P2.T) + lidar_points = camera_to_lidar(points, r_rect, velo2cam) + return lidar_points + + +def center_to_corner_box3d(centers, + dims, + angles=None, + origin=(0.5, 1.0, 0.5), + axis=1): + """Convert kitti locations, dimensions and angles to corners. + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 3). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). Defaults to None. + origin (list or array or float, optional): Origin point relate to + smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) + in lidar. Defaults to (0.5, 1.0, 0.5). + axis (int, optional): Rotation axis. 1 for camera and 2 for lidar. + Defaults to 1. + + Returns: + np.ndarray: Corners with the shape of (N, 8, 3). + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.reshape([-1, 1, 3]) + return corners + + +@numba.jit(nopython=True) +def box2d_to_corner_jit(boxes): + """Convert box2d to corner. + + Args: + boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. + + Returns: + box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. + """ + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( + 1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +@numba.njit +def corner_to_standup_nd_jit(boxes_corner): + """Convert boxes_corner to aligned (min-max) boxes. + + Args: + boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. 
+ + Returns: + np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. + """ + num_boxes = boxes_corner.shape[0] + ndim = boxes_corner.shape[-1] + result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) + for i in range(num_boxes): + for j in range(ndim): + result[i, j] = np.min(boxes_corner[i, :, j]) + for j in range(ndim): + result[i, j + ndim] = np.max(boxes_corner[i, :, j]) + return result + + +@numba.jit(nopython=True) +def corner_to_surfaces_3d_jit(corners): + """Convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). + """ + # box_corners: [N, 8, 3], must from corner functions in this module + num_boxes = corners.shape[0] + surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) + corner_idxes = np.array([ + 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 + ]).reshape(6, 4) + for i in range(num_boxes): + for j in range(6): + for k in range(4): + surfaces[i, j, k] = corners[i, corner_idxes[j, k]] + return surfaces + + +def rotation_points_single_angle(points, angle, axis=0): + """Rotate points with a single angle. + + Args: + points (np.ndarray, shape=[N, 3]]): + angle (np.ndarray, shape=[1]]): + axis (int, optional): Axis to rotate at. Defaults to 0. + + Returns: + np.ndarray: Rotated points. + """ + # points: [N, 3] + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + if axis == 1: + rot_mat_T = np.array( + [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]], + dtype=points.dtype) + elif axis == 2 or axis == -1: + rot_mat_T = np.array( + [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]], + dtype=points.dtype) + elif axis == 0: + rot_mat_T = np.array( + [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]], + dtype=points.dtype) + else: + raise ValueError('axis should in range') + + return points @ rot_mat_T, rot_mat_T + + +def box3d_to_bbox(box3d, P2): + """Convert box3d in camera coordinates to bbox in image coordinates. + + Args: + box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. + P2 (np.array, shape=[4, 4]): Intrinsics of Camera2. + + Returns: + np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates. + """ + box_corners = center_to_corner_box3d(box3d[:, :3], + box3d[:, 3:6], + box3d[:, 6], [0.5, 1.0, 0.5], + axis=1) + box_corners_in_image = points_cam2img(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = np.min(box_corners_in_image, axis=1) + maxxy = np.max(box_corners_in_image, axis=1) + bbox = np.concatenate([minxy, maxxy], axis=1) + return bbox + + +def corner_to_surfaces_3d(corners): + """convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3D box corners with shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). 
+ """ + # box_corners: [N, 8, 3], must from corner functions in this module + surfaces = np.array([ + [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], + [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], + [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], + [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], + [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], + [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], + ]).transpose([2, 0, 1, 3]) + return surfaces + + +def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): + """Check points in rotated bbox and return indices. + + Note: + This function is for counterclockwise boxes. + + Args: + points (np.ndarray, shape=[N, 3+dim]): Points to query. + rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation. + z_axis (int, optional): Indicate which axis is height. + Defaults to 2. + origin (tuple[int], optional): Indicate the position of + box center. Defaults to (0.5, 0.5, 0). + + Returns: + np.ndarray, shape=[N, M]: Indices of points in each box. + """ + # TODO: this function is different from PointCloud3D, be careful + # when start to use nuscene, check the input + rbbox_corners = center_to_corner_box3d(rbbox[:, :3], + rbbox[:, 3:6], + rbbox[:, 6], + origin=origin, + axis=z_axis) + surfaces = corner_to_surfaces_3d(rbbox_corners) + indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return indices + + +def minmax_to_corner_2d(minmax_box): + """Convert minmax box to corners2d. + + Args: + minmax_box (np.ndarray, shape=[N, dims]): minmax boxes. + + Returns: + np.ndarray: 2d corners of boxes + """ + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box2d(center, dims, origin=0.0) + + +def create_anchors_3d_range(feature_size, + anchor_range, + sizes=((3.9, 1.6, 1.56), ), + rotations=(0, np.pi / 2), + dtype=np.float32): + """Create anchors 3d by range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list of a tuple of [D, H, W](in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to ((3.9, 1.6, 1.56), ). + rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to (0, np.pi / 2). + dtype (type, optional): Data type. Defaults to np.float32. + + Returns: + np.ndarray: Range based anchors with shape of + (*feature_size, num_sizes, num_rots, 7). 
+ """ + anchor_range = np.array(anchor_range, dtype) + z_centers = np.linspace(anchor_range[2], + anchor_range[5], + feature_size[0], + dtype=dtype) + y_centers = np.linspace(anchor_range[1], + anchor_range[4], + feature_size[1], + dtype=dtype) + x_centers = np.linspace(anchor_range[0], + anchor_range[3], + feature_size[2], + dtype=dtype) + sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) + rotations = np.array(rotations, dtype=dtype) + rets = np.meshgrid(x_centers, + y_centers, + z_centers, + rotations, + indexing='ij') + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) + rets[i] = rets[i][..., np.newaxis] # for concat + sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = np.tile(sizes, tile_size_shape) + rets.insert(3, sizes) + ret = np.concatenate(rets, axis=-1) + return np.transpose(ret, [2, 1, 0, 3, 4, 5]) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + """Center to minmax. + + Args: + centers (np.ndarray): Center points. + dims (np.ndarray): Dimensions. + origin (list or array or float, optional): Origin point relate + to smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Minmax points. + """ + if origin == 0.5: + return np.concatenate([centers - dims / 2, centers + dims / 2], + axis=-1) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. + + Args: + rbboxes (np.ndarray): Rotated bboxes with shape of + (N, 5(x, y, xdim, ydim, rad)). + + Returns: + np.ndarray: Bounding boxes with the shape of + (N, 4(xmin, ymin, xmax, ymax)). + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] + bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +@numba.jit(nopython=True) +def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): + """Calculate box iou. Note that jit version runs ~10x faster than the + box_overlaps function in mmdet3d.core.evaluation. + + Note: + This function is for counterclockwise boxes. + + Args: + boxes (np.ndarray): Input bounding boxes with shape of (N, 4). + query_boxes (np.ndarray): Query boxes with shape of (K, 4). + mode (str, optional): IoU mode. Defaults to 'iou'. + eps (float, optional): Value added to denominator. Defaults to 0. + + Returns: + np.ndarray: Overlap between boxes and query_boxes + with the shape of [N, K]. 
+ """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * + (query_boxes[k, 3] - query_boxes[k, 1] + eps)) + for n in range(N): + iw = (min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + eps) + if iw > 0: + ih = (min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + eps) + if ih > 0: + if mode == 'iou': + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps) + box_area - + iw * ih) + else: + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps)) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def projection_matrix_to_CRT_kitti(proj): + """Split projection matrix of KITTI. + + Note: + This function is for KITTI only. + + P = C @ [R|T] + C is upper triangular matrix, so we need to inverse CR and use QR + stable for all kitti camera projection matrix. + + Args: + proj (p.array, shape=[4, 4]): Intrinsics of camera. + + Returns: + tuple[np.ndarray]: Splited matrix of C, R and T. + """ + + CR = proj[0:3, 0:3] + CT = proj[0:3, 3] + RinvCinv = np.linalg.inv(CR) + Rinv, Cinv = np.linalg.qr(RinvCinv) + C = np.linalg.inv(Cinv) + R = np.linalg.inv(Rinv) + T = Cinv @ CT + return C, R, T + + +def remove_outside_points(points, rect, Trv2c, P2, image_shape): + """Remove points which are outside of image. + + Note: + This function is for KITTI only. + + Args: + points (np.ndarray, shape=[N, 3+dims]): Total points. + rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. + image_shape (list[int]): Shape of image. + + Returns: + np.ndarray, shape=[N, 3+dims]: Filtered points. + """ + # 5x faster than remove_outside_points_v1(2ms vs 10ms) + C, R, T = projection_matrix_to_CRT_kitti(P2) + image_bbox = [0, 0, image_shape[1], image_shape[0]] + frustum = get_frustum(image_bbox, C) + frustum -= T + frustum = np.linalg.inv(R) @ frustum.T + frustum = camera_to_lidar(frustum.T, rect, Trv2c) + frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) + indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) + points = points[indices.reshape([-1])] + return points + + +def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): + """Get frustum corners in camera coordinates. + + Args: + bbox_image (list[int]): box in image coordinates. + C (np.ndarray): Intrinsics. + near_clip (float, optional): Nearest distance of frustum. + Defaults to 0.001. + far_clip (float, optional): Farthest distance of frustum. + Defaults to 100. + + Returns: + np.ndarray, shape=[8, 3]: coordinates of frustum corners. 
+ """ + fku = C[0, 0] + fkv = -C[1, 1] + u0v0 = C[0:2, 2] + z_points = np.array([near_clip] * 4 + [far_clip] * 4, + dtype=C.dtype)[:, np.newaxis] + b = bbox_image + box_corners = np.array( + [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], + dtype=C.dtype) + near_box_corners = (box_corners - u0v0) / np.array( + [fku / near_clip, -fkv / near_clip], dtype=C.dtype) + far_box_corners = (box_corners - u0v0) / np.array( + [fku / far_clip, -fkv / far_clip], dtype=C.dtype) + ret_xy = np.concatenate([near_box_corners, far_box_corners], + axis=0) # [8, 2] + ret_xyz = np.concatenate([ret_xy, z_points], axis=1) + return ret_xyz + + +def surface_equ_3d(polygon_surfaces): + """ + + Args: + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + [num_polygon, max_num_surfaces, max_num_points_of_surface, 3]. + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + + Returns: + tuple: normal vector and its direction. + """ + # return [a, b, c], d in ax+by+cz+d=0 + # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] + surface_vec = polygon_surfaces[:, :, :2, :] - \ + polygon_surfaces[:, :, 1:3, :] + # normal_vec: [..., 3] + normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) + # print(normal_vec.shape, points[..., 0, :].shape) + # d = -np.inner(normal_vec, points[..., 0, :]) + d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) + return normal_vec, -d + + +@numba.njit +def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, + num_surfaces): + """ + Args: + points (np.ndarray): Input points with shape of (num_points, 3). + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + normal_vec (np.ndarray): Normal vector of polygon_surfaces. + d (int): Directions of normal vector. + num_surfaces (np.ndarray): Number of surfaces a polygon contains + shape of (num_polygon). + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = (points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) + if sign >= 0: + ret[i, j] = False + break + return ret + + +def points_in_convex_polygon_3d_jit(points, + polygon_surfaces, + num_surfaces=None): + """Check points is in 3d convex polygons. + + Args: + points (np.ndarray): Input points with shape of (num_points, 3). + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + num_surfaces (np.ndarray, optional): Number of surfaces a polygon + contains shape of (num_polygon). Defaults to None. + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. 
+ """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + # num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, + normal_vec, d, num_surfaces) + + +@numba.njit +def points_in_convex_polygon_jit(points, polygon, clockwise=False): + """Check points is in 2d convex polygons. True when point in polygon. + + Args: + points (np.ndarray): Input points with the shape of [num_points, 2]. + polygon (np.ndarray): Input polygon with the shape of + [num_polygon, num_points_of_polygon, 2]. + clockwise (bool, optional): Indicate polygon is clockwise. Defaults + to True. + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. + """ + # first convert polygon to directed lines + num_points_of_polygon = polygon.shape[1] + num_points = points.shape[0] + num_polygons = polygon.shape[0] + # vec for all the polygons + if clockwise: + vec1 = polygon - polygon[:, + np.array([num_points_of_polygon - 1] + + list(range(num_points_of_polygon - + 1))), :] + else: + vec1 = polygon[:, + np.array([num_points_of_polygon - 1] + + list(range(num_points_of_polygon - + 1))), :] - polygon + ret = np.zeros((num_points, num_polygons), dtype=np.bool_) + success = True + cross = 0.0 + for i in range(num_points): + for j in range(num_polygons): + success = True + for k in range(num_points_of_polygon): + vec = vec1[j, k] + cross = vec[1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec[0] * (polygon[j, k, 1] - points[i, 1]) + if cross >= 0: + success = False + break + ret[i, j] = success + return ret + + +def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): + """Convert kitti center boxes to corners. + + 7 -------- 4 + /| /| + 6 -------- 5 . + | | | | + . 3 -------- 0 + |/ |/ + 2 -------- 1 + + Note: + This function is for LiDAR boxes only. + + Args: + boxes3d (np.ndarray): Boxes with shape of (N, 7) + [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords, + see the definition of ry in KITTI dataset. + bottom_center (bool, optional): Whether z is on the bottom center + of object. Defaults to True. + + Returns: + np.ndarray: Box corners with the shape of [N, 8, 3]. + """ + boxes_num = boxes3d.shape[0] + x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] + x_corners = np.array([ + x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2., + -x_size / 2., -x_size / 2., x_size / 2. + ], + dtype=np.float32).T + y_corners = np.array([ + -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2., + -y_size / 2., y_size / 2., y_size / 2. + ], + dtype=np.float32).T + if bottom_center: + z_corners = np.zeros((boxes_num, 8), dtype=np.float32) + z_corners[:, 4:8] = z_size.reshape(boxes_num, + 1).repeat(4, axis=1) # (N, 8) + else: + z_corners = np.array([ + -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2., + z_size / 2., z_size / 2., z_size / 2., z_size / 2. 
+ ], + dtype=np.float32).T + + ry = boxes3d[:, 6] + zeros, ones = np.zeros(ry.size, + dtype=np.float32), np.ones(ry.size, + dtype=np.float32) + rot_list = np.array([[np.cos(ry), np.sin(ry), zeros], + [-np.sin(ry), np.cos(ry), zeros], + [zeros, zeros, ones]]) # (3, 3, N) + R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) + + temp_corners = np.concatenate((x_corners.reshape( + -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)), + axis=2) # (N, 8, 3) + rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3) + x_corners = rotated_corners[:, :, 0] + y_corners = rotated_corners[:, :, 1] + z_corners = rotated_corners[:, :, 2] + + x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2] + + x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8) + y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8) + z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8) + + corners = np.concatenate( + (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)), + axis=2) + + return corners.astype(np.float32) diff --git a/embodiedscan/structures/ops/iou3d_calculator.py b/embodiedscan/structures/ops/iou3d_calculator.py new file mode 100644 index 0000000..f2475eb --- /dev/null +++ b/embodiedscan/structures/ops/iou3d_calculator.py @@ -0,0 +1,330 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdet3d.registry import TASK_UTILS +from mmdet3d.structures.bbox_3d import get_box_type +from mmdet.structures.bbox import bbox_overlaps + + +@TASK_UTILS.register_module() +class BboxOverlapsNearest3D(object): + """Nearest 3D IoU Calculator. + + Note: + This IoU calculator first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + + Args: + coordinate (str): 'camera', 'lidar', or 'depth' coordinate system. + """ + + def __init__(self, coordinate='lidar'): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate nearest 3D IoU. + + Note: + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise it calculates the ious + between each aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): shape (N, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. + bboxes2 (torch.Tensor): shape (M, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned. + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is + ``False``, return shape is M. + """ + return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, + self.coordinate) + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +@TASK_UTILS.register_module() +class BboxOverlaps3D(object): + """3D IoU Calculator. + + Args: + coordinate (str): The coordinate system, valid options are + 'camera', 'lidar', and 'depth'. + """ + + def __init__(self, coordinate): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculate the IoU of 3D boxes based on their volumes. 
+ IoU calculator ``:class:BboxOverlaps3D`` uses this function to + calculate the actual 3D IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 + with shape (M, N) (aligned mode is not supported currently). + """ + return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate) + + def __repr__(self): + """str: return a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +def bbox_overlaps_nearest_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + coordinate='lidar'): + """Calculate nearest 3D IoU. + + Note: + This function first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + This IoU calculator :class:`BboxOverlapsNearest3D` uses this + function to calculate IoUs of boxes. + + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise the ious between each + aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is + ``False``, return shape is M. + """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + # Change the bboxes to bev + # box conversion and iou calculation in torch version on CUDA + # is 10x faster than that in numpy version + bboxes1_bev = bboxes1.nearest_bev + bboxes2_bev = bboxes2.nearest_bev + + ret = bbox_overlaps(bboxes1_bev, + bboxes2_bev, + mode=mode, + is_aligned=is_aligned) + return ret + + +def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculates the IoU of 3D boxes based on their volumes. + IoU calculator :class:`BboxOverlaps3D` uses this function to + calculate the actual IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + coordinate (str): 'camera' or 'lidar' coordinate system. + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 + with shape (M, N) (aligned mode is not supported currently). 
+ """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + return bboxes1.overlaps(bboxes1, bboxes2, mode=mode) + + +@TASK_UTILS.register_module() +class AxisAlignedBboxOverlaps3D(object): + """Axis-aligned 3D Overlaps (IoU) Calculator.""" + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Defaults to False. + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + assert bboxes1.size(-1) == bboxes2.size(-1) == 6 + return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode, + is_aligned) + + def __repr__(self): + """str: a string describing the module""" + repr_str = self.__class__.__name__ + '()' + return repr_str + + +def axis_aligned_bbox_overlaps_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + eps=1e-6): + """Calculate overlap between two set of axis aligned 3D bboxes. If + ``is_aligned`` is ``False``, then calculate the overlaps between each bbox + of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Defaults to False. + eps (float, optional): A value added to the denominator for numerical + stability. Defaults to 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 10, 10], + >>> [10, 10, 10, 20, 20, 20], + >>> [32, 32, 32, 38, 40, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 20, 20], + >>> [0, 10, 10, 10, 19, 20], + >>> [10, 10, 10, 20, 20, 20], + >>> ]) + >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + Example: + >>> empty = torch.empty(0, 6) + >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes's last dimension is 6 + assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... 
Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 3] - bboxes1[..., 0]) * ( + bboxes1[..., 4] - bboxes1[..., 1]) * (bboxes1[..., 5] - + bboxes1[..., 2]) + area2 = (bboxes2[..., 3] - bboxes2[..., 0]) * ( + bboxes2[..., 4] - bboxes2[..., 1]) * (bboxes2[..., 5] - + bboxes2[..., 2]) + + if is_aligned: + lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3] + rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, 2] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3]) + enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:]) + else: + lt = torch.max(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) # [B, rows, cols, 3] + rb = torch.min(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) # [B, rows, cols, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) + enclosed_rb = torch.max(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou']: + return ious + # calculate gious + enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/embodiedscan/structures/ops/transforms.py b/embodiedscan/structures/ops/transforms.py new file mode 100644 index 0000000..491b791 --- /dev/null +++ b/embodiedscan/structures/ops/transforms.py @@ -0,0 +1,76 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical): + """Map bboxes from testing scale to original image scale. + + Args: + bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. + scale_factor (float): Scale factor. + flip_horizontal (bool): Whether to flip horizontally. + flip_vertical (bool): Whether to flip vertically. + + Returns: + :obj:`BaseInstance3DBoxes`: Boxes mapped back. + """ + new_bboxes = bboxes.clone() + if flip_horizontal: + new_bboxes.flip('horizontal') + if flip_vertical: + new_bboxes.flip('vertical') + new_bboxes.scale(1 / scale_factor) + + return new_bboxes + + +def bbox3d2roi(bbox_list): + """Convert a list of bounding boxes to roi format. + + Args: + bbox_list (list[torch.Tensor]): A list of bounding boxes + corresponding to a batch of images. + + Returns: + torch.Tensor: Region of interests in shape (n, c), where + the channels are in order of [batch_ind, x, y ...]. 
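+
+    A minimal sketch (two random box tensors; only the concatenated shape is
+    checked):
+
+    Example:
+        >>> rois = bbox3d2roi([torch.rand(2, 7), torch.rand(3, 7)])
+        >>> rois.shape
+        torch.Size([5, 8])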
+ """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes], dim=-1) + else: + rois = torch.zeros_like(bboxes) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +# TODO delete this +def bbox3d2result(bboxes, scores, labels, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape (N, 5). + labels (torch.Tensor): Labels with shape (N, ). + scores (torch.Tensor): Scores with shape (N, ). + attrs (torch.Tensor, optional): Attributes with shape (N, ). + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict(bboxes_3d=bboxes.to('cpu'), + scores_3d=scores.cpu(), + labels_3d=labels.cpu()) + + if attrs is not None: + result_dict['attr_labels'] = attrs.cpu() + + return result_dict diff --git a/embodiedscan/structures/point_data.py b/embodiedscan/structures/point_data.py new file mode 100644 index 0000000..f12d4c8 --- /dev/null +++ b/embodiedscan/structures/point_data.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Sized +from typing import Union + +import numpy as np +import torch +from mmengine.structures import BaseDataElement + +IndexType = Union[str, slice, int, list, torch.LongTensor, + torch.cuda.LongTensor, torch.BoolTensor, + torch.cuda.BoolTensor, np.ndarray] + + +class PointData(BaseDataElement): + """Data structure for point-level annotations or predictions. + + All data items in ``data_fields`` of ``PointData`` meet the following + requirements: + + - They are all one dimension. + - They should have the same length. + + `PointData` is used to save point-level semantic and instance mask, + it also can save `instances_labels` and `instances_scores` temporarily. + In the future, we would consider to move the instance-level info into + `gt_instances_3d` and `pred_instances_3d`. + + Examples: + >>> metainfo = dict( + ... sample_idx=random.randint(0, 100)) + >>> points = np.random.randint(0, 255, (100, 3)) + >>> point_data = PointData(metainfo=metainfo, + ... points=points) + >>> print(len(point_data)) + 100 + + >>> # slice + >>> slice_data = point_data[10:60] + >>> assert len(slice_data) == 50 + + >>> # set + >>> point_data.pts_semantic_mask = torch.randint(0, 255, (100,)) + >>> point_data.pts_instance_mask = torch.randint(0, 255, (100,)) + >>> assert tuple(point_data.pts_semantic_mask.shape) == (100,) + >>> assert tuple(point_data.pts_instance_mask.shape) == (100,) + """ + + def __setattr__(self, name: str, value: Sized) -> None: + """setattr is only used to set data. + + The value must have the attribute of `__len__` and have the same length + of `PointData`. 
+ """ + if name in ('_metainfo_fields', '_data_fields'): + if not hasattr(self, name): + super().__setattr__(name, value) + else: + raise AttributeError(f'{name} has been used as a ' + 'private attribute, which is immutable.') + + else: + assert isinstance(value, + Sized), 'value must contain `__len__` attribute' + # TODO: make sure the input value share the same length + super().__setattr__(name, value) + + __setitem__ = __setattr__ + + def __getitem__(self, item: IndexType) -> 'PointData': + """ + Args: + item (str, int, list, :obj:`slice`, :obj:`numpy.ndarray`, + :obj:`torch.LongTensor`, :obj:`torch.BoolTensor`): + Get the corresponding values according to item. + + Returns: + :obj:`PointData`: Corresponding values. + """ + if isinstance(item, list): + item = np.array(item) + if isinstance(item, np.ndarray): + # The default int type of numpy is platform dependent, int32 for + # windows and int64 for linux. `torch.Tensor` requires the index + # should be int64, therefore we simply convert it to int64 here. + # Mode details in https://github.com/numpy/numpy/issues/9464 + item = item.astype(np.int64) if item.dtype == np.int32 else item + item = torch.from_numpy(item) + assert isinstance( + item, (str, slice, int, torch.LongTensor, torch.cuda.LongTensor, + torch.BoolTensor, torch.cuda.BoolTensor)) + + if isinstance(item, str): + return getattr(self, item) + + if isinstance(item, int): + if item >= len(self) or item < -len(self): # type: ignore + raise IndexError(f'Index {item} out of range!') + else: + # keep the dimension + item = slice(item, None, len(self)) + + new_data = self.__class__(metainfo=self.metainfo) + if isinstance(item, torch.Tensor): + assert item.dim() == 1, 'Only support to get the' \ + ' values along the first dimension.' + if isinstance(item, (torch.BoolTensor, torch.cuda.BoolTensor)): + assert len(item) == len(self), 'The shape of the ' \ + 'input(BoolTensor) ' \ + f'{len(item)} ' \ + 'does not match the shape ' \ + 'of the indexed tensor ' \ + 'in results_field ' \ + f'{len(self)} at ' \ + 'first dimension.' 
+ + for k, v in self.items(): + if isinstance(v, torch.Tensor): + new_data[k] = v[item] + elif isinstance(v, np.ndarray): + new_data[k] = v[item.cpu().numpy()] + elif isinstance( + v, (str, list, tuple)) or (hasattr(v, '__getitem__') + and hasattr(v, 'cat')): + # convert to indexes from BoolTensor + if isinstance(item, + (torch.BoolTensor, torch.cuda.BoolTensor)): + indexes = torch.nonzero(item).view( + -1).cpu().numpy().tolist() + else: + indexes = item.cpu().numpy().tolist() + slice_list = [] + if indexes: + for index in indexes: + slice_list.append(slice(index, None, len(v))) + else: + slice_list.append(slice(None, 0, None)) + r_list = [v[s] for s in slice_list] + if isinstance(v, (str, list, tuple)): + new_value = r_list[0] + for r in r_list[1:]: + new_value = new_value + r + else: + new_value = v.cat(r_list) + new_data[k] = new_value + else: + raise ValueError( + f'The type of `{k}` is `{type(v)}`, which has no ' + 'attribute of `cat`, so it does not ' + 'support slice with `bool`') + else: + # item is a slice + for k, v in self.items(): + new_data[k] = v[item] + return new_data # type: ignore + + def __len__(self) -> int: + """int: The length of `PointData`.""" + if len(self._data_fields) > 0: + return len(self.values()[0]) + else: + return 0 diff --git a/embodiedscan/structures/points/__init__.py b/embodiedscan/structures/points/__init__.py new file mode 100644 index 0000000..eedae14 --- /dev/null +++ b/embodiedscan/structures/points/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints +from .cam_points import CameraPoints +from .depth_points import DepthPoints +from .lidar_points import LiDARPoints + +__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints'] + + +def get_points_type(points_type: str) -> type: + """Get the class of points according to coordinate type. + + Args: + points_type (str): The type of points coordinate. The valid value are + "CAMERA", "LIDAR" and "DEPTH". + + Returns: + type: Points type. + """ + points_type_upper = points_type.upper() + if points_type_upper == 'CAMERA': + points_cls = CameraPoints + elif points_type_upper == 'LIDAR': + points_cls = LiDARPoints + elif points_type_upper == 'DEPTH': + points_cls = DepthPoints + else: + raise ValueError('Only "points_type" of "CAMERA", "LIDAR" and "DEPTH" ' + f'are supported, got {points_type}') + + return points_cls diff --git a/embodiedscan/structures/points/base_points.py b/embodiedscan/structures/points/base_points.py new file mode 100644 index 0000000..1fa2282 --- /dev/null +++ b/embodiedscan/structures/points/base_points.py @@ -0,0 +1,521 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import abstractmethod +from typing import Iterator, Optional, Sequence, Union + +import numpy as np +import torch +from torch import Tensor + +from ..bbox_3d.utils import rotation_3d_in_axis, rotation_3d_in_euler + + +class BasePoints: + """Base class for Points. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). 
+ attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + if isinstance(tensor, Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does + # not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((-1, points_dim)) + assert tensor.dim() == 2 and tensor.size(-1) == points_dim, \ + ('The points dimension must be 2 and the length of the last ' + f'dimension must be {points_dim}, but got points with shape ' + f'{tensor.shape}.') + + self.tensor = tensor.clone() + self.points_dim = points_dim + self.attribute_dims = attribute_dims + self.rotation_axis = 0 + + @property + def coord(self) -> Tensor: + """Tensor: Coordinates of each point in shape (N, 3).""" + return self.tensor[:, :3] + + @coord.setter + def coord(self, tensor: Union[Tensor, np.ndarray]) -> None: + """Set the coordinates of each point. + + Args: + tensor (Tensor or np.ndarray): Coordinates of each point with shape + (N, 3). + """ + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, Tensor): + tensor = self.tensor.new_tensor(tensor) + self.tensor[:, :3] = tensor + + @property + def height(self) -> Union[Tensor, None]: + """Tensor or None: Returns a vector with height of each point in shape + (N, ).""" + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['height']] + else: + return None + + @height.setter + def height(self, tensor: Union[Tensor, np.ndarray]) -> None: + """Set the height of each point. + + Args: + tensor (Tensor or np.ndarray): Height of each point with shape + (N, ). + """ + try: + tensor = tensor.reshape(self.shape[0]) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['height']] = tensor + else: + # add height attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) + self.attribute_dims.update(dict(height=attr_dim)) + self.points_dim += 1 + + @property + def color(self) -> Union[Tensor, None]: + """Tensor or None: Returns a vector with color of each point in shape + (N, 3).""" + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['color']] + else: + return None + + @color.setter + def color(self, tensor: Union[Tensor, np.ndarray]) -> None: + """Set the color of each point. + + Args: + tensor (Tensor or np.ndarray): Color of each point with shape + (N, 3). 
+ """ + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if tensor.max() >= 256 or tensor.min() < 0: + warnings.warn('point got color value beyond [0, 255]') + if not isinstance(tensor, Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['color']] = tensor + else: + # add color attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor], dim=1) + self.attribute_dims.update( + dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) + self.points_dim += 3 + + @property + def shape(self) -> torch.Size: + """torch.Size: Shape of points.""" + return self.tensor.shape + + def shuffle(self) -> Tensor: + """Shuffle the points. + + Returns: + Tensor: The shuffled index. + """ + idx = torch.randperm(self.__len__(), device=self.tensor.device) + self.tensor = self.tensor[idx] + return idx + + def rotate(self, + rotation: Union[Tensor, np.ndarray, float], + axis: Optional[int] = None) -> Tensor: + """Rotate points with the given rotation matrix or angle. + + Args: + rotation (Tensor or np.ndarray or float): Rotation matrix or angle. + axis (int, optional): Axis to rotate at. Defaults to None. + + Returns: + Tensor: Rotation matrix. + """ + if not isinstance(rotation, Tensor): + rotation = self.tensor.new_tensor(rotation) + assert rotation.shape == torch.Size([3, 3]) or rotation.numel() == 1, \ + f'invalid rotation shape {rotation.shape}' + + if axis is None: + axis = self.rotation_axis + + if rotation.numel() == 1: + rotated_points, rot_mat_T = rotation_3d_in_axis( + self.tensor[:, :3][None], rotation, axis=axis, return_mat=True) + self.tensor[:, :3] = rotated_points.squeeze(0) + rot_mat_T = rot_mat_T.squeeze(0) + elif rotation.numel() == 3: + rotated_points, rot_mat_T = rotation_3d_in_euler( + self.tensor[:, :3][None], rotation, return_mat=True) + self.tensor[:, :3] = rotated_points.squeeze(0) + rot_mat_T = rot_mat_T.squeeze(0) + else: + # rotation.numel() == 9 + self.tensor[:, :3] = self.tensor[:, :3] @ rotation + rot_mat_T = rotation + + return rot_mat_T + + @abstractmethod + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. + """ + pass + + def translate(self, trans_vector: Union[Tensor, np.ndarray]) -> None: + """Translate points with the given translation vector. + + Args: + trans_vector (Tensor or np.ndarray): Translation vector of size 3 + or nx3. + """ + if not isinstance(trans_vector, Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + trans_vector = trans_vector.squeeze(0) + if trans_vector.dim() == 1: + assert trans_vector.shape[0] == 3 + elif trans_vector.dim() == 2: + assert trans_vector.shape[0] == self.tensor.shape[0] and \ + trans_vector.shape[1] == 3 + else: + raise NotImplementedError( + f'Unsupported translation vector of shape {trans_vector.shape}' + ) + self.tensor[:, :3] += trans_vector + + def in_range_3d( + self, point_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the points are in the given range. + + Args: + point_range (Tensor or np.ndarray or Sequence[float]): The range of + point (x_min, y_min, z_min, x_max, y_max, z_max). 
+ + Note: + In the original implementation of SECOND, checking whether a box in + the range checks whether the points are in a convex polygon, we try + to reduce the burden for simpler cases. + + Returns: + Tensor: A binary vector indicating whether each point is inside the + reference range. + """ + in_range_flags = ((self.tensor[:, 0] > point_range[0]) + & (self.tensor[:, 1] > point_range[1]) + & (self.tensor[:, 2] > point_range[2]) + & (self.tensor[:, 0] < point_range[3]) + & (self.tensor[:, 1] < point_range[4]) + & (self.tensor[:, 2] < point_range[5])) + return in_range_flags + + @property + def bev(self) -> Tensor: + """Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 1]] + + def in_range_bev( + self, point_range: Union[Tensor, np.ndarray, + Sequence[float]]) -> Tensor: + """Check whether the points are in the given range. + + Args: + point_range (Tensor or np.ndarray or Sequence[float]): The range of + point in order of (x_min, y_min, x_max, y_max). + + Returns: + Tensor: A binary vector indicating whether each point is inside the + reference range. + """ + in_range_flags = ((self.bev[:, 0] > point_range[0]) + & (self.bev[:, 1] > point_range[1]) + & (self.bev[:, 0] < point_range[2]) + & (self.bev[:, 1] < point_range[3])) + return in_range_flags + + @abstractmethod + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. + """ + pass + + def scale(self, scale_factor: float) -> None: + """Scale the points with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the points. + """ + self.tensor[:, :3] *= scale_factor + + def __getitem__( + self, item: Union[int, tuple, slice, np.ndarray, + Tensor]) -> 'BasePoints': + """ + Args: + item (int or tuple or slice or np.ndarray or Tensor): Index of + points. + + Note: + The following usage are allowed: + + 1. `new_points = points[3]`: Return a `Points` that contains only + one point. + 2. `new_points = points[2:10]`: Return a slice of points. + 3. `new_points = points[vector]`: Whether vector is a + torch.BoolTensor with `length = len(points)`. Nonzero elements + in the vector will be selected. + 4. `new_points = points[3:11, vector]`: Return a slice of points + and attribute dims. + 5. `new_points = points[4:12, 2]`: Return a slice of points with + single attribute. + + Note that the returned Points might share storage with this Points, + subject to PyTorch's indexing semantics. + + Returns: + :obj:`BasePoints`: A new object of :class:`BasePoints` after + indexing. 
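+
+        A small sketch using the ``DepthPoints`` subclass added later in this
+        diff (random coordinates; only shapes are checked):
+
+        Example:
+            >>> points = DepthPoints(torch.rand(4, 3))
+            >>> points[1:3].shape
+            torch.Size([2, 3])
+            >>> points[0].shape
+            torch.Size([1, 3])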
+ """ + original_type = type(self) + if isinstance(item, int): + return original_type(self.tensor[item].view(1, -1), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + elif isinstance(item, tuple) and len(item) == 2: + if isinstance(item[1], slice): + start = 0 if item[1].start is None else item[1].start + stop = self.tensor.shape[1] \ + if item[1].stop is None else item[1].stop + step = 1 if item[1].step is None else item[1].step + item = list(item) + item[1] = list(range(start, stop, step)) + item = tuple(item) + elif isinstance(item[1], int): + item = list(item) + item[1] = [item[1]] + item = tuple(item) + p = self.tensor[item[0], item[1]] + + keep_dims = list( + set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) + if self.attribute_dims is not None: + attribute_dims = self.attribute_dims.copy() + for key in self.attribute_dims.keys(): + cur_attribute_dims = attribute_dims[key] + if isinstance(cur_attribute_dims, int): + cur_attribute_dims = [cur_attribute_dims] + intersect_attr = list( + set(cur_attribute_dims).intersection(set(keep_dims))) + if len(intersect_attr) == 1: + attribute_dims[key] = intersect_attr[0] + elif len(intersect_attr) > 1: + attribute_dims[key] = intersect_attr + else: + attribute_dims.pop(key) + else: + attribute_dims = None + elif isinstance(item, (slice, np.ndarray, Tensor)): + p = self.tensor[item] + attribute_dims = self.attribute_dims + else: + raise NotImplementedError(f'Invalid slice {item}!') + + assert p.dim() == 2, \ + f'Indexing on Points with {item} failed to return a matrix!' + return original_type(p, + points_dim=p.shape[1], + attribute_dims=attribute_dims) + + def __len__(self) -> int: + """int: Number of points in the current object.""" + return self.tensor.shape[0] + + def __repr__(self) -> str: + """str: Return a string that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, points_list: Sequence['BasePoints']) -> 'BasePoints': + """Concatenate a list of Points into a single Points. + + Args: + points_list (Sequence[:obj:`BasePoints`]): List of points. + + Returns: + :obj:`BasePoints`: The concatenated points. + """ + assert isinstance(points_list, (list, tuple)) + if len(points_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(points, cls) for points in points_list) + + # use torch.cat (v.s. layers.cat) + # so the returned points never share storage with input + cat_points = cls(torch.cat([p.tensor for p in points_list], dim=0), + points_dim=points_list[0].points_dim, + attribute_dims=points_list[0].attribute_dims) + return cat_points + + def numpy(self) -> np.ndarray: + """Reload ``numpy`` from self.tensor.""" + return self.tensor.numpy() + + def to(self, device: Union[str, torch.device], *args, + **kwargs) -> 'BasePoints': + """Convert current points to a specific device. + + Args: + device (str or :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BasePoints`: A new points object on the specific device. + """ + original_type = type(self) + return original_type(self.tensor.to(device, *args, **kwargs), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def cpu(self) -> 'BasePoints': + """Convert current points to cpu device. + + Returns: + :obj:`BasePoints`: A new points object on the cpu device. 
+ """ + original_type = type(self) + return original_type(self.tensor.cpu(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def cuda(self, *args, **kwargs) -> 'BasePoints': + """Convert current points to cuda device. + + Returns: + :obj:`BasePoints`: A new points object on the cuda device. + """ + original_type = type(self) + return original_type(self.tensor.cuda(*args, **kwargs), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def clone(self) -> 'BasePoints': + """Clone the points. + + Returns: + :obj:`BasePoints`: Point object with the same properties as self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def detach(self) -> 'BasePoints': + """Detach the points. + + Returns: + :obj:`BasePoints`: Point object with the same properties as self. + """ + original_type = type(self) + return original_type(self.tensor.detach(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + @property + def device(self) -> torch.device: + """torch.device: The device of the points are on.""" + return self.tensor.device + + def __iter__(self) -> Iterator[Tensor]: + """Yield a point as a Tensor at a time. + + Returns: + Iterator[Tensor]: A point of shape (points_dim, ). + """ + yield from self.tensor + + def new_point( + self, data: Union[Tensor, np.ndarray, Sequence[Sequence[float]]] + ) -> 'BasePoints': + """Create a new point object with data. + + The new point and its tensor has the similar properties as self and + self.tensor, respectively. + + Args: + data (Tensor or np.ndarray or Sequence[Sequence[float]]): Data to + be copied. + + Returns: + :obj:`BasePoints`: A new point object with ``data``, the object's + other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, Tensor) else data.to(self.device) + original_type = type(self) + return original_type(new_tensor, + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) diff --git a/embodiedscan/structures/points/cam_points.py b/embodiedscan/structures/points/cam_points.py new file mode 100644 index 0000000..4a835a1 --- /dev/null +++ b/embodiedscan/structures/points/cam_points.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +from torch import Tensor + +from .base_points import BasePoints + + +class CameraPoints(BasePoints): + """Points of instances in CAM coordinates. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. 
+ """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + super(CameraPoints, self).__init__(tensor, + points_dim=points_dim, + attribute_dims=attribute_dims) + self.rotation_axis = 1 + + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 2] = -self.tensor[:, 2] + + @property + def bev(self) -> Tensor: + """Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 2]] + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. + """ + from mmdet3d.structures.bbox_3d import Coord3DMode + return Coord3DMode.convert_point(point=self, + src=Coord3DMode.CAM, + dst=dst, + rt_mat=rt_mat) diff --git a/embodiedscan/structures/points/depth_points.py b/embodiedscan/structures/points/depth_points.py new file mode 100644 index 0000000..c3ff712 --- /dev/null +++ b/embodiedscan/structures/points/depth_points.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +from torch import Tensor + +from .base_points import BasePoints + + +class DepthPoints(BasePoints): + """Points of instances in DEPTH coordinates. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + super(DepthPoints, self).__init__(tensor, + points_dim=points_dim, + attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. 
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 1] = -self.tensor[:, 1] + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. + """ + from mmdet3d.structures.bbox_3d import Coord3DMode + return Coord3DMode.convert_point(point=self, + src=Coord3DMode.DEPTH, + dst=dst, + rt_mat=rt_mat) diff --git a/embodiedscan/structures/points/lidar_points.py b/embodiedscan/structures/points/lidar_points.py new file mode 100644 index 0000000..71ecb49 --- /dev/null +++ b/embodiedscan/structures/points/lidar_points.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +from torch import Tensor + +from .base_points import BasePoints + + +class LiDARPoints(BasePoints): + """Points of instances in LIDAR coordinates. + + Args: + tensor (Tensor or np.ndarray or Sequence[Sequence[float]]): The points + data with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + + Attributes: + tensor (Tensor): Float matrix with shape (N, points_dim). + points_dim (int): Integer indicating the dimension of a point. Each row + is (x, y, z, ...). + attribute_dims (dict, optional): Dictionary to indicate the meaning of + extra dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, + tensor: Union[Tensor, np.ndarray, Sequence[Sequence[float]]], + points_dim: int = 3, + attribute_dims: Optional[dict] = None) -> None: + super(LiDARPoints, self).__init__(tensor, + points_dim=points_dim, + attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction: str = 'horizontal') -> None: + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + Defaults to 'horizontal'. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1] = -self.tensor[:, 1] + elif bev_direction == 'vertical': + self.tensor[:, 0] = -self.tensor[:, 0] + + def convert_to(self, + dst: int, + rt_mat: Optional[Union[Tensor, + np.ndarray]] = None) -> 'BasePoints': + """Convert self to ``dst`` mode. + + Args: + dst (int): The target Point mode. + rt_mat (Tensor or np.ndarray, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. The conversion from ``src`` coordinates to + ``dst`` coordinates usually comes along the change of sensors, + e.g., from camera to LiDAR. This requires a transformation + matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type in the + ``dst`` mode. 
+ """ + from mmdet3d.structures.bbox_3d import Coord3DMode + return Coord3DMode.convert_point(point=self, + src=Coord3DMode.LIDAR, + dst=dst, + rt_mat=rt_mat) diff --git a/embodiedscan/utils/__init__.py b/embodiedscan/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/embodiedscan/utils/color_selector.py b/embodiedscan/utils/color_selector.py new file mode 100644 index 0000000..e38fda9 --- /dev/null +++ b/embodiedscan/utils/color_selector.py @@ -0,0 +1,913 @@ +import random + +COCO_COLOR = [ + { + 'color': [220, 20, 60], + 'isthing': 1, + 'id': 1, + 'name': 'person' + }, + { + 'color': [119, 11, 32], + 'isthing': 1, + 'id': 2, + 'name': 'bicycle' + }, + { + 'color': [0, 0, 142], + 'isthing': 1, + 'id': 3, + 'name': 'car' + }, + { + 'color': [0, 0, 230], + 'isthing': 1, + 'id': 4, + 'name': 'motorcycle' + }, + { + 'color': [106, 0, 228], + 'isthing': 1, + 'id': 5, + 'name': 'airplane' + }, + { + 'color': [0, 60, 100], + 'isthing': 1, + 'id': 6, + 'name': 'bus' + }, + { + 'color': [0, 80, 100], + 'isthing': 1, + 'id': 7, + 'name': 'train' + }, + { + 'color': [0, 0, 70], + 'isthing': 1, + 'id': 8, + 'name': 'truck' + }, + { + 'color': [0, 0, 192], + 'isthing': 1, + 'id': 9, + 'name': 'boat' + }, + { + 'color': [250, 170, 30], + 'isthing': 1, + 'id': 10, + 'name': 'traffic light' + }, + { + 'color': [100, 170, 30], + 'isthing': 1, + 'id': 11, + 'name': 'fire hydrant' + }, + { + 'color': [220, 220, 0], + 'isthing': 1, + 'id': 13, + 'name': 'stop sign' + }, + { + 'color': [175, 116, 175], + 'isthing': 1, + 'id': 14, + 'name': 'parking meter' + }, + { + 'color': [250, 0, 30], + 'isthing': 1, + 'id': 15, + 'name': 'bench' + }, + { + 'color': [165, 42, 42], + 'isthing': 1, + 'id': 16, + 'name': 'bird' + }, + { + 'color': [255, 77, 255], + 'isthing': 1, + 'id': 17, + 'name': 'cat' + }, + { + 'color': [0, 226, 252], + 'isthing': 1, + 'id': 18, + 'name': 'dog' + }, + { + 'color': [182, 182, 255], + 'isthing': 1, + 'id': 19, + 'name': 'horse' + }, + { + 'color': [0, 82, 0], + 'isthing': 1, + 'id': 20, + 'name': 'sheep' + }, + { + 'color': [120, 166, 157], + 'isthing': 1, + 'id': 21, + 'name': 'cow' + }, + { + 'color': [110, 76, 0], + 'isthing': 1, + 'id': 22, + 'name': 'elephant' + }, + { + 'color': [174, 57, 255], + 'isthing': 1, + 'id': 23, + 'name': 'bear' + }, + { + 'color': [199, 100, 0], + 'isthing': 1, + 'id': 24, + 'name': 'zebra' + }, + { + 'color': [72, 0, 118], + 'isthing': 1, + 'id': 25, + 'name': 'giraffe' + }, + { + 'color': [255, 179, 240], + 'isthing': 1, + 'id': 27, + 'name': 'backpack' + }, + { + 'color': [0, 125, 92], + 'isthing': 1, + 'id': 28, + 'name': 'umbrella' + }, + { + 'color': [209, 0, 151], + 'isthing': 1, + 'id': 31, + 'name': 'handbag' + }, + { + 'color': [188, 208, 182], + 'isthing': 1, + 'id': 32, + 'name': 'tie' + }, + { + 'color': [0, 220, 176], + 'isthing': 1, + 'id': 33, + 'name': 'suitcase' + }, + { + 'color': [255, 99, 164], + 'isthing': 1, + 'id': 34, + 'name': 'frisbee' + }, + { + 'color': [92, 0, 73], + 'isthing': 1, + 'id': 35, + 'name': 'skis' + }, + { + 'color': [133, 129, 255], + 'isthing': 1, + 'id': 36, + 'name': 'snowboard' + }, + { + 'color': [78, 180, 255], + 'isthing': 1, + 'id': 37, + 'name': 'sports ball' + }, + { + 'color': [0, 228, 0], + 'isthing': 1, + 'id': 38, + 'name': 'kite' + }, + { + 'color': [174, 255, 243], + 'isthing': 1, + 'id': 39, + 'name': 'baseball bat' + }, + { + 'color': [45, 89, 255], + 'isthing': 1, + 'id': 40, + 'name': 'baseball glove' + }, + { + 'color': [134, 134, 103], + 'isthing': 1, + 'id': 41, + 
'name': 'skateboard' + }, + { + 'color': [145, 148, 174], + 'isthing': 1, + 'id': 42, + 'name': 'surfboard' + }, + { + 'color': [255, 208, 186], + 'isthing': 1, + 'id': 43, + 'name': 'tennis racket' + }, + { + 'color': [197, 226, 255], + 'isthing': 1, + 'id': 44, + 'name': 'bottle' + }, + { + 'color': [171, 134, 1], + 'isthing': 1, + 'id': 46, + 'name': 'wine glass' + }, + { + 'color': [109, 63, 54], + 'isthing': 1, + 'id': 47, + 'name': 'cup' + }, + { + 'color': [207, 138, 255], + 'isthing': 1, + 'id': 48, + 'name': 'fork' + }, + { + 'color': [151, 0, 95], + 'isthing': 1, + 'id': 49, + 'name': 'knife' + }, + { + 'color': [9, 80, 61], + 'isthing': 1, + 'id': 50, + 'name': 'spoon' + }, + { + 'color': [84, 105, 51], + 'isthing': 1, + 'id': 51, + 'name': 'bowl' + }, + { + 'color': [74, 65, 105], + 'isthing': 1, + 'id': 52, + 'name': 'banana' + }, + { + 'color': [166, 196, 102], + 'isthing': 1, + 'id': 53, + 'name': 'apple' + }, + { + 'color': [208, 195, 210], + 'isthing': 1, + 'id': 54, + 'name': 'sandwich' + }, + { + 'color': [255, 109, 65], + 'isthing': 1, + 'id': 55, + 'name': 'orange' + }, + { + 'color': [0, 143, 149], + 'isthing': 1, + 'id': 56, + 'name': 'broccoli' + }, + { + 'color': [179, 0, 194], + 'isthing': 1, + 'id': 57, + 'name': 'carrot' + }, + { + 'color': [209, 99, 106], + 'isthing': 1, + 'id': 58, + 'name': 'hot dog' + }, + { + 'color': [5, 121, 0], + 'isthing': 1, + 'id': 59, + 'name': 'pizza' + }, + { + 'color': [227, 255, 205], + 'isthing': 1, + 'id': 60, + 'name': 'donut' + }, + { + 'color': [147, 186, 208], + 'isthing': 1, + 'id': 61, + 'name': 'cake' + }, + { + 'color': [153, 69, 1], + 'isthing': 1, + 'id': 62, + 'name': 'chair' + }, + { + 'color': [3, 95, 161], + 'isthing': 1, + 'id': 63, + 'name': 'couch' + }, + { + 'color': [163, 255, 0], + 'isthing': 1, + 'id': 64, + 'name': 'potted plant' + }, + { + 'color': [123, 104, 238], + 'isthing': 1, + 'id': 65, + 'name': 'bed' + }, + { + 'color': [255., 187., 120.], + 'isthing': 1, + 'id': 67, + 'name': 'dining table' + }, + { + 'color': [0, 165, 120], + 'isthing': 1, + 'id': 70, + 'name': 'toilet' + }, + { + 'color': [183, 130, 88], + 'isthing': 1, + 'id': 72, + 'name': 'tv' + }, + { + 'color': [95, 32, 0], + 'isthing': 1, + 'id': 73, + 'name': 'laptop' + }, + { + 'color': [130, 114, 135], + 'isthing': 1, + 'id': 74, + 'name': 'mouse' + }, + { + 'color': [110, 129, 133], + 'isthing': 1, + 'id': 75, + 'name': 'remote' + }, + { + 'color': [166, 74, 118], + 'isthing': 1, + 'id': 76, + 'name': 'keyboard' + }, + { + 'color': [219, 142, 185], + 'isthing': 1, + 'id': 77, + 'name': 'cell phone' + }, + { + 'color': [79, 210, 114], + 'isthing': 1, + 'id': 78, + 'name': 'microwave' + }, + { + 'color': [178, 90, 62], + 'isthing': 1, + 'id': 79, + 'name': 'oven' + }, + { + 'color': [65, 70, 15], + 'isthing': 1, + 'id': 80, + 'name': 'toaster' + }, + { + 'color': [127, 167, 115], + 'isthing': 1, + 'id': 81, + 'name': 'sink' + }, + { + 'color': [59, 105, 106], + 'isthing': 1, + 'id': 82, + 'name': 'refrigerator' + }, + { + 'color': [142, 108, 45], + 'isthing': 1, + 'id': 84, + 'name': 'book' + }, + { + 'color': [196, 172, 0], + 'isthing': 1, + 'id': 85, + 'name': 'clock' + }, + { + 'color': [95, 54, 80], + 'isthing': 1, + 'id': 86, + 'name': 'vase' + }, + { + 'color': [128, 76, 255], + 'isthing': 1, + 'id': 87, + 'name': 'scissors' + }, + { + 'color': [201, 57, 1], + 'isthing': 1, + 'id': 88, + 'name': 'teddy bear' + }, + { + 'color': [246, 0, 122], + 'isthing': 1, + 'id': 89, + 'name': 'hair drier' + }, + { + 'color': [191, 162, 208], + 
'isthing': 1, + 'id': 90, + 'name': 'toothbrush' + }, + { + 'color': [255, 255, 128], + 'isthing': 0, + 'id': 92, + 'name': 'banner' + }, + { + 'color': [147, 211, 203], + 'isthing': 0, + 'id': 93, + 'name': 'blanket' + }, + { + 'color': [150, 100, 100], + 'isthing': 0, + 'id': 95, + 'name': 'bridge' + }, + { + 'color': [168, 171, 172], + 'isthing': 0, + 'id': 100, + 'name': 'cardboard' + }, + { + 'color': [146, 112, 198], + 'isthing': 0, + 'id': 107, + 'name': 'counter' + }, + { + 'color': [210, 170, 100], + 'isthing': 0, + 'id': 109, + 'name': 'curtain' + }, + { + 'color': [92, 136, 89], + 'isthing': 0, + 'id': 112, + 'name': 'door-stuff' + }, + { + 'color': [255, 193, 193], + 'isthing': 0, + 'id': 118, + 'name': 'floor-wood' + }, + { + 'color': [241, 129, 0], + 'isthing': 0, + 'id': 119, + 'name': 'flower' + }, + { + 'color': [217, 17, 255], + 'isthing': 0, + 'id': 122, + 'name': 'fruit' + }, + { + 'color': [124, 74, 181], + 'isthing': 0, + 'id': 125, + 'name': 'gravel' + }, + { + 'color': [70, 70, 70], + 'isthing': 0, + 'id': 128, + 'name': 'house' + }, + { + 'color': [255, 228, 255], + 'isthing': 0, + 'id': 130, + 'name': 'light' + }, + { + 'color': [154, 208, 0], + 'isthing': 0, + 'id': 133, + 'name': 'mirror-stuff' + }, + { + 'color': [193, 0, 92], + 'isthing': 0, + 'id': 138, + 'name': 'net' + }, + { + 'color': [76, 91, 113], + 'isthing': 0, + 'id': 141, + 'name': 'pillow' + }, + { + 'color': [255, 180, 195], + 'isthing': 0, + 'id': 144, + 'name': 'platform' + }, + { + 'color': [106, 154, 176], + 'isthing': 0, + 'id': 145, + 'name': 'playingfield' + }, + { + 'color': [230, 150, 140], + 'isthing': 0, + 'id': 147, + 'name': 'railroad' + }, + { + 'color': [60, 143, 255], + 'isthing': 0, + 'id': 148, + 'name': 'river' + }, + { + 'color': [128, 64, 128], + 'isthing': 0, + 'id': 149, + 'name': 'road' + }, + { + 'color': [92, 82, 55], + 'isthing': 0, + 'id': 151, + 'name': 'roof' + }, + { + 'color': [254, 212, 124], + 'isthing': 0, + 'id': 154, + 'name': 'sand' + }, + { + 'color': [73, 77, 174], + 'isthing': 0, + 'id': 155, + 'name': 'sea' + }, + { + 'color': [255, 160, 98], + 'isthing': 0, + 'id': 156, + 'name': 'shelf' + }, + { + 'color': [255, 255, 255], + 'isthing': 0, + 'id': 159, + 'name': 'snow' + }, + { + 'color': [104, 84, 109], + 'isthing': 0, + 'id': 161, + 'name': 'stairs' + }, + { + 'color': [169, 164, 131], + 'isthing': 0, + 'id': 166, + 'name': 'tent' + }, + { + 'color': [225, 199, 255], + 'isthing': 0, + 'id': 168, + 'name': 'towel' + }, + { + 'color': [137, 54, 74], + 'isthing': 0, + 'id': 171, + 'name': 'wall-brick' + }, + { + 'color': [135, 158, 223], + 'isthing': 0, + 'id': 175, + 'name': 'wall-stone' + }, + { + 'color': [7, 246, 231], + 'isthing': 0, + 'id': 176, + 'name': 'wall-tile' + }, + { + 'color': [107, 255, 200], + 'isthing': 0, + 'id': 177, + 'name': 'wall-wood' + }, + { + 'color': [58, 41, 149], + 'isthing': 0, + 'id': 178, + 'name': 'water-other' + }, + { + 'color': [183, 121, 142], + 'isthing': 0, + 'id': 180, + 'name': 'window-blind' + }, + { + 'color': [255, 73, 97], + 'isthing': 0, + 'id': 181, + 'name': 'window-other' + }, + { + 'color': [107, 142, 35], + 'isthing': 0, + 'id': 184, + 'name': 'tree-merged' + }, + { + 'color': [190, 153, 153], + 'isthing': 0, + 'id': 185, + 'name': 'fence-merged' + }, + { + 'color': [146, 139, 141], + 'isthing': 0, + 'id': 186, + 'name': 'ceiling-merged' + }, + { + 'color': [70, 130, 180], + 'isthing': 0, + 'id': 187, + 'name': 'sky-other-merged' + }, + { + 'color': [134, 199, 156], + 'isthing': 0, + 'id': 188, + 'name': 
'cabinet-merged' + }, + { + 'color': [209, 226, 140], + 'isthing': 0, + 'id': 189, + 'name': 'table-merged' + }, + { + 'color': [96, 36, 108], + 'isthing': 0, + 'id': 190, + 'name': 'floor-other-merged' + }, + { + 'color': [96, 96, 96], + 'isthing': 0, + 'id': 191, + 'name': 'pavement-merged' + }, + { + 'color': [64, 170, 64], + 'isthing': 0, + 'id': 192, + 'name': 'mountain-merged' + }, + { + 'color': [152, 251, 152], + 'isthing': 0, + 'id': 193, + 'name': 'grass-merged' + }, + { + 'color': [208, 229, 228], + 'isthing': 0, + 'id': 194, + 'name': 'dirt-merged' + }, + { + 'color': [206, 186, 171], + 'isthing': 0, + 'id': 195, + 'name': 'paper-merged' + }, + { + 'color': [152, 161, 64], + 'isthing': 0, + 'id': 196, + 'name': 'food-other-merged' + }, + { + 'color': [116, 112, 0], + 'isthing': 0, + 'id': 197, + 'name': 'building-other-merged' + }, + { + 'color': [0, 114, 143], + 'isthing': 0, + 'id': 198, + 'name': 'rock-merged' + }, + { + 'color': [102, 102, 156], + 'isthing': 0, + 'id': 199, + 'name': 'wall-other-merged' + }, + { + 'color': [250, 141, 255], + 'isthing': 0, + 'id': 200, + 'name': 'rug-merged' + }, +] + +EMBODIED_CATE = [ + 'adhesive tape', 'air conditioner', 'alarm', 'album', 'arch', 'backpack', + 'bag', 'balcony', 'ball', 'banister', 'bar', 'barricade', 'baseboard', + 'basin', 'basket', 'bathtub', 'beam', 'beanbag', 'bed', 'bench', 'bicycle', + 'bidet', 'bin', 'blackboard', 'blanket', 'blinds', 'board', 'body loofah', + 'book', 'boots', 'bottle', 'bowl', 'box', 'bread', 'broom', 'brush', + 'bucket', 'cabinet', 'calendar', 'camera', 'can', 'candle', 'candlestick', + 'cap', 'car', 'carpet', 'cart', 'case', 'chair', 'chandelier', 'cleanser', + 'clock', 'clothes', 'clothes dryer', 'coat hanger', 'coffee maker', 'coil', + 'column', 'commode', 'computer', 'conducting wire', 'container', 'control', + 'copier', 'cosmetics', 'couch', 'counter', 'countertop', 'crate', 'crib', + 'cube', 'cup', 'curtain', 'cushion', 'decoration', 'desk', 'detergent', + 'device', 'dish rack', 'dishwasher', 'dispenser', 'divider', 'door', + 'door knob', 'doorframe', 'doorway', 'drawer', 'dress', 'dresser', 'drum', + 'duct', 'dumbbell', 'dustpan', 'dvd', 'eraser', 'excercise equipment', + 'fan', 'faucet', 'fence', 'file', 'fire extinguisher', 'fireplace', + 'flowerpot', 'flush', 'folder', 'food', 'footstool', 'frame', 'fruit', + 'furniture', 'garage door', 'garbage', 'glass', 'globe', 'glove', + 'grab bar', 'grass', 'guitar', 'hair dryer', 'hamper', 'handle', 'hanger', + 'hat', 'headboard', 'headphones', 'heater', 'helmets', 'holder', 'hook', + 'humidifier', 'ironware', 'jacket', 'jalousie', 'jar', 'kettle', + 'keyboard', 'kitchen island', 'kitchenware', 'knife', 'label', 'ladder', + 'lamp', 'laptop', 'ledge', 'letter', 'light', 'luggage', 'machine', + 'magazine', 'mailbox', 'map', 'mask', 'mat', 'mattress', 'menu', + 'microwave', 'mirror', 'molding', 'monitor', 'mop', 'mouse', 'napkins', + 'notebook', 'ottoman', 'oven', 'pack', 'package', 'pad', 'pan', 'panel', + 'paper', 'paper cutter', 'partition', 'pedestal', 'pen', 'person', 'piano', + 'picture', 'pillar', 'pillow', 'pipe', 'pitcher', 'plant', 'plate', + 'player', 'plug', 'plunger', 'pool', 'pool table', 'poster', 'pot', + 'price tag', 'printer', 'projector', 'purse', 'rack', 'radiator', 'radio', + 'rail', 'range hood', 'refrigerator', 'remote control', 'ridge', 'rod', + 'roll', 'roof', 'rope', 'sack', 'salt', 'scale', 'scissors', 'screen', + 'seasoning', 'shampoo', 'sheet', 'shelf', 'shirt', 'shoe', 'shovel', + 'shower', 'sign', 'sink', 'soap', 'soap 
dish', 'soap dispenser', 'socket', + 'speaker', 'sponge', 'spoon', 'stairs', 'stall', 'stand', 'stapler', + 'statue', 'steps', 'stick', 'stool', 'stopcock', 'stove', 'structure', + 'sunglasses', 'support', 'switch', 'table', 'tablet', 'teapot', + 'telephone', 'thermostat', 'tissue', 'tissue box', 'toaster', 'toilet', + 'toilet paper', 'toiletry', 'tool', 'toothbrush', 'toothpaste', 'towel', + 'toy', 'tray', 'treadmill', 'trophy', 'tube', 'tv', 'umbrella', 'urn', + 'utensil', 'vacuum cleaner', 'vanity', 'vase', 'vent', 'ventilation', + 'wardrobe', 'washbasin', 'washing machine', 'water cooler', 'water heater', + 'window', 'window frame', 'windowsill', 'wine', 'wire', 'wood', 'wrap' +] + + +class ColorMap(object): + + def __init__(self, + classes=EMBODIED_CATE, + init_file='./utils/full_color_map.txt'): + self.color_map = dict() + if init_file is not None: + with open(init_file, 'r') as f: + pre_data = f.readlines() + for ins in pre_data: + s = ins.strip() + cate = s.split('[')[0].strip() + color = eval(s[len(cate):]) + self.color_map[cate] = color + + self.classes = classes + self.color_pool = COCO_COLOR + + for label in classes: + if label not in self.color_map: + x = random.choice(self.color_pool) + self.color_map[label] = x['color'] + + self.inv_color_map = dict() + for key, value in self.color_map.items(): + color_idx = value[0] * 256 * 256 + value[1] * 256 + value[2] + if color_idx in self.inv_color_map: + self.inv_color_map[color_idx].append(key) + else: + self.inv_color_map[color_idx] = [key] + + self.visible_label = set() + + def save(self, out_file): + with open(out_file, 'w') as f: + for key, value in self.color_map.items(): + print(key, value, file=f) + + def get_color(self, label): + color = self.color_map[label] + if label in self.visible_label: + return color + color_idx = color[0] * 256 * 256 + color[1] * 256 + color[2] + bo = False + for value in self.inv_color_map[color_idx]: + if value in self.visible_label: + if not bo: + print('same color: ', end='') + bo = True + print(value, ' ', end='') + + if bo: + print(label) + + self.visible_label.add(label) + return color + + def clear_stat(self): + self.visible_label.clear() + + +if __name__ == '__main__': + a = ColorMap(init_file='occ_color_map.txt') + print(a.get_color('bed')) diff --git a/embodiedscan/utils/full_color_map.txt b/embodiedscan/utils/full_color_map.txt new file mode 100644 index 0000000..0c62ffe --- /dev/null +++ b/embodiedscan/utils/full_color_map.txt @@ -0,0 +1,287 @@ +floor [255, 193, 193] +wall [137, 54, 74] +chair [153, 69, 1] +cabinet [134, 199, 156] +door [92, 136, 89] +table [255.0, 187.0, 120.0] +couch [3, 95, 161] +shelf [255, 160, 98] +window [183, 121, 142] +bed [123, 104, 238] +curtain [210, 170, 100] +plant [163, 255, 0] +stairs [104, 84, 109] +pillow [76, 91, 113] +counter [146, 112, 198] +bench [250, 0, 30] +rail [230, 150, 140] +sink [135, 206, 250] +mirror [154, 208, 0] +toilet [0, 165, 120] +refrigerator [59, 105, 106] +book [142, 108, 45] +tv [183, 130, 88] +blanket [147, 211, 203] +rack [255, 208, 186] +towel [225, 199, 255] +backpack [255, 179, 240] +roof [92, 82, 55] +bag [209, 0, 151] +board [133, 129, 255] +bicycle [119, 11, 32] +oven [178, 90, 62] +microwave [79, 210, 114] +desk [109, 63, 54] +doorframe [199, 100, 0] +wardrobe [7, 246, 231] +picture [171, 134, 1] +bathtub [92, 0, 73] +box [188, 208, 182] +stand [146, 139, 141] +clothes [96, 96, 96] +lamp [107, 255, 200] +dresser [206, 186, 171] +stool [73, 77, 174] +fireplace [255, 77, 255] +commode [102, 102, 156] +washing machine [152, 
251, 152] +monitor [208, 195, 210] +window frame [227, 255, 205] +radiator [191, 162, 208] +mat [250, 141, 255] +shower [154, 255, 154] +ottoman [95, 32, 0] +column [60, 143, 255] +blinds [134, 134, 103] +stove [128, 64, 128] +bar [72, 0, 118] +pillar [220, 20, 60] +bin [187, 255, 255] +heater [209, 226, 140] +clothes dryer [100, 170, 30] +blackboard [0, 82, 0] +decoration [107, 142, 35] +steps [120, 166, 157] +windowsill [9, 80, 61] +cushion [0, 228, 0] +carpet [175, 116, 175] +copier [241, 129, 0] +countertop [207, 138, 255] +basket [0, 0, 70] +mailbox [150, 100, 100] +kitchen island [220, 220, 0] +washbasin [0, 80, 100] +drawer [0, 220, 176] +piano [78, 180, 255] +exercise equipment [151, 0, 95] +beam [255, 255, 128] +partition [168, 171, 172] +printer [179, 0, 194] +frame [255, 180, 195] +object [0, 0, 0] +adhesive tape [0, 220, 176] +air conditioner [109, 63, 54] +alarm [0, 114, 143] +album [147, 186, 208] +arch [135, 158, 223] +balcony [70, 70, 70] +ball [96, 96, 96] +banister [196, 172, 0] +barricade [45, 89, 255] +baseboard [153, 69, 1] +basin [255.0, 187.0, 120.0] +beanbag [190, 153, 153] +bidet [123, 104, 238] +body loofah [196, 172, 0] +boots [134, 199, 156] +bottle [241, 129, 0] +bowl [92, 136, 89] +bread [119, 11, 32] +broom [0, 226, 252] +brush [255, 255, 128] +bucket [255, 73, 97] +calendar [76, 91, 113] +camera [72, 0, 118] +can [109, 63, 54] +candle [78, 180, 255] +candlestick [104, 84, 109] +cap [128, 76, 255] +car [107, 142, 35] +cart [255, 255, 128] +case [0, 0, 230] +chandelier [169, 164, 131] +cleanser [0, 165, 120] +clock [190, 153, 153] +coat hanger [179, 0, 194] +coffee maker [0, 82, 0] +coil [255, 179, 240] +computer [225, 199, 255] +conducting wire [150, 100, 100] +container [0, 0, 70] +control [255, 77, 255] +cosmetics [142, 108, 45] +crate [0, 226, 252] +crib [169, 164, 131] +cube [116, 112, 0] +cup [175, 116, 175] +detergent [255, 208, 186] +device [146, 139, 141] +dish rack [0, 0, 142] +dishwasher [92, 82, 55] +dispenser [95, 32, 0] +divider [219, 142, 185] +door knob [166, 74, 118] +doorway [134, 134, 103] +dress [0, 114, 143] +drum [107, 142, 35] +duct [0, 80, 100] +dumbbell [0, 0, 192] +dustpan [78, 180, 255] +dvd [0, 143, 149] +eraser [0, 82, 0] +fan [0, 0, 70] +faucet [84, 105, 51] +fence [190, 153, 153] +file [255, 228, 255] +fire extinguisher [107, 255, 200] +flowerpot [9, 80, 61] +flush [227, 255, 205] +folder [208, 229, 228] +food [109, 63, 54] +footstool [133, 129, 255] +fruit [179, 0, 194] +furniture [220, 20, 60] +garage door [217, 17, 255] +garbage [0, 82, 0] +glass [255, 99, 164] +globe [255, 77, 255] +glove [166, 196, 102] +grab bar [145, 148, 174] +grass [0, 60, 100] +guitar [73, 77, 174] +hair dryer [169, 164, 131] +hamper [241, 129, 0] +handle [142, 108, 45] +hanger [150, 100, 100] +hat [154, 208, 0] +headboard [171, 134, 1] +headphones [124, 74, 181] +helmets [209, 226, 140] +holder [151, 0, 95] +hook [92, 136, 89] +humidifier [209, 99, 106] +ironware [127, 167, 115] +jacket [255, 73, 97] +jalousie [255, 179, 240] +jar [106, 154, 176] +kettle [196, 172, 0] +keyboard [0, 125, 92] +kitchenware [74, 65, 105] +knife [70, 130, 180] +label [0, 228, 0] +ladder [0, 114, 143] +laptop [255, 180, 195] +ledge [58, 41, 149] +letter [0, 0, 192] +light [78, 180, 255] +luggage [0, 226, 252] +machine [197, 226, 255] +magazine [199, 100, 0] +map [183, 121, 142] +mask [74, 65, 105] +mattress [255, 179, 240] +menu [255, 255, 128] +molding [104, 84, 109] +mop [199, 100, 0] +mouse [5, 121, 0] +napkins [165, 42, 42] +notebook [175, 116, 175] +pack [0, 143, 149] 
+package [166, 196, 102] +pad [208, 229, 228] +pan [209, 99, 106] +panel [201, 57, 1] +paper [255, 179, 240] +paper cutter [207, 138, 255] +pedestal [64, 170, 64] +pen [193, 0, 92] +person [7, 246, 231] +pipe [255, 180, 195] +pitcher [220, 20, 60] +plate [142, 108, 45] +player [0, 143, 149] +plug [255, 77, 255] +plunger [165, 42, 42] +pool [153, 69, 1] +pool table [0, 0, 230] +poster [130, 114, 135] +pot [96, 36, 108] +price tag [255, 77, 255] +projector [179, 0, 194] +purse [0, 228, 0] +radio [116, 112, 0] +range hood [199, 100, 0] +remote control [188, 208, 182] +ridge [59, 105, 106] +rod [207, 138, 255] +roll [123, 104, 238] +rope [110, 76, 0] +sack [190, 153, 153] +salt [250, 0, 30] +scale [58, 41, 149] +scissors [60, 143, 255] +screen [0, 82, 0] +seasoning [254, 212, 124] +shampoo [70, 130, 180] +sheet [151, 0, 95] +shirt [190, 153, 153] +shoe [199, 100, 0] +shovel [241, 129, 0] +sign [208, 195, 210] +soap [109, 63, 54] +soap dish [166, 74, 118] +soap dispenser [95, 32, 0] +socket [255, 255, 255] +speaker [65, 70, 15] +sponge [0, 220, 176] +spoon [134, 134, 103] +stall [0, 60, 100] +stapler [246, 0, 122] +statue [196, 172, 0] +stick [0, 165, 120] +stopcock [0, 60, 100] +structure [220, 20, 60] +sunglasses [142, 108, 45] +support [209, 226, 140] +switch [7, 246, 231] +tablet [137, 54, 74] +teapot [0, 80, 100] +telephone [220, 220, 0] +thermostat [128, 76, 255] +tissue [73, 77, 174] +tissue box [96, 96, 96] +toaster [106, 0, 228] +toilet paper [84, 105, 51] +toiletry [128, 64, 128] +tool [220, 20, 60] +toothbrush [130, 114, 135] +toothpaste [0, 143, 149] +toy [255.0, 187.0, 120.0] +tray [255, 179, 240] +treadmill [166, 74, 118] +trophy [0, 220, 176] +tube [255, 255, 128] +umbrella [250, 0, 30] +urn [152, 251, 152] +utensil [220, 220, 0] +vacuum cleaner [96, 36, 108] +vanity [5, 121, 0] +vase [255, 193, 193] +vent [209, 226, 140] +ventilation [123, 104, 238] +water cooler [255, 255, 128] +water heater [145, 148, 174] +wine [220, 220, 0] +wire [96, 36, 108] +wood [127, 167, 115] +wrap [175, 116, 175] diff --git a/embodiedscan/utils/img_drawer.py b/embodiedscan/utils/img_drawer.py new file mode 100644 index 0000000..94ffe4a --- /dev/null +++ b/embodiedscan/utils/img_drawer.py @@ -0,0 +1,127 @@ +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import open3d as o3d +from matplotlib import path + + +class ImageDrawer: + + def __init__(self, image, verbose=False): + self.verbose = verbose + if self.verbose: + print('Loading image', image) + img = cv2.imread(image) + if self.verbose: + print('Loading image Complete') + img = img[:, :, ::-1].astype(np.float32) # BGR to RGB + self.occupied = np.zeros((img.shape[0], img.shape[1]), dtype=bool) + self.img = img + self.EPS = 1e-4 + self.ALPHA = 0.75 + + def draw_text(self, + text, + font=cv2.FONT_HERSHEY_SIMPLEX, + pos=(0, 0), + size=(0, 0), + font_scale=1, + font_thickness=2, + text_color=(0, 255, 0), + text_color_bg=(0, 0, 0)): + + x, y = pos + w, h = size + text_size, _ = cv2.getTextSize(text, font, font_scale, font_thickness) + text_w, text_h = text_size + if y * 2 > h: + dy = -10 + else: + dy = 10 + + try: + while self.occupied[y, x] or self.occupied[ + y, x + + text_w] or self.occupied[y + text_h, + x] or self.occupied[y + text_h, + x + text_w]: + y += dy + except: # noqa: E722 + pass + # TODO + cv2.rectangle(self.img, (x, y), (x + text_w, y + text_h), + text_color_bg, -1) + cv2.putText(self.img, text, (x, y + text_h + font_scale - 1), font, + font_scale, text_color, font_thickness) + + self.occupied[y:y + text_h, x:x + text_w] 
= True
+
+    def draw_box3d(self, box, color, label, extrinsic, intrinsic):
+        """Draw a 3D box projected onto the image.
+
+        Args:
+            box (:obj:`open3d.geometry.OrientedBoundingBox`): Box to draw,
+                given in world coordinates.
+            color (Sequence[int]): RGB color with values in [0, 255].
+            label (str): Category label drawn next to the box.
+            extrinsic (np.ndarray): 4x4 camera-to-world transformation.
+            intrinsic (np.ndarray): 4x4 camera intrinsic matrix.
+        """
+        extrinsic_w2c = np.linalg.inv(extrinsic)
+        h, w, _ = self.img.shape
+        x, y = np.meshgrid(np.arange(w), np.arange(h))
+        x, y = x.flatten(), y.flatten()
+        pixel_points = np.vstack((x, y)).T
+
+        # Skip boxes that contain the camera center.
+        camera_pos_in_world = (
+            extrinsic @ np.array([0, 0, 0, 1]).reshape(4, 1)).transpose()
+        if self._inside_box(box, camera_pos_in_world):
+            return
+
+        # Project the 8 box corners into the image plane.
+        corners = np.asarray(box.get_box_points())
+        corners = corners[[0, 1, 7, 2, 3, 6, 4, 5]]
+        corners = np.concatenate(
+            [corners, np.ones((corners.shape[0], 1))], axis=1)
+        corners_img = intrinsic @ extrinsic_w2c @ corners.transpose()
+        corners_img = corners_img.transpose()
+        corners_pixel = np.zeros((corners_img.shape[0], 2))
+        for i in range(corners_img.shape[0]):
+            corners_pixel[i] = corners_img[i][:2] / np.abs(corners_img[i][2])
+        lines = [[0, 1], [1, 2], [2, 3], [3, 0], [4, 5], [5, 6], [6, 7],
+                 [7, 4], [0, 4], [1, 5], [2, 6], [3, 7]]
+        faces = [[0, 1, 2, 3], [4, 5, 6, 7], [0, 1, 5, 4], [3, 2, 6, 7],
+                 [0, 3, 7, 4], [1, 2, 6, 5]]
+        # Draw the wireframe, skipping edges with a corner behind the camera.
+        for line in lines:
+            if (corners_img[line][:, 2] < self.EPS).any():
+                continue
+            px = corners_pixel[line[0]].astype(np.int32)
+            py = corners_pixel[line[1]].astype(np.int32)
+            cv2.line(self.img, (px[0], px[1]), (py[0], py[1]), color, 2)
+
+        # Alpha-blend the visible box faces onto the image.
+        all_mask = np.zeros((h, w), dtype=bool)
+        for face in faces:
+            if (corners_img[face][:, 2] < self.EPS).any():
+                continue
+            pts = corners_pixel[face]
+            p = path.Path(pts[:, :2])
+            mask = p.contains_points(pixel_points).reshape((h, w))
+            all_mask = np.logical_or(all_mask, mask)
+        self.img[all_mask] = self.img[all_mask] * self.ALPHA + (
+            1 - self.ALPHA) * np.array(color)
+
+        if all_mask.any():
+            textpos = np.min(corners_pixel, axis=0).astype(np.int32)
+            textpos[0] = np.clip(textpos[0], a_min=0, a_max=w)
+            textpos[1] = np.clip(textpos[1], a_min=0, a_max=h)
+            self.draw_text(label,
+                           pos=textpos,
+                           size=(w, h),
+                           text_color=(255, 255, 255),
+                           text_color_bg=color)
+
+    def show(self):
+        plt.imshow(self.img / 255.0)
+        plt.show()
+
+    @staticmethod
+    def _inside_box(box, point):
+        point_vec = o3d.utility.Vector3dVector(point[:, :3])
+        inside_idx = box.get_point_indices_within_bounding_box(point_vec)
+        if len(inside_idx) > 0:
+            return True
+        return False
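
For reference, a minimal usage sketch of the visualization utilities added above (ColorMap and ImageDrawer). The image path, box geometry and camera matrices below are placeholders, and the imports assume the package is importable as `embodiedscan`; this illustrates the intended call pattern and is not part of the patch itself.

import numpy as np
import open3d as o3d

from embodiedscan.utils.color_selector import ColorMap
from embodiedscan.utils.img_drawer import ImageDrawer

# Color lookup for EmbodiedScan categories (path relative to the repo root).
color_map = ColorMap(init_file='embodiedscan/utils/full_color_map.txt')

# A hypothetical 3D box in world coordinates: center, rotation, extent.
box = o3d.geometry.OrientedBoundingBox(
    np.array([0.5, 0.0, 3.0]),  # center
    np.eye(3),                  # rotation
    np.array([1.0, 1.0, 1.0]))  # extent (dx, dy, dz)

# 4x4 camera-to-world extrinsic and 4x4 intrinsic, as draw_box3d expects.
extrinsic = np.eye(4)
intrinsic = np.eye(4)
intrinsic[:3, :3] = np.array([[577.6, 0.0, 319.5],
                              [0.0, 577.6, 239.5],
                              [0.0, 0.0, 1.0]])

drawer = ImageDrawer('demo/00000.jpg', verbose=True)  # placeholder image path
drawer.draw_box3d(box,
                  color=color_map.get_color('bed'),
                  label='bed',
                  extrinsic=extrinsic,
                  intrinsic=intrinsic)
drawer.show()

Note that ImageDrawer converts the loaded image from BGR to RGB and displays it with matplotlib, so the RGB triples stored in the color map can be passed straight through without channel reordering.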