From 07fd26d98716fe1f8e229c72bbb52709db746ea2 Mon Sep 17 00:00:00 2001 From: Kadir Yilmaz Date: Mon, 4 Dec 2023 00:42:39 +0100 Subject: [PATCH] Code --- .gitignore | 7 + conf/augmentation/volumentations_aug.yaml | 53 ++ conf/callbacks/callbacks_panoptic.yaml | 11 + conf/config_panoptic_4d.yaml | 35 + .../collation_functions/voxelize_collate.yaml | 16 + conf/data/data_loaders/simple_loader.yaml | 22 + conf/data/datasets/semantic_kitti.yaml | 33 + conf/data/kitti.yaml | 32 + conf/logging/full.yaml | 7 + conf/loss/set_criterion.yaml | 8 + conf/matcher/hungarian_matcher.yaml | 6 + conf/metric/lstq.yaml | 5 + conf/model/mask4d.yaml | 22 + conf/optimizer/adamw.yaml | 3 + conf/scheduler/onecyclelr.yaml | 10 + conf/semantic-kitti.yaml | 211 +++++ conf/trainer/trainer30.yaml | 4 + datasets/lidar.py | 201 +++++ .../semantic_kitti_preprocessing.py | 253 ++++++ datasets/utils.py | 110 +++ main_panoptic.py | 101 +++ models/__init__.py | 35 + models/criterion.py | 158 ++++ models/mask4d.py | 273 ++++++ models/matcher.py | 144 +++ models/metrics/__init__.py | 1 + models/metrics/panoptic_quality.py | 146 +++ models/model.py | 27 + models/modules/__init__.py | 0 models/modules/attention.py | 115 +++ models/modules/common.py | 258 ++++++ models/modules/helpers_3detr.py | 116 +++ models/modules/resnet_block.py | 149 ++++ models/position_embedding.py | 65 ++ models/res16unet.py | 425 +++++++++ models/resnet.py | 240 +++++ models/resunet.py | 593 +++++++++++++ scripts/preprocess_kitti.sh | 9 + scripts/test.sh | 10 + scripts/train.sh | 8 + scripts/val.sh | 10 + .../pointnet2/_ext_src/include/ball_query.h | 7 + .../pointnet2/_ext_src/include/cuda_utils.h | 43 + .../pointnet2/_ext_src/include/group_points.h | 8 + .../pointnet2/_ext_src/include/interpolate.h | 12 + .../pointnet2/_ext_src/include/sampling.h | 9 + .../pointnet2/_ext_src/include/utils.h | 28 + .../pointnet2/_ext_src/src/ball_query.cpp | 35 + .../pointnet2/_ext_src/src/ball_query_gpu.cu | 57 ++ .../pointnet2/_ext_src/src/bindings.cpp | 22 + .../pointnet2/_ext_src/src/group_points.cpp | 63 ++ .../_ext_src/src/group_points_gpu.cu | 78 ++ .../pointnet2/_ext_src/src/interpolate.cpp | 101 +++ .../pointnet2/_ext_src/src/interpolate_gpu.cu | 157 ++++ .../pointnet2/_ext_src/src/sampling.cpp | 88 ++ .../pointnet2/_ext_src/src/sampling_gpu.cu | 232 +++++ third_party/pointnet2/pointnet2_modules.py | 514 +++++++++++ third_party/pointnet2/pointnet2_test.py | 30 + third_party/pointnet2/pointnet2_utils.py | 422 +++++++++ third_party/pointnet2/pytorch_utils.py | 295 +++++++ third_party/pointnet2/setup.py | 35 + trainer/trainer.py | 305 +++++++ utils/__init__.py | 0 utils/pointops2/__init__.py | 0 utils/pointops2/functions/__init__.py | 0 utils/pointops2/functions/pointops.py | 829 ++++++++++++++++++ utils/pointops2/functions/pointops2.py | 214 +++++ .../pointops2/functions/pointops_ablation.py | 215 +++++ .../functions/test_attention_op_step1.py | 78 ++ .../functions/test_attention_op_step1_v2.py | 97 ++ .../functions/test_attention_op_step2.py | 55 ++ .../test_relative_pos_encoding_op_step1.py | 56 ++ .../test_relative_pos_encoding_op_step1_v2.py | 64 ++ .../test_relative_pos_encoding_op_step1_v3.py | 90 ++ .../test_relative_pos_encoding_op_step2.py | 66 ++ .../test_relative_pos_encoding_op_step2_v2.py | 92 ++ utils/pointops2/setup.py | 42 + utils/pointops2/src/__init__.py | 0 .../src/aggregation/aggregation_cuda.cpp | 29 + .../aggregation/aggregation_cuda_kernel.cu | 53 ++ .../src/aggregation/aggregation_cuda_kernel.h | 20 + .../src/attention/attention_cuda.cpp | 
56 ++ .../src/attention/attention_cuda_kernel.cu | 103 +++ .../src/attention/attention_cuda_kernel.h | 26 + .../attention_v2/attention_cuda_kernel_v2.cu | 193 ++++ .../attention_v2/attention_cuda_kernel_v2.h | 26 + .../src/attention_v2/attention_cuda_v2.cpp | 56 ++ utils/pointops2/src/cuda_utils.h | 23 + .../pointops2/src/grouping/grouping_cuda.cpp | 22 + .../src/grouping/grouping_cuda_kernel.cu | 40 + .../src/grouping/grouping_cuda_kernel.h | 20 + .../src/interpolation/interpolation_cuda.cpp | 24 + .../interpolation_cuda_kernel.cu | 47 + .../interpolation/interpolation_cuda_kernel.h | 20 + .../pointops2/src/knnquery/knnquery_cuda.cpp | 17 + .../src/knnquery/knnquery_cuda_kernel.cu | 116 +++ .../src/knnquery/knnquery_cuda_kernel.h | 18 + utils/pointops2/src/pointops_api.cpp | 45 + .../src/rpe/relative_pos_encoding_cuda.cpp | 60 ++ .../rpe/relative_pos_encoding_cuda_kernel.cu | 134 +++ .../rpe/relative_pos_encoding_cuda_kernel.h | 26 + .../relative_pos_encoding_cuda_kernel_v2.cu | 525 +++++++++++ .../relative_pos_encoding_cuda_kernel_v2.h | 32 + .../rpe_v2/relative_pos_encoding_cuda_v2.cpp | 111 +++ .../pointops2/src/sampling/sampling_cuda.cpp | 16 + .../src/sampling/sampling_cuda_kernel.cu | 171 ++++ .../src/sampling/sampling_cuda_kernel.h | 18 + .../src/subtraction/subtraction_cuda.cpp | 24 + .../subtraction/subtraction_cuda_kernel.cu | 44 + .../src/subtraction/subtraction_cuda_kernel.h | 20 + utils/utils.py | 101 +++ 111 files changed, 10607 insertions(+) create mode 100644 .gitignore create mode 100644 conf/augmentation/volumentations_aug.yaml create mode 100644 conf/callbacks/callbacks_panoptic.yaml create mode 100644 conf/config_panoptic_4d.yaml create mode 100644 conf/data/collation_functions/voxelize_collate.yaml create mode 100644 conf/data/data_loaders/simple_loader.yaml create mode 100644 conf/data/datasets/semantic_kitti.yaml create mode 100644 conf/data/kitti.yaml create mode 100644 conf/logging/full.yaml create mode 100644 conf/loss/set_criterion.yaml create mode 100644 conf/matcher/hungarian_matcher.yaml create mode 100644 conf/metric/lstq.yaml create mode 100644 conf/model/mask4d.yaml create mode 100644 conf/optimizer/adamw.yaml create mode 100644 conf/scheduler/onecyclelr.yaml create mode 100644 conf/semantic-kitti.yaml create mode 100644 conf/trainer/trainer30.yaml create mode 100644 datasets/lidar.py create mode 100644 datasets/preprocessing/semantic_kitti_preprocessing.py create mode 100644 datasets/utils.py create mode 100644 main_panoptic.py create mode 100644 models/__init__.py create mode 100644 models/criterion.py create mode 100644 models/mask4d.py create mode 100644 models/matcher.py create mode 100644 models/metrics/__init__.py create mode 100644 models/metrics/panoptic_quality.py create mode 100644 models/model.py create mode 100644 models/modules/__init__.py create mode 100644 models/modules/attention.py create mode 100644 models/modules/common.py create mode 100644 models/modules/helpers_3detr.py create mode 100644 models/modules/resnet_block.py create mode 100644 models/position_embedding.py create mode 100644 models/res16unet.py create mode 100644 models/resnet.py create mode 100644 models/resunet.py create mode 100755 scripts/preprocess_kitti.sh create mode 100755 scripts/test.sh create mode 100755 scripts/train.sh create mode 100755 scripts/val.sh create mode 100644 third_party/pointnet2/_ext_src/include/ball_query.h create mode 100644 third_party/pointnet2/_ext_src/include/cuda_utils.h create mode 100644 
third_party/pointnet2/_ext_src/include/group_points.h create mode 100644 third_party/pointnet2/_ext_src/include/interpolate.h create mode 100644 third_party/pointnet2/_ext_src/include/sampling.h create mode 100644 third_party/pointnet2/_ext_src/include/utils.h create mode 100644 third_party/pointnet2/_ext_src/src/ball_query.cpp create mode 100644 third_party/pointnet2/_ext_src/src/ball_query_gpu.cu create mode 100644 third_party/pointnet2/_ext_src/src/bindings.cpp create mode 100644 third_party/pointnet2/_ext_src/src/group_points.cpp create mode 100644 third_party/pointnet2/_ext_src/src/group_points_gpu.cu create mode 100644 third_party/pointnet2/_ext_src/src/interpolate.cpp create mode 100644 third_party/pointnet2/_ext_src/src/interpolate_gpu.cu create mode 100644 third_party/pointnet2/_ext_src/src/sampling.cpp create mode 100644 third_party/pointnet2/_ext_src/src/sampling_gpu.cu create mode 100644 third_party/pointnet2/pointnet2_modules.py create mode 100644 third_party/pointnet2/pointnet2_test.py create mode 100644 third_party/pointnet2/pointnet2_utils.py create mode 100644 third_party/pointnet2/pytorch_utils.py create mode 100644 third_party/pointnet2/setup.py create mode 100644 trainer/trainer.py create mode 100644 utils/__init__.py create mode 100644 utils/pointops2/__init__.py create mode 100644 utils/pointops2/functions/__init__.py create mode 100644 utils/pointops2/functions/pointops.py create mode 100644 utils/pointops2/functions/pointops2.py create mode 100644 utils/pointops2/functions/pointops_ablation.py create mode 100644 utils/pointops2/functions/test_attention_op_step1.py create mode 100644 utils/pointops2/functions/test_attention_op_step1_v2.py create mode 100644 utils/pointops2/functions/test_attention_op_step2.py create mode 100644 utils/pointops2/functions/test_relative_pos_encoding_op_step1.py create mode 100644 utils/pointops2/functions/test_relative_pos_encoding_op_step1_v2.py create mode 100644 utils/pointops2/functions/test_relative_pos_encoding_op_step1_v3.py create mode 100644 utils/pointops2/functions/test_relative_pos_encoding_op_step2.py create mode 100644 utils/pointops2/functions/test_relative_pos_encoding_op_step2_v2.py create mode 100644 utils/pointops2/setup.py create mode 100644 utils/pointops2/src/__init__.py create mode 100644 utils/pointops2/src/aggregation/aggregation_cuda.cpp create mode 100644 utils/pointops2/src/aggregation/aggregation_cuda_kernel.cu create mode 100644 utils/pointops2/src/aggregation/aggregation_cuda_kernel.h create mode 100644 utils/pointops2/src/attention/attention_cuda.cpp create mode 100644 utils/pointops2/src/attention/attention_cuda_kernel.cu create mode 100644 utils/pointops2/src/attention/attention_cuda_kernel.h create mode 100644 utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.cu create mode 100644 utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.h create mode 100644 utils/pointops2/src/attention_v2/attention_cuda_v2.cpp create mode 100644 utils/pointops2/src/cuda_utils.h create mode 100644 utils/pointops2/src/grouping/grouping_cuda.cpp create mode 100644 utils/pointops2/src/grouping/grouping_cuda_kernel.cu create mode 100644 utils/pointops2/src/grouping/grouping_cuda_kernel.h create mode 100644 utils/pointops2/src/interpolation/interpolation_cuda.cpp create mode 100644 utils/pointops2/src/interpolation/interpolation_cuda_kernel.cu create mode 100644 utils/pointops2/src/interpolation/interpolation_cuda_kernel.h create mode 100644 utils/pointops2/src/knnquery/knnquery_cuda.cpp create mode 100644 
utils/pointops2/src/knnquery/knnquery_cuda_kernel.cu create mode 100644 utils/pointops2/src/knnquery/knnquery_cuda_kernel.h create mode 100644 utils/pointops2/src/pointops_api.cpp create mode 100644 utils/pointops2/src/rpe/relative_pos_encoding_cuda.cpp create mode 100644 utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu create mode 100644 utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h create mode 100644 utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu create mode 100644 utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h create mode 100644 utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_v2.cpp create mode 100644 utils/pointops2/src/sampling/sampling_cuda.cpp create mode 100644 utils/pointops2/src/sampling/sampling_cuda_kernel.cu create mode 100644 utils/pointops2/src/sampling/sampling_cuda_kernel.h create mode 100644 utils/pointops2/src/subtraction/subtraction_cuda.cpp create mode 100644 utils/pointops2/src/subtraction/subtraction_cuda_kernel.cu create mode 100644 utils/pointops2/src/subtraction/subtraction_cuda_kernel.h create mode 100644 utils/utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d578018 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +/saved +/logs +/data +/.vscode +.python-version +__pycache__/ +*.out \ No newline at end of file diff --git a/conf/augmentation/volumentations_aug.yaml b/conf/augmentation/volumentations_aug.yaml new file mode 100644 index 0000000..5a79185 --- /dev/null +++ b/conf/augmentation/volumentations_aug.yaml @@ -0,0 +1,53 @@ +# pi = 3.14159265358979 +# pi/2 = 1.57079632679489 +# pi/3 = 1.04719755119659 +# pi/6 = 0.52359877559829 +# pi/12 = 0.26179938779914 +# pi/24 = 0.13089969389957 + +__version__: 0.1.6 +transform: + __class_fullname__: volumentations.core.composition.Compose + additional_targets: {} + p: 1.0 + transforms: + - __class_fullname__: volumentations.augmentations.transforms.Scale3d + always_apply: true + p: 0.5 + scale_limit: + - - -0.1 + - 0.1 + - - -0.1 + - 0.1 + - - -0.1 + - 0.1 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 0 + - 0 + - 1 + p: 0.5 + rotation_limit: + - -3.141592653589793 + - 3.141592653589793 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 0 + - 1 + - 0 + p: 0.5 + rotation_limit: + - -0.13089969389957 + - 0.13089969389957 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 1 + - 0 + - 0 + p: 0.5 + rotation_limit: + - -0.13089969389957 + - 0.13089969389957 diff --git a/conf/callbacks/callbacks_panoptic.yaml b/conf/callbacks/callbacks_panoptic.yaml new file mode 100644 index 0000000..c5011ec --- /dev/null +++ b/conf/callbacks/callbacks_panoptic.yaml @@ -0,0 +1,11 @@ +# @package _group_ +- _target_: pytorch_lightning.callbacks.ModelCheckpoint + monitor: val_mean_lstq + save_last: true + save_top_k: 1 + mode: max + dirpath: ${general.save_dir} + filename: "{epoch}-{val_mean_lstq:.3f}" + every_n_epochs: 1 + +- _target_: pytorch_lightning.callbacks.LearningRateMonitor diff --git a/conf/config_panoptic_4d.yaml b/conf/config_panoptic_4d.yaml new file mode 100644 index 0000000..382aed3 --- /dev/null +++ b/conf/config_panoptic_4d.yaml @@ -0,0 +1,35 @@ +general: + mode: "train" + seed: null + ckpt_path: null + project_name: mask4d + workspace: kadiryilmaz + instance_population: 10 + dbscan_eps: null + data_percent: 1.0 + experiment_name: 
${now:%Y-%m-%d_%H%M%S}
+  save_dir: saved/${general.experiment_name}
+  gpus: 1
+
+defaults:
+  - data: kitti
+  - data/data_loaders: simple_loader
+  - data/datasets: semantic_kitti
+  - data/collation_functions: voxelize_collate
+  - logging: full
+  - model: mask4d
+  - optimizer: adamw
+  - scheduler: onecyclelr
+  - trainer: trainer30
+  - callbacks: callbacks_panoptic
+  - matcher: hungarian_matcher
+  - loss: set_criterion
+  - metric: lstq
+
+hydra:
+  run:
+    dir: saved/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: saved/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    # dir: ${general.save_dir}
+    subdir: ${hydra.job.num}_${hydra.job.id}
diff --git a/conf/data/collation_functions/voxelize_collate.yaml b/conf/data/collation_functions/voxelize_collate.yaml
new file mode 100644
index 0000000..7302146
--- /dev/null
+++ b/conf/data/collation_functions/voxelize_collate.yaml
@@ -0,0 +1,16 @@
+# @package data
+
+train_collation:
+  _target_: datasets.utils.VoxelizeCollate
+  ignore_label: ${data.ignore_label}
+  voxel_size: ${data.voxel_size}
+
+validation_collation:
+  _target_: datasets.utils.VoxelizeCollate
+  ignore_label: ${data.ignore_label}
+  voxel_size: ${data.voxel_size}
+
+test_collation:
+  _target_: datasets.utils.VoxelizeCollate
+  ignore_label: ${data.ignore_label}
+  voxel_size: ${data.voxel_size}
diff --git a/conf/data/data_loaders/simple_loader.yaml b/conf/data/data_loaders/simple_loader.yaml
new file mode 100644
index 0000000..39996e1
--- /dev/null
+++ b/conf/data/data_loaders/simple_loader.yaml
@@ -0,0 +1,22 @@
+# @package data
+
+train_dataloader:
+  _target_: torch.utils.data.DataLoader
+  shuffle: true
+  pin_memory: ${data.pin_memory}
+  num_workers: ${data.num_workers}
+  batch_size: ${data.batch_size}
+
+validation_dataloader:
+  _target_: torch.utils.data.DataLoader
+  shuffle: false
+  pin_memory: ${data.pin_memory}
+  num_workers: ${data.num_workers}
+  batch_size: ${data.test_batch_size}
+
+test_dataloader:
+  _target_: torch.utils.data.DataLoader
+  shuffle: false
+  pin_memory: ${data.pin_memory}
+  num_workers: ${data.num_workers}
+  batch_size: ${data.test_batch_size}
diff --git a/conf/data/datasets/semantic_kitti.yaml b/conf/data/datasets/semantic_kitti.yaml
new file mode 100644
index 0000000..4a94622
--- /dev/null
+++ b/conf/data/datasets/semantic_kitti.yaml
@@ -0,0 +1,33 @@
+# @package data
+train_dataset:
+  _target_: datasets.lidar.LidarDataset
+  data_dir: data/semantic_kitti
+  mode: ${data.train_mode}
+  add_distance: ${data.add_distance}
+  sweep: ${data.sweep}
+  instance_population: ${data.instance_population}
+  data_percent: ${data.data_percent}
+  ignore_label: ${data.ignore_label}
+  volume_augmentations_path: conf/augmentation/volumentations_aug.yaml
+
+validation_dataset:
+  _target_: datasets.lidar.LidarDataset
+  data_dir: data/semantic_kitti
+  mode: ${data.validation_mode}
+  add_distance: ${data.add_distance}
+  sweep: ${data.sweep}
+  instance_population: 0
+  data_percent: 1.0
+  ignore_label: ${data.ignore_label}
+  volume_augmentations_path: null
+
+test_dataset:
+  _target_: datasets.lidar.LidarDataset
+  data_dir: data/semantic_kitti
+  mode: ${data.test_mode}
+  add_distance: ${data.add_distance}
+  sweep: ${data.sweep}
+  instance_population: 0
+  data_percent: 1.0
+  ignore_label: ${data.ignore_label}
+  volume_augmentations_path: null
diff --git a/conf/data/kitti.yaml b/conf/data/kitti.yaml
new file mode 100644
index 0000000..d3d79f6
--- /dev/null
+++ b/conf/data/kitti.yaml
@@ -0,0 +1,32 @@
+# @package _group_
+
+# these parameters are inherited by datasets, data_loaders and collators
+# but they might be overwritten
+
+# splits
+train_mode: train
+validation_mode: validation
+test_mode: test
+
+# dataset
+ignore_label: 255
+add_distance: true
+in_channels: 2
+num_labels: 19
+instance_population: ${general.instance_population}
+data_percent: ${general.data_percent}
+sweep: 2
+min_stuff_cls_id: 9
+min_points: 50
+class_names: ['car', 'bicycle', 'motorcycle', 'truck', 'other-vehicle', 'person', 'bicyclist',
+'motorcyclist', 'road', 'parking', 'sidewalk', 'other-ground', 'building', 'fence', 'vegetation',
+'trunk', 'terrain', 'pole', 'traffic-sign']
+
+# data loader
+pin_memory: true
+num_workers: 4
+batch_size: 4
+test_batch_size: 2
+
+# collation
+voxel_size: 0.05
diff --git a/conf/logging/full.yaml b/conf/logging/full.yaml
new file mode 100644
index 0000000..970b2da
--- /dev/null
+++ b/conf/logging/full.yaml
@@ -0,0 +1,7 @@
+# @package _group_
+- _target_: pytorch_lightning.loggers.WandbLogger
+  project: ${general.project_name}
+  name: ${general.experiment_name}
+  save_dir: ${general.save_dir}
+  entity: "kadiryilmaz193"
+  id: ${general.experiment_name}
diff --git a/conf/loss/set_criterion.yaml b/conf/loss/set_criterion.yaml
new file mode 100644
index 0000000..61f4fae
--- /dev/null
+++ b/conf/loss/set_criterion.yaml
@@ -0,0 +1,8 @@
+# @package _group_
+_target_: models.criterion.SetCriterion
+num_classes: ${data.num_labels}
+eos_coef: 0.1
+losses:
+  - "labels"
+  - "masks"
+  - "bboxs"
diff --git a/conf/matcher/hungarian_matcher.yaml b/conf/matcher/hungarian_matcher.yaml
new file mode 100644
index 0000000..e8e17a1
--- /dev/null
+++ b/conf/matcher/hungarian_matcher.yaml
@@ -0,0 +1,6 @@
+# @package _group_
+_target_: models.matcher.HungarianMatcher
+cost_class: 2.
+cost_mask: 5.
+cost_dice: 2.
+cost_box: 5.
diff --git a/conf/metric/lstq.yaml b/conf/metric/lstq.yaml
new file mode 100644
index 0000000..8b8456c
--- /dev/null
+++ b/conf/metric/lstq.yaml
@@ -0,0 +1,5 @@
+# @package _group_
+_target_: models.metrics.Panoptic4DEval
+n_classes: ${data.num_labels}
+min_stuff_cls_id: ${data.min_stuff_cls_id}
+min_points: ${data.min_points}
\ No newline at end of file
diff --git a/conf/model/mask4d.yaml b/conf/model/mask4d.yaml
new file mode 100644
index 0000000..a1836ae
--- /dev/null
+++ b/conf/model/mask4d.yaml
@@ -0,0 +1,22 @@
+# @package _group_
+_target_: models.Mask4D
+
+# backbone
+backbone:
+  _target_: models.Res16UNet34C
+  config:
+    dialations: [ 1, 1, 1, 1 ]
+    conv1_kernel_size: 5
+    bn_momentum: 0.02
+  in_channels: ${data.in_channels}
+  out_channels: ${data.num_labels}
+
+# transformer parameters
+num_queries: 100
+num_heads: 8
+num_decoders: 3
+num_levels: 4
+sample_sizes: [4000, 8000, 16000, 32000]
+mask_dim: 128
+dim_feedforward: 1024
+num_labels: ${data.num_labels}
diff --git a/conf/optimizer/adamw.yaml b/conf/optimizer/adamw.yaml
new file mode 100644
index 0000000..e2e8c3e
--- /dev/null
+++ b/conf/optimizer/adamw.yaml
@@ -0,0 +1,3 @@
+# @package _group_
+_target_: torch.optim.AdamW
+lr: 0.0002
\ No newline at end of file
diff --git a/conf/scheduler/onecyclelr.yaml b/conf/scheduler/onecyclelr.yaml
new file mode 100644
index 0000000..0d992cb
--- /dev/null
+++ b/conf/scheduler/onecyclelr.yaml
@@ -0,0 +1,10 @@
+# @package _group_
+scheduler:
+  _target_: torch.optim.lr_scheduler.OneCycleLR
+  max_lr: ${optimizer.lr}
+  epochs: ${trainer.max_epochs}
+  # need to set to number because of tensorboard logger
+  steps_per_epoch: -1
+
+pytorch_lightning_params:
+  interval: step
diff --git a/conf/semantic-kitti.yaml b/conf/semantic-kitti.yaml
new file mode 100644
index 
0000000..6281065 --- /dev/null +++ b/conf/semantic-kitti.yaml @@ -0,0 +1,211 @@ +# This file is covered by the LICENSE file in the root of this project. +labels: + 0 : "unlabeled" + 1 : "outlier" + 10: "car" + 11: "bicycle" + 13: "bus" + 15: "motorcycle" + 16: "on-rails" + 18: "truck" + 20: "other-vehicle" + 30: "person" + 31: "bicyclist" + 32: "motorcyclist" + 40: "road" + 44: "parking" + 48: "sidewalk" + 49: "other-ground" + 50: "building" + 51: "fence" + 52: "other-structure" + 60: "lane-marking" + 70: "vegetation" + 71: "trunk" + 72: "terrain" + 80: "pole" + 81: "traffic-sign" + 99: "other-object" + 252: "moving-car" + 253: "moving-bicyclist" + 254: "moving-person" + 255: "moving-motorcyclist" + 256: "moving-on-rails" + 257: "moving-bus" + 258: "moving-truck" + 259: "moving-other-vehicle" +color_map: # bgr + 0 : [0, 0, 0] + 1 : [0, 0, 255] + 10: [245, 150, 100] + 11: [245, 230, 100] + 13: [250, 80, 100] + 15: [150, 60, 30] + 16: [255, 0, 0] + 18: [180, 30, 80] + 20: [255, 0, 0] + 30: [30, 30, 255] + 31: [200, 40, 255] + 32: [90, 30, 150] + 40: [255, 0, 255] + 44: [255, 150, 255] + 48: [75, 0, 75] + 49: [75, 0, 175] + 50: [0, 200, 255] + 51: [50, 120, 255] + 52: [0, 150, 255] + 60: [170, 255, 150] + 70: [0, 175, 0] + 71: [0, 60, 135] + 72: [80, 240, 150] + 80: [150, 240, 255] + 81: [0, 0, 255] + 99: [255, 255, 50] + 252: [245, 150, 100] + 256: [255, 0, 0] + 253: [200, 40, 255] + 254: [30, 30, 255] + 255: [90, 30, 150] + 257: [250, 80, 100] + 258: [180, 30, 80] + 259: [255, 0, 0] +content: # as a ratio with the total number of points + 0: 0.018889854628292943 + 1: 0.0002937197336781505 + 10: 0.040818519255974316 + 11: 0.00016609538710764618 + 13: 2.7879693665067774e-05 + 15: 0.00039838616015114444 + 16: 0.0 + 18: 0.0020633612104619787 + 20: 0.0016218197275284021 + 30: 0.00017698551338515307 + 31: 1.1065903904919655e-08 + 32: 5.532951952459828e-09 + 40: 0.1987493871255525 + 44: 0.014717169549888214 + 48: 0.14392298360372 + 49: 0.0039048553037472045 + 50: 0.1326861944777486 + 51: 0.0723592229456223 + 52: 0.002395131480328884 + 60: 4.7084144280367186e-05 + 70: 0.26681502148037506 + 71: 0.006035012012626033 + 72: 0.07814222006271769 + 80: 0.002855498193863172 + 81: 0.0006155958086189918 + 99: 0.009923127583046915 + 252: 0.001789309418528068 + 253: 0.00012709999297008662 + 254: 0.00016059776092534436 + 255: 3.745553104802113e-05 + 256: 0.0 + 257: 0.00011351574470342043 + 258: 0.00010157861367183268 + 259: 4.3840131989471124e-05 +# classes that are indistinguishable from single scan or inconsistent in +# ground truth are mapped to their closest equivalent +learning_map: + 0 : 0 # "unlabeled" + 1 : 0 # "outlier" mapped to "unlabeled" --------------------------mapped + 10: 1 # "car" + 11: 2 # "bicycle" + 13: 5 # "bus" mapped to "other-vehicle" --------------------------mapped + 15: 3 # "motorcycle" + 16: 5 # "on-rails" mapped to "other-vehicle" ---------------------mapped + 18: 4 # "truck" + 20: 5 # "other-vehicle" + 30: 6 # "person" + 31: 7 # "bicyclist" + 32: 8 # "motorcyclist" + 40: 9 # "road" + 44: 10 # "parking" + 48: 11 # "sidewalk" + 49: 12 # "other-ground" + 50: 13 # "building" + 51: 14 # "fence" + 52: 0 # "other-structure" mapped to "unlabeled" ------------------mapped + 60: 9 # "lane-marking" to "road" ---------------------------------mapped + 70: 15 # "vegetation" + 71: 16 # "trunk" + 72: 17 # "terrain" + 80: 18 # "pole" + 81: 19 # "traffic-sign" + 99: 0 # "other-object" to "unlabeled" ----------------------------mapped + 252: 1 # "moving-car" to "car" 
------------------------------------mapped + 253: 7 # "moving-bicyclist" to "bicyclist" ------------------------mapped + 254: 6 # "moving-person" to "person" ------------------------------mapped + 255: 8 # "moving-motorcyclist" to "motorcyclist" ------------------mapped + 256: 5 # "moving-on-rails" mapped to "other-vehicle" --------------mapped + 257: 5 # "moving-bus" mapped to "other-vehicle" -------------------mapped + 258: 4 # "moving-truck" to "truck" --------------------------------mapped + 259: 5 # "moving-other"-vehicle to "other-vehicle" ----------------mapped +learning_map_inv: # inverse of previous map + 0: 0 # "unlabeled", and others ignored + 1: 10 # "car" + 2: 11 # "bicycle" + 3: 15 # "motorcycle" + 4: 18 # "truck" + 5: 20 # "other-vehicle" + 6: 30 # "person" + 7: 31 # "bicyclist" + 8: 32 # "motorcyclist" + 9: 40 # "road" + 10: 44 # "parking" + 11: 48 # "sidewalk" + 12: 49 # "other-ground" + 13: 50 # "building" + 14: 51 # "fence" + 15: 70 # "vegetation" + 16: 71 # "trunk" + 17: 72 # "terrain" + 18: 80 # "pole" + 19: 81 # "traffic-sign" +learning_ignore: # Ignore classes + 0: True # "unlabeled", and others ignored + 1: False # "car" + 2: False # "bicycle" + 3: False # "motorcycle" + 4: False # "truck" + 5: False # "other-vehicle" + 6: False # "person" + 7: False # "bicyclist" + 8: False # "motorcyclist" + 9: False # "road" + 10: False # "parking" + 11: False # "sidewalk" + 12: False # "other-ground" + 13: False # "building" + 14: False # "fence" + 15: False # "vegetation" + 16: False # "trunk" + 17: False # "terrain" + 18: False # "pole" + 19: False # "traffic-sign" +split: # sequence numbers + train: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 9 + - 10 + valid: + - 8 + test: + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 diff --git a/conf/trainer/trainer30.yaml b/conf/trainer/trainer30.yaml new file mode 100644 index 0000000..80c10be --- /dev/null +++ b/conf/trainer/trainer30.yaml @@ -0,0 +1,4 @@ +# @package _group_ +max_epochs: 30 +check_val_every_n_epoch: 5 +num_sanity_val_steps: 2 diff --git a/datasets/lidar.py b/datasets/lidar.py new file mode 100644 index 0000000..c09ddaa --- /dev/null +++ b/datasets/lidar.py @@ -0,0 +1,201 @@ +import numpy as np +import volumentations as V +import yaml +from torch.utils.data import Dataset +from pathlib import Path +from typing import List, Optional, Union +from random import random, choice, uniform + + +class LidarDataset(Dataset): + def __init__( + self, + data_dir: Optional[str] = "data/processed/semantic_kitti", + mode: Optional[str] = "train", + add_distance: Optional[bool] = False, + data_percent: Optional[float] = 1.0, + ignore_label: Optional[Union[int, List[int]]] = 255, + volume_augmentations_path: Optional[str] = None, + instance_population: Optional[int] = 0, + sweep: Optional[int] = 1, + ): + self.mode = mode + self.data_dir = data_dir + self.ignore_label = ignore_label + self.add_distance = add_distance + self.instance_population = instance_population + self.sweep = sweep + self.config = self._load_yaml("conf/semantic-kitti.yaml") + + # loading database file + database_path = Path(self.data_dir) + if not (database_path / f"{mode}_database.yaml").exists(): + print(f"generate {database_path}/{mode}_database.yaml first") + exit() + self.data = self._load_yaml(database_path / f"{mode}_database.yaml") + + self.label_info = self._select_correct_labels(self.config["learning_ignore"]) + # augmentations + self.volume_augmentations = V.NoOp() + if volume_augmentations_path is not None: + 
self.volume_augmentations = V.load( + volume_augmentations_path, data_format="yaml" + ) + # reformulating in sweeps + data = [[]] + last_scene = self.data[0]["scene"] + for x in self.data: + if x["scene"] == last_scene: + data[-1].append(x) + else: + last_scene = x["scene"] + data.append([x]) + for i in range(len(data)): + data[i] = list(self.chunks(data[i], sweep)) + self.data = [val for sublist in data for val in sublist] + + if instance_population > 0: + self.instance_data = self._load_yaml(database_path / f"{mode}_instances_database.yaml") + + if data_percent < 1.0: + self.data = self.data[: int(len(self.data) * data_percent)] + + def chunks(self, lst, n): + if "train" in self.mode or n==1: + for i in range(len(lst) - n + 1): + yield lst[i : i + n] + else: + for i in range(0, len(lst) - n + 1, n-1): + yield lst[i : i + n] + if i != len(lst) - n: + yield lst[i + n - 1: ] + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx: int): + coordinates_list = [] + features_list = [] + labels_list = [] + acc_num_points = [0] + for time, scan in enumerate(self.data[idx]): + points = np.fromfile(scan["filepath"], dtype=np.float32).reshape(-1, 4) + coordinates = points[:, :3] + # rotate and translate + pose = np.array(scan["pose"]).T + coordinates = coordinates @ pose[:3, :3] + pose[3, :3] + coordinates_list.append(coordinates) + acc_num_points.append(acc_num_points[-1] + len(coordinates)) + features = points[:, 3:4] + time_array = np.ones((features.shape[0], 1)) * time + features = np.hstack((time_array, features)) + features_list.append(features) + if "test" in self.mode: + labels = np.zeros_like(features).astype(np.int64) + labels_list.append(labels) + else: + panoptic_label = np.fromfile(scan["label_filepath"], dtype=np.uint32) + semantic_label = panoptic_label & 0xFFFF + semantic_label = np.vectorize(self.config["learning_map"].__getitem__)(semantic_label) + labels = np.hstack((semantic_label[:, None], panoptic_label[:, None])) + labels_list.append(labels) + + coordinates = np.vstack(coordinates_list) + features = np.vstack(features_list) + labels = np.vstack(labels_list) + + if "train" in self.mode and self.instance_population > 0: + max_instance_id = np.amax(labels[:, 1]) + pc_center = coordinates.mean(axis=0) + instance_c, instance_f, instance_l = self.populate_instances( + max_instance_id, pc_center, self.instance_population) + coordinates = np.vstack((coordinates, instance_c)) + features = np.vstack((features, instance_f)) + labels = np.vstack((labels, instance_l)) + + if self.add_distance: + center_coordinate = coordinates.mean(0) + features = np.hstack( + ( + features, + np.linalg.norm(coordinates - center_coordinate, axis=1)[ + :, np.newaxis + ], + ) + ) + + # volume and image augmentations for train + if "train" in self.mode: + coordinates -= coordinates.mean(0) + if 0.5 > random(): + coordinates += ( + np.random.uniform(coordinates.min(0), coordinates.max(0)) / 2 + ) + aug = self.volume_augmentations(points=coordinates) + coordinates = aug["points"] + + features = np.hstack((coordinates, features)) + + labels[:, 0] = np.vectorize(self.label_info.__getitem__)(labels[:, 0]) + + return { + "num_points": acc_num_points, + "coordinates": coordinates, + "features": features, + "labels": labels, + "sequence": scan["scene"] + } + + @staticmethod + def _load_yaml(filepath): + with open(filepath) as f: + file = yaml.safe_load(f) + return file + + def _select_correct_labels(self, learning_ignore): + count = 0 + label_info = dict() + for k, v, in learning_ignore.items(): + 
if v: + label_info[k] = self.ignore_label + else: + label_info[k] = count + count += 1 + return label_info + + def _remap_model_output(self, output): + inv_map = {v: k for k, v in self.label_info.items()} + output = np.vectorize(inv_map.__getitem__)(output) + return output + + def populate_instances(self, max_instance_id, pc_center, instance_population): + coordinates_list = [] + features_list = [] + labels_list = [] + for _ in range(instance_population): + instance_dict = choice(self.instance_data) + idx = np.random.randint(len(instance_dict["filepaths"])) + instance_list = [] + for time in range(self.sweep): + if idx < len(instance_dict["filepaths"]): + filepath = instance_dict["filepaths"][idx] + instance = np.load(filepath) + time_array = np.ones((instance.shape[0], 1)) * time + instance = np.hstack((instance[:, :3], time_array , instance[:, 3:4])) + instance_list.append(instance) + idx = idx + 1 + instances = np.vstack(instance_list) + coordinates = instances[:, :3] - instances[:, :3].mean(0) + coordinates += pc_center + np.array([uniform(-10, 10), uniform(-10, 10), uniform(-1, 1)]) + features = instances[:, 3:] + semantic_label = instance_dict["semantic_label"] + labels = np.zeros_like(features, dtype=np.int64) + labels[:, 0] = semantic_label + max_instance_id = max_instance_id + 1 + labels[:, 1] = max_instance_id + aug = self.volume_augmentations(points=coordinates) + coordinates = aug["points"] + coordinates_list.append(coordinates) + features_list.append(features) + labels_list.append(labels) + return np.vstack(coordinates_list), np.vstack(features_list), np.vstack(labels_list) diff --git a/datasets/preprocessing/semantic_kitti_preprocessing.py b/datasets/preprocessing/semantic_kitti_preprocessing.py new file mode 100644 index 0000000..bd4258a --- /dev/null +++ b/datasets/preprocessing/semantic_kitti_preprocessing.py @@ -0,0 +1,253 @@ +import re +import numpy as np +import yaml +from pathlib import Path +from natsort import natsorted +from loguru import logger +from tqdm import tqdm +from fire import Fire + + +class SemanticKittiPreprocessing: + def __init__( + self, + data_dir: str = "/globalwork/data/SemanticKITTI/dataset", + save_dir: str = "/globalwork/yilmaz/data/processed/semantic_kitti", + modes: tuple = ("train", "validation", "test"), + git_repo: str = "./third_party/semantic-kitti-api", + ): + self.data_dir = Path(data_dir) + self.save_dir = Path(save_dir) + self.modes = modes + git_repo = Path(git_repo) + + if not self.data_dir.exists(): + logger.error("Data folder doesn't exist") + raise FileNotFoundError + if self.save_dir.exists() is False: + self.save_dir.mkdir(parents=True, exist_ok=True) + + self.files = {} + for data_type in self.modes: + self.files.update({data_type: []}) + + self.config = self._load_yaml(git_repo / "config" / "semantic-kitti.yaml") + self.create_label_database(git_repo / "config" / "semantic-kitti.yaml") + self.pose = dict() + + for mode in self.modes: + scene_mode = "valid" if mode == "validation" else mode + self.pose[mode] = dict() + for scene in sorted(self.config["split"][scene_mode]): + filepaths = list(self.data_dir.glob(f"*/{scene:02}/velodyne/*bin")) + filepaths = [str(file) for file in filepaths] + self.files[mode].extend(natsorted(filepaths)) + calibration = parse_calibration( + Path(filepaths[0]).parent.parent / "calib.txt" + ) + self.pose[mode].update( + { + scene: parse_poses( + Path(filepaths[0]).parent.parent / "poses.txt", calibration, + ), + } + ) + + def preprocess(self): + for mode in self.modes: + database = [] + for 
filepath in tqdm(self.files[mode], unit="file"): + filebase = self.process_file(filepath, mode) + database.append(filebase) + self.save_database(database, mode) + self.joint_database() + + def make_instance_database(self): + train_database = self._load_yaml(self.save_dir / "train_database.yaml") + instance_database = {} + for sample in tqdm(train_database): + instances = self.extract_instance_from_file(sample) + for instance in instances: + scene = instance["scene"] + panoptic_label = instance["panoptic_label"] + unique_identifier = f"{scene}_{panoptic_label}" + if unique_identifier in instance_database: + instance_database[unique_identifier]["filepaths"].append(instance["instance_filepath"]) + else: + instance_database[unique_identifier] = { + "semantic_label": instance["semantic_label"], + "filepaths": [instance["instance_filepath"]] + } + self.save_database(list(instance_database.values()), "train_instances") + + validation_database = self._load_yaml(self.save_dir / "validation_database.yaml") + for sample in tqdm(validation_database): + instances = self.extract_instance_from_file(sample) + for instance in instances: + scene = instance["scene"] + panoptic_label = instance["panoptic_label"] + unique_identifier = f"{scene}_{panoptic_label}" + if unique_identifier in instance_database: + instance_database[unique_identifier]["filepaths"].append(instance["instance_filepath"]) + else: + instance_database[unique_identifier] = { + "semantic_label": instance["semantic_label"], + "filepaths": [instance["instance_filepath"]] + } + self.save_database(list(instance_database.values()), "trainval_instances") + + def extract_instance_from_file(self, sample): + points = np.fromfile(sample["filepath"], dtype=np.float32).reshape(-1, 4) + pose = np.array(sample["pose"]).T + points[:, :3] = points[:, :3] @ pose[:3, :3] + pose[3, :3] + label = np.fromfile(sample["label_filepath"], dtype=np.uint32) + scene, sub_scene = re.search(r"(\d{2}).*(\d{6})", sample["filepath"]).group(1, 2) + file_instances = [] + for panoptic_label in np.unique(label): + semantic_label = panoptic_label & 0xFFFF + semantic_label = np.vectorize(self.config["learning_map"].__getitem__)(semantic_label) + if np.isin(semantic_label, range(1,9)): + instance_mask = label == panoptic_label + instance_points = points[instance_mask, :] + filename = f"{scene}_{panoptic_label:010d}_{sub_scene}.npy" + instance_filepath = self.save_dir / "instances" / filename + instance = { + "scene": scene, + "sub_scene": sub_scene, + "panoptic_label": f"{panoptic_label:010d}", + "instance_filepath": str(instance_filepath), + "semantic_label": semantic_label.item(), + } + if not instance_filepath.parent.exists(): + instance_filepath.parent.mkdir(parents=True, exist_ok=True) + np.save(instance_filepath, instance_points.astype(np.float32)) + file_instances.append(instance) + return file_instances + + def save_database(self, database, mode): + for element in database: + self._dict_to_yaml(element) + self._save_yaml(self.save_dir / (mode + "_database.yaml"), database) + + def joint_database(self, train_modes=["train", "validation"]): + joint_db = [] + for mode in train_modes: + joint_db.extend(self._load_yaml(self.save_dir / (mode + "_database.yaml"))) + self._save_yaml(self.save_dir / "trainval_database.yaml", joint_db) + + @classmethod + def _save_yaml(cls, path, file): + with open(path, "w") as f: + yaml.safe_dump(file, f, default_style=None, default_flow_style=False) + + @classmethod + def _dict_to_yaml(cls, dictionary): + if not isinstance(dictionary, dict): + 
return + for k, v in dictionary.items(): + if isinstance(v, dict): + cls._dict_to_yaml(v) + if isinstance(v, np.ndarray): + dictionary[k] = v.tolist() + if isinstance(v, Path): + dictionary[k] = str(v) + + @classmethod + def _load_yaml(cls, filepath): + with open(filepath) as f: + file = yaml.safe_load(f) + return file + + def create_label_database(self, config_file): + if (self.save_dir / "label_database.yaml").exists(): + return self._load_yaml(self.save_dir / "label_database.yaml") + config = self._load_yaml(config_file) + label_database = {} + for key, old_key in config["learning_map_inv"].items(): + label_database.update( + { + key: { + "name": config["labels"][old_key], + "color": config["color_map"][old_key][::-1], + "validation": not config["learning_ignore"][key], + } + } + ) + + self._save_yaml(self.save_dir / "label_database.yaml", label_database) + return label_database + + def process_file(self, filepath, mode): + scene, sub_scene = re.search(r"(\d{2}).*(\d{6})", filepath).group(1, 2) + sample = { + "filepath": filepath, + "scene": int(scene), + "pose": self.pose[mode][int(scene)][int(sub_scene)].tolist(), + } + + if mode in ["train", "validation"]: + # getting label info + label_filepath = filepath.replace("velodyne", "labels").replace( + "bin", "label" + ) + sample["label_filepath"] = label_filepath + return sample + +def parse_calibration(filename): + """ read calibration file with given filename + Returns + ------- + dict + Calibration matrices as 4x4 numpy arrays. + """ + calib = {} + + with open(filename) as calib_file: + for line in calib_file: + key, content = line.strip().split(":") + values = [float(v) for v in content.strip().split()] + + pose = np.zeros((4, 4)) + pose[0, 0:4] = values[0:4] + pose[1, 0:4] = values[4:8] + pose[2, 0:4] = values[8:12] + pose[3, 3] = 1.0 + + calib[key] = pose + return calib + +def parse_poses(filename, calibration): + """ read poses file with per-scan poses from given filename + Returns + ------- + list + list of poses as 4x4 numpy arrays. 
+ """ + + poses = [] + + Tr = calibration["Tr"] + Tr_inv = np.linalg.inv(Tr) + + with open(filename) as file: + for line in file: + values = [float(v) for v in line.strip().split()] + + pose = np.zeros((4, 4)) + pose[0, 0:4] = values[0:4] + pose[1, 0:4] = values[4:8] + pose[2, 0:4] = values[8:12] + pose[3, 3] = 1.0 + + poses.append(np.matmul(Tr_inv, np.matmul(pose, Tr))) + + return poses + +if __name__ == "__main__": + Fire(SemanticKittiPreprocessing) + +# if __name__ == "__main__": + # preprocess_cls = SemanticKittiPreprocessing() + # preprocess_cls.preprocess_sequential() + # preprocess_cls.make_instance_database_sequential() + \ No newline at end of file diff --git a/datasets/utils.py b/datasets/utils.py new file mode 100644 index 0000000..4bc67a1 --- /dev/null +++ b/datasets/utils.py @@ -0,0 +1,110 @@ +import MinkowskiEngine as ME +import numpy as np +import torch + + +class VoxelizeCollate: + def __init__( + self, + ignore_label=255, + voxel_size=1, + ): + self.voxel_size = voxel_size + self.ignore_label = ignore_label + + def __call__(self, batch): + (coordinates, features, labels, original_labels, inverse_maps, num_points, sequences) = ( + [], + [], + [], + [], + [], + [], + [] + ) + + for sample in batch: + original_labels.append(sample["labels"]) + num_points.append(sample["num_points"]) + sequences.append(sample["sequence"]) + sample_c, sample_f, sample_l, inverse_map = voxelize(sample["coordinates"], + sample["features"], + sample["labels"], + self.voxel_size) + inverse_maps.append(inverse_map) + coordinates.append(sample_c) + features.append(sample_f) + labels.append(sample_l) + + # Concatenate all lists + target = generate_target(features, labels, self.ignore_label) + coordinates, features = ME.utils.sparse_collate(coordinates, features) + raw_coordinates = features[:, :4] + features = features[:, 4:] + + return (NoGpu(coordinates, features, raw_coordinates, original_labels, inverse_maps, num_points, sequences), target) + +def voxelize(coordinates, features, labels, voxel_size): + if coordinates.shape[1] == 4: + voxel_size = np.array([voxel_size, voxel_size, voxel_size, 1]) + sample_c, sample_f, unique_map, inverse_map = ME.utils.sparse_quantize(coordinates=coordinates, + features=features, + return_index=True, + return_inverse=True, + quantization_size=voxel_size) + sample_c = sample_c + sample_f = torch.from_numpy(sample_f).float() + sample_l = torch.from_numpy(labels[unique_map]) + return sample_c, sample_f, sample_l, inverse_map + +def generate_target(features, labels, ignore_label): + target = [] + + for feat, lb in zip(features, labels): + raw_coords = feat[:, :3] + raw_coords = (raw_coords - raw_coords.min(0)[0]) / (raw_coords.max(0)[0]-raw_coords.min(0)[0]) + mask_labels = [] + binary_masks = [] + bboxs = [] + + panoptic_labels = lb[:, 1].unique() + for panoptic_label in panoptic_labels: + mask = lb[:, 1] == panoptic_label + + if panoptic_label == 0: + continue + + sem_labels = lb[mask, 0] + if sem_labels[0] != ignore_label: + mask_labels.append(sem_labels[0]) + binary_masks.append(mask) + mask_coords = raw_coords[mask, :] + bboxs.append(torch.hstack(( + mask_coords.mean(0), + mask_coords.max(0)[0]-mask_coords.min(0)[0], + ))) + + if len(mask_labels) != 0: + mask_labels = torch.stack(mask_labels) + binary_masks = torch.stack(binary_masks) + bboxs = torch.stack(bboxs) + target.append({ + 'labels': mask_labels, + 'masks': binary_masks, + 'bboxs': bboxs + }) + + return target + + +class NoGpu: + def __init__(self, coordinates, features, raw_coordinates, 
original_labels=None, inverse_maps=None, num_points=None,
+                 sequences=None):
+        """ helper class to prevent gpu loading on lightning """
+        self.coordinates = coordinates
+        self.features = features
+        self.raw_coordinates = raw_coordinates
+        self.original_labels = original_labels
+        self.inverse_maps = inverse_maps
+        self.num_points = num_points
+        self.sequences = sequences
\ No newline at end of file
diff --git a/main_panoptic.py b/main_panoptic.py
new file mode 100644
index 0000000..fd63c53
--- /dev/null
+++ b/main_panoptic.py
@@ -0,0 +1,101 @@
+import logging
+import os
+import hydra
+import torch
+from dotenv import load_dotenv
+from omegaconf import DictConfig, OmegaConf
+from trainer.trainer import PanopticSegmentation
+from utils.utils import flatten_dict, RegularCheckpointing
+from pytorch_lightning import Trainer, seed_everything
+
+
+def get_parameters(cfg: DictConfig):
+    logger = logging.getLogger(__name__)
+    load_dotenv(".env")
+
+    # parsing input parameters
+    seed_everything(cfg.general.seed)
+
+    # getting basic configuration
+    if cfg.general.get("gpus", None) is None:
+        cfg.general.gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    loggers = []
+
+    if not os.path.exists(cfg.general.save_dir):
+        os.makedirs(cfg.general.save_dir)
+    else:
+        print("EXPERIMENT ALREADY EXIST")
+        cfg.general.ckpt_path = f"{cfg.general.save_dir}/last-epoch.ckpt"
+
+    for log in cfg.logging:
+        print(log)
+        loggers.append(hydra.utils.instantiate(log))
+        loggers[-1].log_hyperparams(
+            flatten_dict(OmegaConf.to_container(cfg, resolve=True))
+        )
+
+    model = PanopticSegmentation(cfg)
+
+    logger.info(flatten_dict(OmegaConf.to_container(cfg, resolve=True)))
+    return cfg, model, loggers
+
+
+@hydra.main(config_path="conf", config_name="config_panoptic_4d.yaml")
+def train(cfg: DictConfig):
+    os.chdir(hydra.utils.get_original_cwd())
+    cfg, model, loggers = get_parameters(cfg)
+    callbacks = []
+    for cb in cfg.callbacks:
+        callbacks.append(hydra.utils.instantiate(cb))
+
+    callbacks.append(RegularCheckpointing())
+    # torch.use_deterministic_algorithms(True)
+    runner = Trainer(
+        logger=loggers,
+        accelerator='gpu',
+        devices=1,
+        callbacks=callbacks,
+        default_root_dir=str(cfg.general.save_dir),
+        **cfg.trainer,
+    )
+    runner.fit(model, ckpt_path=cfg.general.ckpt_path)
+
+
+@hydra.main(config_path="conf", config_name="config_panoptic_4d.yaml")
+def validate(cfg: DictConfig):
+    # because hydra wants to change dir for some reason
+    os.chdir(hydra.utils.get_original_cwd())
+    cfg, model, loggers = get_parameters(cfg)
+    runner = Trainer(
+        logger=loggers,
+        accelerator='gpu',
+        devices=1,
+        default_root_dir=str(cfg.general.save_dir),
+    )
+    runner.validate(model=model, ckpt_path=cfg.general.ckpt_path)
+
+
+@hydra.main(config_path="conf", config_name="config_panoptic_4d.yaml")
+def test(cfg: DictConfig):
+    # because hydra wants to change dir for some reason
+    os.chdir(hydra.utils.get_original_cwd())
+    cfg, model, loggers = get_parameters(cfg)
+    runner = Trainer(
+        logger=loggers,
+        accelerator='gpu',
+        devices=1,
+        default_root_dir=str(cfg.general.save_dir),
+    )
+    runner.test(model=model, ckpt_path=cfg.general.ckpt_path)
+
+@hydra.main(config_path="conf", config_name="config_panoptic_4d.yaml")
+def main(cfg: DictConfig):
+    if cfg['general']['mode'] == "train":
+        train(cfg)
+    elif cfg['general']['mode'] == "validate":
+        validate(cfg)
+    else:
+        test(cfg)
+
+if __name__ == "__main__":
+    main()
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100644
index 0000000..369c63f
--- /dev/null
+++ 
b/models/__init__.py @@ -0,0 +1,35 @@ +import models.resunet as resunet +import models.res16unet as res16unet +from models.res16unet import Res16UNet34C, STRes16UNet34C +from models.mask4d import Mask4D + +MODELS = [] + + +def add_models(module): + MODELS.extend([getattr(module, a) for a in dir(module) if "Net" in a]) + + +add_models(resunet) +add_models(res16unet) + + +def get_models(): + """Returns a tuple of sample models.""" + return MODELS + + +def load_model(name): + """Creates and returns an instance of the model given its class name.""" + # Find the model class from its name + all_models = get_models() + mdict = {model.__name__: model for model in all_models} + if name not in mdict: + print("Invalid model index. Options are:") + # Display a list of valid model names + for model in all_models: + print(f"\t* {model.__name__}") + return None + NetClass = mdict[name] + + return NetClass diff --git a/models/criterion.py b/models/criterion.py new file mode 100644 index 0000000..26c9625 --- /dev/null +++ b/models/criterion.py @@ -0,0 +1,158 @@ +import torch +import torch.nn.functional as F +from torch import nn + +def dice_loss(inputs: torch.Tensor, targets: torch.Tensor, num_masks: float): + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(-1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_masks + + +dice_loss_jit = torch.jit.script( + dice_loss +) # type: torch.jit.ScriptModule + + +def sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor, num_masks: float): + loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + return loss.mean(1).sum() / num_masks + + +sigmoid_ce_loss_jit = torch.jit.script( + sigmoid_ce_loss +) # type: torch.jit.ScriptModule + + +def box_loss(inputs: torch.Tensor, targets: torch.Tensor, num_bboxs: float): + loss = F.l1_loss(inputs, targets, reduction="none") + return loss.mean(1).sum() / num_bboxs + + +box_loss_jit = torch.jit.script( + box_loss +) # type: torch.jit.ScriptModule + + +class SetCriterion(nn.Module): + """This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(num_classes + 1) + empty_weight[-1] = self.eos_coef + + self.register_buffer("empty_weight", empty_weight) + + def loss_labels(self, outputs, targets, indices): + src_logits = outputs["pred_logits"].float() + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight, ignore_index=255) + losses = {"loss_ce": loss_ce} + return losses + + def loss_masks(self, outputs, targets, indices): + loss_masks = [] + loss_dices = [] + + for batch_id, (map_id, target_id) in enumerate(indices): + map = outputs["pred_masks"][batch_id][:, map_id].T + target_mask = targets[batch_id]["masks"][target_id].float() + num_masks = target_mask.shape[0] + + loss_masks.append(sigmoid_ce_loss_jit(map, target_mask, num_masks)) + loss_dices.append(dice_loss_jit(map, target_mask, num_masks)) + return { + "loss_mask": torch.sum(torch.stack(loss_masks)), + "loss_dice": torch.sum(torch.stack(loss_dices)) + } + + def loss_bboxs(self, outputs, targets, indices): + loss_box = torch.zeros(1, device=outputs["pred_bboxs"].device) + for batch_id, (map_id, target_id) in enumerate(indices): + pred_bboxs = outputs["pred_bboxs"][batch_id, map_id, :] + target_bboxs = targets[batch_id]["bboxs"][target_id] + target_classes = targets[batch_id]["labels"][target_id] + keep_things = target_classes < 8 + if torch.any(keep_things): + target_bboxs = target_bboxs[keep_things] + pred_bboxs = pred_bboxs[keep_things] + num_bboxs = target_bboxs.shape[0] + loss_box += box_loss_jit(pred_bboxs, target_bboxs, num_bboxs) + return { + "loss_box": loss_box, + } + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices): + loss_map = { + 'labels': self.loss_labels, + 'masks': self.loss_masks, + 'bboxs': self.loss_bboxs + } + return loss_map[loss](outputs, targets, indices) + + def forward(self, outputs, targets): + """This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. 
+ The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss(loss, aux_outputs, targets, indices) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses diff --git a/models/mask4d.py b/models/mask4d.py new file mode 100644 index 0000000..32debbc --- /dev/null +++ b/models/mask4d.py @@ -0,0 +1,273 @@ +import torch +import hydra +import torch.nn as nn +import MinkowskiEngine.MinkowskiOps as me +from MinkowskiEngine.MinkowskiPooling import MinkowskiAvgPooling +from models.modules.common import conv +from models.position_embedding import PositionEmbeddingCoordsSine +from third_party.pointnet2.pointnet2_utils import furthest_point_sample +from models.modules.helpers_3detr import GenericMLP +from torch.cuda.amp import autocast +from models.modules.attention import CrossAttentionLayer, SelfAttentionLayer, FFNLayer + + +class Mask4D(nn.Module): + def __init__(self, backbone, num_queries, num_heads, num_decoders, num_levels, sample_sizes, + mask_dim, dim_feedforward, num_labels): + super().__init__() + self.backbone = hydra.utils.instantiate(backbone) + self.num_queries = num_queries + self.num_heads = num_heads + self.num_decoders = num_decoders + self.num_levels = num_levels + self.sample_sizes = sample_sizes + sizes = self.backbone.PLANES[-5:] + + self.point_features_head = conv( + self.backbone.PLANES[7], mask_dim, kernel_size=1, stride=1, bias=True, D=3 + ) + + self.query_projection = GenericMLP( + input_dim=mask_dim, + hidden_dims=[mask_dim], + output_dim=mask_dim, + use_conv=True, + output_use_activation=True, + hidden_use_bias=True, + ) + + self.mask_embed_head = nn.Sequential( + nn.Linear(mask_dim, mask_dim), + nn.ReLU(), + nn.Linear(mask_dim, mask_dim) + ) + + self.bbox_embed_head = nn.Sequential( + nn.Linear(mask_dim, mask_dim), + nn.ReLU(), + nn.Linear(mask_dim, mask_dim), + nn.ReLU(), + nn.Linear(mask_dim, 6), + nn.Sigmoid() + ) + + self.class_embed_head = nn.Linear(mask_dim, num_labels + 1) + self.pos_enc = PositionEmbeddingCoordsSine(d_pos=mask_dim) + self.temporal_pos_enc = PositionEmbeddingCoordsSine(d_in=1, d_pos=mask_dim) + self.pooling = MinkowskiAvgPooling(kernel_size=2, stride=2, dimension=3) + + self.cross_attention = nn.ModuleList() + self.self_attention = nn.ModuleList() + self.ffn_attention = nn.ModuleList() + self.lin_squeeze = nn.ModuleList() + + for hlevel in range(self.num_levels): + self.cross_attention.append( + CrossAttentionLayer( + d_model=mask_dim, + nhead=self.num_heads, + ) + ) + self.lin_squeeze.append(nn.Linear(sizes[hlevel], mask_dim)) + self.self_attention.append( + SelfAttentionLayer( + d_model=mask_dim, + nhead=self.num_heads, + ) + ) + self.ffn_attention.append( + FFNLayer( + d_model=mask_dim, + dim_feedforward=dim_feedforward, + ) + ) + + self.decoder_norm = nn.LayerNorm(mask_dim) + + def forward(self, x, raw_coordinates=None, is_eval=False): + 
device = x.device + all_features = self.backbone(x) + point_features = self.point_features_head(all_features[-1]) + + with torch.no_grad(): + coordinates = me.SparseTensor(features=raw_coordinates, coordinates=x.C, device=device) + pos_encodings_pcd = self.get_pos_encs(coordinates) + + sampled_coords = [] + mins = [] + maxs = [] + for coords, feats in zip(x.decomposed_coordinates, coordinates.decomposed_features): + fps_idx = furthest_point_sample(coords[None, ...].float(), self.num_queries).squeeze(0).long() + sampled_coords.append(feats[fps_idx, :3]) + mins.append(feats[:, :3].min(dim=0)[0]) + maxs.append(feats[:, :3].max(dim=0)[0]) + + sampled_coords = torch.stack(sampled_coords) + mins = torch.stack(mins) + maxs = torch.stack(maxs) + + query_pos = self.pos_enc(sampled_coords.float(),input_range=[mins, maxs]) # Batch, Dim, queries + query_pos = self.query_projection(query_pos) + + queries = torch.zeros_like(query_pos).permute((0, 2, 1)) + query_pos = query_pos.permute((2, 0, 1)) + + predictions_class = [] + predictions_bbox = [] + predictions_mask = [] + + for _ in range(self.num_decoders): + for hlevel in range(self.num_levels): + output_class, outputs_bbox, outputs_mask, attn_mask = self.mask_module(queries, + point_features, + self.num_levels - hlevel) + + decomposed_feat = all_features[hlevel].decomposed_features + decomposed_attn = attn_mask.decomposed_features + + pcd_sizes = [pcd.shape[0] for pcd in decomposed_feat] + curr_sample_size = max(pcd_sizes) + + if not is_eval: + curr_sample_size = min(curr_sample_size, self.sample_sizes[hlevel]) + + rand_idx, mask_idx = self.get_random_samples(pcd_sizes, curr_sample_size, device) + + batched_feat = torch.stack([ + feat[idx, :] for feat, idx in zip(decomposed_feat, rand_idx) + ]) + + batched_attn = torch.stack([ + attn[idx, :] for attn, idx in zip(decomposed_attn, rand_idx) + ]) + + batched_pos_enc = torch.stack([ + pos_enc[idx, :] for pos_enc, idx in zip(pos_encodings_pcd[hlevel], rand_idx) + ]) + + batched_attn.permute((0, 2, 1))[batched_attn.sum(1) == curr_sample_size] = False + + m = torch.stack(mask_idx) + batched_attn = torch.logical_or(batched_attn, m[..., None]) + + src_pcd = self.lin_squeeze[hlevel](batched_feat.permute((1, 0, 2))) + + output = self.cross_attention[hlevel]( + queries.permute((1, 0, 2)), + src_pcd, + memory_mask=batched_attn.repeat_interleave(self.num_heads, dim=0).permute((0, 2, 1)), + memory_key_padding_mask=None, # here we do not apply masking on padded region + pos=batched_pos_enc.permute((1, 0, 2)), + query_pos=query_pos + ) + + output = self.self_attention[hlevel]( + output, tgt_mask=None, + tgt_key_padding_mask=None, + query_pos=query_pos + ) + + # FFN + queries = self.ffn_attention[hlevel](output).permute((1, 0, 2)) + + predictions_class.append(output_class) + predictions_bbox.append(outputs_bbox) + predictions_mask.append(outputs_mask) + + output_class, outputs_bbox, outputs_mask = self.mask_module(queries, point_features) + predictions_class.append(output_class) + predictions_bbox.append(outputs_bbox) + predictions_mask.append(outputs_mask) + + return { + 'pred_logits': predictions_class[-1], + 'pred_bboxs': predictions_bbox[-1], + 'pred_masks': predictions_mask[-1], + 'aux_outputs': self._set_aux_loss( + predictions_class, predictions_bbox, predictions_mask + ) + } + + def mask_module(self, query_feat, point_features, num_pooling_steps=0): + query_feat = self.decoder_norm(query_feat) + mask_embed = self.mask_embed_head(query_feat) + outputs_class = self.class_embed_head(query_feat) + outputs_bbox = 
self.bbox_embed_head(query_feat) + + output_masks = [] + + for feat, embed in zip(point_features.decomposed_features, mask_embed): + output_masks.append(feat @ embed.T) + + output_masks = torch.cat(output_masks) + outputs_mask = me.SparseTensor(features=output_masks, + coordinate_manager=point_features.coordinate_manager, + coordinate_map_key=point_features.coordinate_map_key) + + if num_pooling_steps != 0: + attn_mask = outputs_mask + for _ in range(num_pooling_steps): + attn_mask = self.pooling(attn_mask.float()) + + attn_mask = me.SparseTensor(features=(attn_mask.F.detach().sigmoid() < 0.5), + coordinate_manager=attn_mask.coordinate_manager, + coordinate_map_key=attn_mask.coordinate_map_key) + + return outputs_class, outputs_bbox, outputs_mask.decomposed_features, attn_mask + + return outputs_class, outputs_bbox, outputs_mask.decomposed_features + + def get_pos_encs(self, coordinates): + pos_encodings_pcd = [] + + for _ in range(self.num_levels + 1): + pos_encodings_pcd.append([]) + + for coords_batch in coordinates.decomposed_features: + scene_min = coords_batch.min(dim=0)[0][None, ...] + scene_max = coords_batch.max(dim=0)[0][None, ...] + + with autocast(enabled=False): + tmp = self.pos_enc(coords_batch[None, :, :3].float(), + input_range=[scene_min[:, :3], scene_max[:, :3]]) + tmp += self.temporal_pos_enc(coords_batch[None, :, 3].float(), + input_range=[scene_min[:, 3:4], scene_max[:, 3:4]]) + + pos_encodings_pcd[-1].append(tmp.squeeze(0).permute((1, 0))) + + coordinates = self.pooling(coordinates) + + pos_encodings_pcd.reverse() + + return pos_encodings_pcd + + def get_random_samples(self, pcd_sizes, curr_sample_size, device): + rand_idx = [] + mask_idx = [] + for pcd_size in pcd_sizes: + if pcd_size <= curr_sample_size: + # we do not need to sample + # take all points and pad the rest with zeroes and mask it + idx = torch.zeros(curr_sample_size, dtype=torch.long, device=device) + midx = torch.ones(curr_sample_size, dtype=torch.bool, device=device) + idx[:pcd_size] = torch.arange(pcd_size, device=device) + midx[:pcd_size] = False # attend to first points + else: + # we have more points in pcd as we like to sample + # take a subset (no padding or masking needed) + idx = torch.randperm(pcd_size, device=device)[:curr_sample_size] + midx = torch.zeros(curr_sample_size, dtype=torch.bool, device=device) + + rand_idx.append(idx) + mask_idx.append(midx) + return rand_idx, mask_idx + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_bbox, outputs_seg_masks): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + {"pred_logits": a, "pred_bboxs": b, "pred_masks": c} + for a, b, c in zip(outputs_class[:-1], outputs_bbox[:-1], outputs_seg_masks[:-1]) + ] diff --git a/models/matcher.py b/models/matcher.py new file mode 100644 index 0000000..dc90b33 --- /dev/null +++ b/models/matcher.py @@ -0,0 +1,144 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py +""" +Modules to compute the matching cost and solve the corresponding LSAP. 
+""" +import torch +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment +from torch import nn +from torch.cuda.amp import autocast + + +def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) + denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) + return loss + + +batch_dice_loss_jit = torch.jit.script( + batch_dice_loss +) # type: torch.jit.ScriptModule + + +def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): + """ + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + Returns: + Loss tensor + """ + hw = inputs.shape[1] + + pos = F.binary_cross_entropy_with_logits( + inputs, torch.ones_like(inputs), reduction="none" + ) + neg = F.binary_cross_entropy_with_logits( + inputs, torch.zeros_like(inputs), reduction="none" + ) + + loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( + "nc,mc->nm", neg, (1 - targets) + ) + + return loss / hw + + +batch_sigmoid_ce_loss_jit = torch.jit.script( + batch_sigmoid_ce_loss +) # type: torch.jit.ScriptModule + + +class HungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). 
+ """ + + def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, + cost_box: float = 1): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost + cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_mask = cost_mask + self.cost_dice = cost_dice + self.cost_box = cost_box + + @torch.no_grad() + def forward(self, outputs, targets): + """Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["pred_logits"].shape[:2] + + indices = [] + + # Iterate through batch size + for b in range(bs): + + out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] + tgt_ids = targets[b]["labels"] + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. 
+ cost_class = -out_prob[:, tgt_ids] + + out_mask = outputs['pred_masks'][b].T # [num_queries, H_pred, W_pred] + # gt masks are already padded when preparing target + tgt_mask = targets[b]["masks"].to(out_mask) + + with autocast(enabled=False): + out_mask = out_mask.float() + tgt_mask = tgt_mask.float() + # Compute the focal loss between masks + cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) + + # Compute the dice loss betwen masks + cost_dice = batch_dice_loss_jit(out_mask, tgt_mask) + + # Final cost matrix + C = ( + self.cost_mask * cost_mask + + self.cost_class * cost_class + + self.cost_dice * cost_dice + ) + C = C.reshape(num_queries, -1).cpu() + + indices.append(linear_sum_assignment(C)) + + return [ + (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) + for i, j in indices + ] diff --git a/models/metrics/__init__.py b/models/metrics/__init__.py new file mode 100644 index 0000000..94f2156 --- /dev/null +++ b/models/metrics/__init__.py @@ -0,0 +1 @@ +from .panoptic_quality import Panoptic4DEval \ No newline at end of file diff --git a/models/metrics/panoptic_quality.py b/models/metrics/panoptic_quality.py new file mode 100644 index 0000000..6e09ca6 --- /dev/null +++ b/models/metrics/panoptic_quality.py @@ -0,0 +1,146 @@ +import numpy as np +import math + + +class Panoptic4DEval: + def __init__(self, n_classes, min_stuff_cls_id, ignore=0, offset=2**32, min_points=50): + self.n_classes = n_classes + 1 + self.ignore = ignore + self.include = np.array([n for n in range(self.n_classes) if n != self.ignore], dtype=np.int64) + self.min_stuff_cls_id = min_stuff_cls_id + self.reset() + self.offset = offset # largest number of instances in a given scan + self.min_points = min_points # smallest number of points to consider instances in gt + self.eps = 1e-15 + + def reset(self): + # iou stuff + self.px_iou_conf_matrix = np.zeros((self.n_classes, self.n_classes), dtype=np.int64) + self.sequences = [] + self.preds = {} + self.gts = {} + self.intersects = {} + + def addBatchSemIoU(self, x_sem, y_sem): + # idxs are labels and predictions + idxs = np.stack([x_sem, y_sem], axis=0) + + # make confusion matrix (cols = gt, rows = pred) + np.add.at(self.px_iou_conf_matrix, tuple(idxs), 1) + + def getSemIoUStats(self): + conf = self.px_iou_conf_matrix.copy().astype(np.double) + conf[:, self.ignore] = 0 + + # get the clean stats + tp = conf.diagonal() + fp = conf.sum(axis=1) - tp + fn = conf.sum(axis=0) - tp + return tp, fp, fn + + def getSemIoU(self): + tp, fp, fn = self.getSemIoUStats() + intersection = tp + union = tp + fp + fn + union = np.maximum(union, self.eps) + iou = intersection[self.include].astype(np.double) / union[self.include].astype(np.double) + iou_mean = iou.mean() + + return iou_mean, iou + + def update_dict_stat(self, stat_dict, unique_ids, unique_cnts): + for uniqueid, counts in zip(unique_ids, unique_cnts): + if uniqueid == 1: continue # 1 -- no instance + if uniqueid in stat_dict: + stat_dict[uniqueid] += counts + else: + stat_dict[uniqueid] = counts + + def addBatchPanoptic4D(self, seq, x_sem_row, x_inst_row, y_sem_row, y_inst_row): + if seq not in self.sequences: + self.sequences.append(seq) + self.preds[seq] = {} + self.gts[seq] = [{} for i in range(self.n_classes)] + self.intersects[seq] = [{} for i in range(self.n_classes)] + + # make sure instances are not zeros (it messes with my approach) + x_inst_row = x_inst_row + 1 + y_inst_row = y_inst_row + 1 + + preds = self.preds[seq] + # generate the areas for each unique instance prediction (i.e., 
set1) + unique_pred, counts_pred = np.unique(x_inst_row, return_counts=True) + self.update_dict_stat(preds, unique_pred, counts_pred) + + for cl in self.include: + # Per-class accumulated stats + cl_gts = self.gts[seq][cl] + cl_intersects = self.intersects[seq][cl] + + # get a binary class mask (filter acc. to semantic class!) + y_inst_in_cl_mask = y_sem_row == cl + + # get instance points in class (mask-out everything but _this_ class) + y_inst_in_cl = y_inst_row * y_inst_in_cl_mask.astype(np.int64) + + # generate the areas for each unique instance gt_np (i.e., set2) + unique_gt, counts_gt = np.unique(y_inst_in_cl[y_inst_in_cl > 0], return_counts=True) + self.update_dict_stat(cl_gts, unique_gt[counts_gt>self.min_points], counts_gt[counts_gt>self.min_points]) + y_inst_in_cl[np.isin(y_inst_in_cl, unique_gt[counts_gt<=self.min_points])] = 0 + + # generate intersection using offset + offset_combo = x_inst_row[y_inst_in_cl > 0] + self.offset * y_inst_in_cl[y_inst_in_cl > 0] + unique_combo, counts_combo = np.unique(offset_combo, return_counts=True) + + self.update_dict_stat(cl_intersects, unique_combo, counts_combo) + + def getPQ4D(self): + pan_aq = np.zeros(self.n_classes, dtype=np.double) + pan_aq_ovr = 0.0 + num_tubes = [0] * self.n_classes + + for seq in self.sequences: + preds = self.preds[seq] + for cl in range(self.n_classes): + cl_gts = self.gts[seq][cl] + cl_intersects = self.intersects[seq][cl] + outer_sum_iou = 0.0 + for gt_id, gt_size in cl_gts.items(): + num_tubes[cl] += 1 + inner_sum_iou = 0.0 + for pr_id, pr_size in preds.items(): + TPA_key = pr_id + self.offset * gt_id + if TPA_key in cl_intersects: + TPA_ovr = cl_intersects[TPA_key] + inner_sum_iou += TPA_ovr * (TPA_ovr / (gt_size + pr_size - TPA_ovr)) + outer_sum_iou += float(inner_sum_iou) / float(gt_size) + pan_aq[cl] += outer_sum_iou + pan_aq_ovr += outer_sum_iou + + AQ_overall = np.sum(pan_aq_ovr)/ np.sum(num_tubes[1:self.min_stuff_cls_id]) + AQ = pan_aq / np.maximum(num_tubes, self.eps) + + iou_mean, iou = self.getSemIoU() + + PQ4D = math.sqrt(AQ_overall*iou_mean) + return PQ4D, AQ_overall, AQ[self.include], iou_mean, iou + + def addBatch(self, x_sem, x_inst, y_sem, y_inst, indices, seq): # x=preds, y=targets + x_sem = x_sem[indices] + x_inst = x_inst[indices] + y_sem = y_sem[indices] + y_inst = y_inst[indices] + + # only interested in points that are outside the void area (not in excluded classes) + gt_not_in_excl_mask = y_sem != self.ignore + # remove all other points + x_sem = x_sem[gt_not_in_excl_mask] + y_sem = y_sem[gt_not_in_excl_mask] + x_inst = x_inst[gt_not_in_excl_mask] + y_inst = y_inst[gt_not_in_excl_mask] + + # add to IoU calculation (for checking purposes) + self.addBatchSemIoU(x_sem, y_sem) + + # now do the panoptic stuff + self.addBatchPanoptic4D(seq, x_sem, x_inst, y_sem, y_inst) \ No newline at end of file diff --git a/models/model.py b/models/model.py new file mode 100644 index 0000000..9d2dfa2 --- /dev/null +++ b/models/model.py @@ -0,0 +1,27 @@ +from MinkowskiEngine import MinkowskiNetwork + + +class Model(MinkowskiNetwork): + """ + Base network for all sparse convnet + + By default, all networks are segmentation networks. 
+ """ + + OUT_PIXEL_DIST = -1 + + def __init__(self, in_channels, out_channels, config, D, **kwargs): + super().__init__(D) + self.in_channels = in_channels + self.out_channels = out_channels + self.config = config + + +class HighDimensionalModel(Model): + """ + Base network for all spatio (temporal) chromatic sparse convnet + """ + + def __init__(self, in_channels, out_channels, config, D, **kwargs): + assert D > 4, "Num dimension smaller than 5" + super().__init__(in_channels, out_channels, config, D, **kwargs) diff --git a/models/modules/__init__.py b/models/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/modules/attention.py b/models/modules/attention.py new file mode 100644 index 0000000..b3b7382 --- /dev/null +++ b/models/modules/attention.py @@ -0,0 +1,115 @@ +import torch.nn as nn +from torch.nn import functional as F + + +class SelfAttentionLayer(nn.Module): + + def __init__(self, d_model, nhead, dropout=0.0, activation="relu"): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward(self, tgt, + tgt_mask = None, + tgt_key_padding_mask= None, + query_pos = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + +class CrossAttentionLayer(nn.Module): + + def __init__(self, d_model, nhead, dropout=0.0, activation="relu"): + super().__init__() + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward(self, tgt, memory, + memory_mask = None, + memory_key_padding_mask = None, + pos = None, + query_pos = None): + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + +class FFNLayer(nn.Module): + + def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu"): + super().__init__() + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm = nn.LayerNorm(d_model) + + self.activation = _get_activation_fn(activation) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward(self, tgt): + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + return tgt + + +def 
_get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + diff --git a/models/modules/common.py b/models/modules/common.py new file mode 100644 index 0000000..1678209 --- /dev/null +++ b/models/modules/common.py @@ -0,0 +1,258 @@ +import sys + +if sys.version_info[:2] >= (3, 8): + from collections.abc import Sequence +else: + from collections import Sequence + +from enum import Enum + +import torch.nn as nn +import MinkowskiEngine as ME + + +class NormType(Enum): + BATCH_NORM = 0 + INSTANCE_NORM = 1 + INSTANCE_BATCH_NORM = 2 + + +def get_norm(norm_type, n_channels, D, bn_momentum=0.1): + if norm_type == NormType.BATCH_NORM: + return ME.MinkowskiBatchNorm(n_channels, momentum=bn_momentum) + elif norm_type == NormType.INSTANCE_NORM: + return ME.MinkowskiInstanceNorm(n_channels) + elif norm_type == NormType.INSTANCE_BATCH_NORM: + return nn.Sequential( + ME.MinkowskiInstanceNorm(n_channels), + ME.MinkowskiBatchNorm(n_channels, momentum=bn_momentum), + ) + else: + raise ValueError(f"Norm type: {norm_type} not supported") + + +class ConvType(Enum): + """ + Define the kernel region type + """ + + HYPERCUBE = 0, "HYPERCUBE" + SPATIAL_HYPERCUBE = 1, "SPATIAL_HYPERCUBE" + SPATIO_TEMPORAL_HYPERCUBE = 2, "SPATIO_TEMPORAL_HYPERCUBE" + HYPERCROSS = 3, "HYPERCROSS" + SPATIAL_HYPERCROSS = 4, "SPATIAL_HYPERCROSS" + SPATIO_TEMPORAL_HYPERCROSS = 5, "SPATIO_TEMPORAL_HYPERCROSS" + SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS = 6, "SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS " + + def __new__(cls, value, name): + member = object.__new__(cls) + member._value_ = value + member.fullname = name + return member + + def __int__(self): + return self.value + + +# Covert the ConvType var to a RegionType var +conv_to_region_type = { + # kernel_size = [k, k, k, 1] + ConvType.HYPERCUBE: ME.RegionType.HYPER_CUBE, + ConvType.SPATIAL_HYPERCUBE: ME.RegionType.HYPER_CUBE, + ConvType.SPATIO_TEMPORAL_HYPERCUBE: ME.RegionType.HYPER_CUBE, + ConvType.HYPERCROSS: ME.RegionType.HYPER_CROSS, + ConvType.SPATIAL_HYPERCROSS: ME.RegionType.HYPER_CROSS, + ConvType.SPATIO_TEMPORAL_HYPERCROSS: ME.RegionType.HYPER_CROSS, + ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS: ME.RegionType.HYPER_CUBE, # JONAS CHANGE from HYBRID +} + +# int_to_region_type = {m.value: m for m in ME.RegionType} +int_to_region_type = {m: ME.RegionType(m) for m in range(3)} + + +def convert_region_type(region_type): + """ + Convert the integer region_type to the corresponding RegionType enum object. 
+ """ + return int_to_region_type[region_type] + + +def convert_conv_type(conv_type, kernel_size, D): + assert isinstance(conv_type, ConvType), "conv_type must be of ConvType" + region_type = conv_to_region_type[conv_type] + axis_types = None + if conv_type == ConvType.SPATIAL_HYPERCUBE: + # No temporal convolution + if isinstance(kernel_size, Sequence): + kernel_size = kernel_size[:3] + else: + kernel_size = [ + kernel_size, + ] * 3 + if D == 4: + kernel_size.append(1) + elif conv_type == ConvType.SPATIO_TEMPORAL_HYPERCUBE: + # conv_type conversion already handled + assert D == 4 + elif conv_type == ConvType.HYPERCUBE: + # conv_type conversion already handled + pass + elif conv_type == ConvType.SPATIAL_HYPERCROSS: + if isinstance(kernel_size, Sequence): + kernel_size = kernel_size[:3] + else: + kernel_size = [ + kernel_size, + ] * 3 + if D == 4: + kernel_size.append(1) + elif conv_type == ConvType.HYPERCROSS: + # conv_type conversion already handled + pass + elif conv_type == ConvType.SPATIO_TEMPORAL_HYPERCROSS: + # conv_type conversion already handled + assert D == 4 + elif conv_type == ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS: + # Define the CUBIC conv kernel for spatial dims and CROSS conv for temp dim + axis_types = [ + ME.RegionType.HYPER_CUBE, + ] * 3 + if D == 4: + axis_types.append(ME.RegionType.HYPER_CROSS) + return region_type, axis_types, kernel_size + + +def conv( + in_planes, + out_planes, + kernel_size, + stride=1, + dilation=1, + bias=False, + conv_type=ConvType.HYPERCUBE, + D=-1, +): + assert D > 0, "Dimension must be a positive integer" + region_type, axis_types, kernel_size = convert_conv_type(conv_type, kernel_size, D) + kernel_generator = ME.KernelGenerator( + kernel_size, + stride, + dilation, + region_type=region_type, + axis_types=None, # axis_types JONAS + dimension=D, + ) + + return ME.MinkowskiConvolution( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + bias=bias, + kernel_generator=kernel_generator, + dimension=D, + ) + + +def conv_tr( + in_planes, + out_planes, + kernel_size, + upsample_stride=1, + dilation=1, + bias=False, + conv_type=ConvType.HYPERCUBE, + D=-1, +): + assert D > 0, "Dimension must be a positive integer" + region_type, axis_types, kernel_size = convert_conv_type(conv_type, kernel_size, D) + kernel_generator = ME.KernelGenerator( + kernel_size, + upsample_stride, + dilation, + region_type=region_type, + axis_types=axis_types, + dimension=D, + ) + + return ME.MinkowskiConvolutionTranspose( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=kernel_size, + stride=upsample_stride, + dilation=dilation, + bias=bias, + kernel_generator=kernel_generator, + dimension=D, + ) + + +def avg_pool( + kernel_size, + stride=1, + dilation=1, + conv_type=ConvType.HYPERCUBE, + in_coords_key=None, + D=-1, +): + assert D > 0, "Dimension must be a positive integer" + region_type, axis_types, kernel_size = convert_conv_type(conv_type, kernel_size, D) + kernel_generator = ME.KernelGenerator( + kernel_size, + stride, + dilation, + region_type=region_type, + axis_types=axis_types, + dimension=D, + ) + + return ME.MinkowskiAvgPooling( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + kernel_generator=kernel_generator, + dimension=D, + ) + + +def avg_unpool(kernel_size, stride=1, dilation=1, conv_type=ConvType.HYPERCUBE, D=-1): + assert D > 0, "Dimension must be a positive integer" + region_type, axis_types, kernel_size = convert_conv_type(conv_type, kernel_size, D) 
+ kernel_generator = ME.KernelGenerator( + kernel_size, + stride, + dilation, + region_type=region_type, + axis_types=axis_types, + dimension=D, + ) + + return ME.MinkowskiAvgUnpooling( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + kernel_generator=kernel_generator, + dimension=D, + ) + + +def sum_pool(kernel_size, stride=1, dilation=1, conv_type=ConvType.HYPERCUBE, D=-1): + assert D > 0, "Dimension must be a positive integer" + region_type, axis_types, kernel_size = convert_conv_type(conv_type, kernel_size, D) + kernel_generator = ME.KernelGenerator( + kernel_size, + stride, + dilation, + region_type=region_type, + axis_types=axis_types, + dimension=D, + ) + + return ME.MinkowskiSumPooling( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + kernel_generator=kernel_generator, + dimension=D, + ) diff --git a/models/modules/helpers_3detr.py b/models/modules/helpers_3detr.py new file mode 100644 index 0000000..fe73bf7 --- /dev/null +++ b/models/modules/helpers_3detr.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import torch.nn as nn +from functools import partial +import copy + + +class BatchNormDim1Swap(nn.BatchNorm1d): + """ + Used for nn.Transformer that uses a HW x N x C rep + """ + + def forward(self, x): + """ + x: HW x N x C + permute to N x C x HW + Apply BN on C + permute back + """ + hw, n, c = x.shape + x = x.permute(1, 2, 0) + x = super(BatchNormDim1Swap, self).forward(x) + # x: n x c x hw -> hw x n x c + x = x.permute(2, 0, 1) + return x + + +NORM_DICT = { + "bn": BatchNormDim1Swap, + "bn1d": nn.BatchNorm1d, + "id": nn.Identity, + "ln": nn.LayerNorm, +} + +ACTIVATION_DICT = { + "relu": nn.ReLU, + "gelu": nn.GELU, + "leakyrelu": partial(nn.LeakyReLU, negative_slope=0.1), +} + +WEIGHT_INIT_DICT = { + "xavier_uniform": nn.init.xavier_uniform_, +} + + +class GenericMLP(nn.Module): + def __init__( + self, + input_dim, + hidden_dims, + output_dim, + norm_fn_name=None, + activation="relu", + use_conv=False, + dropout=None, + hidden_use_bias=False, + output_use_bias=True, + output_use_activation=False, + output_use_norm=False, + weight_init_name=None, + ): + super().__init__() + activation = ACTIVATION_DICT[activation] + norm = None + if norm_fn_name is not None: + norm = NORM_DICT[norm_fn_name] + if norm_fn_name == "ln" and use_conv: + norm = lambda x: nn.GroupNorm(1, x) # easier way to use LayerNorm + + if dropout is not None: + if not isinstance(dropout, list): + dropout = [dropout for _ in range(len(hidden_dims))] + + layers = [] + prev_dim = input_dim + for idx, x in enumerate(hidden_dims): + if use_conv: + layer = nn.Conv1d(prev_dim, x, 1, bias=hidden_use_bias) + else: + layer = nn.Linear(prev_dim, x, bias=hidden_use_bias) + layers.append(layer) + if norm: + layers.append(norm(x)) + layers.append(activation()) + if dropout is not None: + layers.append(nn.Dropout(p=dropout[idx])) + prev_dim = x + if use_conv: + layer = nn.Conv1d(prev_dim, output_dim, 1, bias=output_use_bias) + else: + layer = nn.Linear(prev_dim, output_dim, bias=output_use_bias) + layers.append(layer) + + if output_use_norm: + layers.append(norm(output_dim)) + + if output_use_activation: + layers.append(activation()) + + self.layers = nn.Sequential(*layers) + + if weight_init_name is not None: + self.do_weight_init(weight_init_name) + + def do_weight_init(self, weight_init_name): + func = WEIGHT_INIT_DICT[weight_init_name] + for (_, param) in self.named_parameters(): + if param.dim() > 1: # skips batchnorm/layernorm + func(param) + + def forward(self, 
x): + output = self.layers(x) + return output + + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) \ No newline at end of file diff --git a/models/modules/resnet_block.py b/models/modules/resnet_block.py new file mode 100644 index 0000000..0ffafaa --- /dev/null +++ b/models/modules/resnet_block.py @@ -0,0 +1,149 @@ +import torch.nn as nn +from MinkowskiEngine import MinkowskiReLU + +from models.modules.common import ConvType, NormType, conv, get_norm + + +class BasicBlockBase(nn.Module): + expansion = 1 + NORM_TYPE = NormType.BATCH_NORM + + def __init__( + self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + conv_type=ConvType.HYPERCUBE, + bn_momentum=0.1, + D=3, + ): + super().__init__() + + self.conv1 = conv( + inplanes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + conv_type=conv_type, + D=D, + ) + self.norm1 = get_norm(self.NORM_TYPE, planes, D, bn_momentum=bn_momentum) + self.conv2 = conv( + planes, + planes, + kernel_size=3, + stride=1, + dilation=dilation, + bias=False, + conv_type=conv_type, + D=D, + ) + self.norm2 = get_norm(self.NORM_TYPE, planes, D, bn_momentum=bn_momentum) + self.relu = MinkowskiReLU(inplace=True) + self.downsample = downsample + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class BasicBlock(BasicBlockBase): + NORM_TYPE = NormType.BATCH_NORM + + +class BasicBlockIN(BasicBlockBase): + NORM_TYPE = NormType.INSTANCE_NORM + + +class BasicBlockINBN(BasicBlockBase): + NORM_TYPE = NormType.INSTANCE_BATCH_NORM + + +class BottleneckBase(nn.Module): + expansion = 4 + NORM_TYPE = NormType.BATCH_NORM + + def __init__( + self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + conv_type=ConvType.HYPERCUBE, + bn_momentum=0.1, + D=3, + ): + super().__init__() + self.conv1 = conv(inplanes, planes, kernel_size=1, D=D) + self.norm1 = get_norm(self.NORM_TYPE, planes, D, bn_momentum=bn_momentum) + + self.conv2 = conv( + planes, + planes, + kernel_size=3, + stride=stride, + dilation=dilation, + conv_type=conv_type, + D=D, + ) + self.norm2 = get_norm(self.NORM_TYPE, planes, D, bn_momentum=bn_momentum) + + self.conv3 = conv(planes, planes * self.expansion, kernel_size=1, D=D) + self.norm3 = get_norm( + self.NORM_TYPE, planes * self.expansion, D, bn_momentum=bn_momentum + ) + + self.relu = MinkowskiReLU(inplace=True) + self.downsample = downsample + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(BottleneckBase): + NORM_TYPE = NormType.BATCH_NORM + + +class BottleneckIN(BottleneckBase): + NORM_TYPE = NormType.INSTANCE_NORM + + +class BottleneckINBN(BottleneckBase): + NORM_TYPE = NormType.INSTANCE_BATCH_NORM diff --git a/models/position_embedding.py b/models/position_embedding.py new file mode 100644 index 0000000..8ad0417 --- /dev/null +++ b/models/position_embedding.py @@ -0,0 +1,65 @@ +import torch +from torch import nn +import numpy as np + + +def shift_scale_points(pred_xyz, input_range): + """ + pred_xyz: B x 
N x 3 + input_range: [[B x 3], [B x 3]] - min and max XYZ coords + dst_range: [[B x 3], [B x 3]] - min and max XYZ coords + """ + dst_range = [ + torch.zeros_like(input_range[0], device=input_range[0].device), + torch.ones_like(input_range[0], device=input_range[0].device), + ] + + src_diff = input_range[1][:, None, :] - input_range[0][:, None, :] + dst_diff = dst_range[1][:, None, :] - dst_range[0][:, None, :] + prop_xyz = ( + ((pred_xyz - input_range[0][:, None, :]) * dst_diff) / src_diff + ) + dst_range[0][:, None, :] + return prop_xyz + + +class PositionEmbeddingCoordsSine(nn.Module): + def __init__( + self, + d_in=3, + d_pos=None, + normalize=True, + ): + super().__init__() + self.d_in = d_in + self.d_pos = d_pos + self.normalize = normalize + + # define a gaussian matrix input_ch -> output_ch + B = torch.empty((d_in, d_pos // 2)).normal_() + self.register_buffer("gauss_B", B) + + @torch.no_grad() + def forward(self, xyz, num_channels=None, input_range=None): + # xyz is batch x npoints x 3 + if num_channels is None: + num_channels = self.gauss_B.shape[1] * 2 + + bsize, npoints = xyz.shape[0], xyz.shape[1] + d_out = num_channels // 2 + + # clone coords so that shift/scale operations do not affect original tensor + orig_xyz = xyz + xyz = orig_xyz.clone() + + if self.normalize: + xyz = shift_scale_points(xyz, input_range=input_range) + + xyz *= 2 * np.pi + xyz_proj = torch.mm(xyz.view(-1, self.d_in), self.gauss_B[:, :d_out]).view( + bsize, npoints, d_out + ) + final_embeds = [xyz_proj.sin(), xyz_proj.cos()] + + # return batch x d_pos x npoints embedding + final_embeds = torch.cat(final_embeds, dim=2).permute(0, 2, 1) + return final_embeds diff --git a/models/res16unet.py b/models/res16unet.py new file mode 100644 index 0000000..8651702 --- /dev/null +++ b/models/res16unet.py @@ -0,0 +1,425 @@ +import MinkowskiEngine.MinkowskiOps as me +from MinkowskiEngine import MinkowskiReLU + +from models.resnet import ResNetBase, get_norm +from models.modules.common import ConvType, NormType, conv, conv_tr +from models.modules.resnet_block import BasicBlock, Bottleneck + + +class Res16UNetBase(ResNetBase): + BLOCK = None + PLANES = (32, 64, 128, 256, 256, 256, 256, 256) + DILATIONS = (1, 1, 1, 1, 1, 1, 1, 1) + LAYERS = (2, 2, 2, 2, 2, 2, 2, 2) + INIT_DIM = 32 + OUT_PIXEL_DIST = 1 + NORM_TYPE = NormType.BATCH_NORM + NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + # To use the model, must call initialize_coords before forward pass. 
+ # Once data is processed, call clear to reset the model before calling initialize_coords + def __init__(self, in_channels, out_channels, config, D=3, **kwargs): + super().__init__(in_channels, out_channels, config, D) + + def network_initialization(self, in_channels, out_channels, config, D): + # Setup net_metadata + dilations = self.DILATIONS + bn_momentum = config.bn_momentum + + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + # Output of the first conv concated to conv6 + self.inplanes = self.INIT_DIM + self.conv0p1s1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + + self.bn0 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + + self.conv1p1s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn1 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block1 = self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + dilation=dilations[0], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv2p2s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn2 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + dilation=dilations[1], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv3p4s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn3 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + dilation=dilations[2], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv4p8s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn4 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + dilation=dilations[3], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr4p16s2 = conv_tr( + self.inplanes, + self.PLANES[4], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr4 = get_norm( + self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion + self.block5 = self._make_layer( + self.BLOCK, + self.PLANES[4], + self.LAYERS[4], + dilation=dilations[4], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr5p8s2 = conv_tr( + self.inplanes, + self.PLANES[5], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr5 = get_norm( + self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[5] + 
self.PLANES[1] * self.BLOCK.expansion + self.block6 = self._make_layer( + self.BLOCK, + self.PLANES[5], + self.LAYERS[5], + dilation=dilations[5], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr6p4s2 = conv_tr( + self.inplanes, + self.PLANES[6], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr6 = get_norm( + self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion + self.block7 = self._make_layer( + self.BLOCK, + self.PLANES[6], + self.LAYERS[6], + dilation=dilations[6], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr7p2s2 = conv_tr( + self.inplanes, + self.PLANES[7], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr7 = get_norm( + self.NORM_TYPE, self.PLANES[7], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[7] + self.INIT_DIM + self.block8 = self._make_layer( + self.BLOCK, + self.PLANES[7], + self.LAYERS[7], + dilation=dilations[7], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + # self.final = conv( + # self.PLANES[7], out_channels, kernel_size=1, stride=1, bias=True, D=D + # ) + self.relu = MinkowskiReLU(inplace=True) + + def forward(self, x): + feature_maps = [] + + out = self.conv0p1s1(x) + out = self.bn0(out) + out_p1 = self.relu(out) + + out = self.conv1p1s2(out_p1) + out = self.bn1(out) + out = self.relu(out) + out_b1p2 = self.block1(out) + + out = self.conv2p2s2(out_b1p2) + out = self.bn2(out) + out = self.relu(out) + out_b2p4 = self.block2(out) + + out = self.conv3p4s2(out_b2p4) + out = self.bn3(out) + out = self.relu(out) + out_b3p8 = self.block3(out) + + # pixel_dist=16 + out = self.conv4p8s2(out_b3p8) + out = self.bn4(out) + out = self.relu(out) + out = self.block4(out) + + feature_maps.append(out) + + # pixel_dist=8 + out = self.convtr4p16s2(out) + out = self.bntr4(out) + out = self.relu(out) + + out = me.cat(out, out_b3p8) + out = self.block5(out) + + feature_maps.append(out) + + # pixel_dist=4 + out = self.convtr5p8s2(out) + out = self.bntr5(out) + out = self.relu(out) + + out = me.cat(out, out_b2p4) + out = self.block6(out) + + feature_maps.append(out) + + # pixel_dist=2 + out = self.convtr6p4s2(out) + out = self.bntr6(out) + out = self.relu(out) + + out = me.cat(out, out_b1p2) + out = self.block7(out) + + feature_maps.append(out) + + # pixel_dist=1 + out = self.convtr7p2s2(out) + out = self.bntr7(out) + out = self.relu(out) + + out = me.cat(out, out_p1) + out = self.block8(out) + + feature_maps.append(out) + + return feature_maps + + +class Res16UNet14(Res16UNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1, 1, 1, 1, 1) + + +class Res16UNet18(Res16UNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2, 2, 2, 2, 2) + + +class Res16UNet34(Res16UNetBase): + BLOCK = BasicBlock + LAYERS = (2, 3, 4, 6, 2, 2, 2, 2) + + +class Res16UNet50(Res16UNetBase): + BLOCK = Bottleneck + LAYERS = (2, 3, 4, 6, 2, 2, 2, 2) + + +class Res16UNet101(Res16UNetBase): + BLOCK = Bottleneck + LAYERS = (2, 3, 4, 23, 2, 2, 2, 2) + + +class Res16UNet14A(Res16UNet14): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class Res16UNet14A2(Res16UNet14A): + LAYERS = (1, 1, 1, 1, 2, 2, 2, 2) + + +class Res16UNet14B(Res16UNet14): + PLANES = (32, 64, 128, 256, 128, 128, 128, 128) + + +class 
Res16UNet14B2(Res16UNet14B): + LAYERS = (1, 1, 1, 1, 2, 2, 2, 2) + + +class Res16UNet14B3(Res16UNet14B): + LAYERS = (2, 2, 2, 2, 1, 1, 1, 1) + + +class Res16UNet14C(Res16UNet14): + PLANES = (32, 64, 128, 256, 192, 192, 128, 128) + + +class Res16UNet14D(Res16UNet14): + PLANES = (32, 64, 128, 256, 384, 384, 384, 384) + + +class Res16UNet18A(Res16UNet18): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class Res16UNet18B(Res16UNet18): + PLANES = (32, 64, 128, 256, 128, 128, 128, 128) + + +class Res16UNet18D(Res16UNet18): + PLANES = (32, 64, 128, 256, 384, 384, 384, 384) + + +class Res16UNet34A(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 64, 64) + + +class Res16UNet34B(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 64, 32) + + +class Res16UNet34C(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 96, 96) + +class Custom30M(Res16UNet34): + PLANES = (32, 64, 128, 256, 128, 64, 64, 32) + +class Res16UNet34D(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 96, 128) + + +class STRes16UNetBase(Res16UNetBase): + + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + def __init__(self, in_channels, out_channels, config, D=4, **kwargs): + super().__init__(in_channels, out_channels, config, D, **kwargs) + + +class STRes16UNet14(STRes16UNetBase, Res16UNet14): + pass + + +class STRes16UNet14A(STRes16UNetBase, Res16UNet14A): + pass + + +class STRes16UNet18(STRes16UNetBase, Res16UNet18): + pass + + +class STRes16UNet34(STRes16UNetBase, Res16UNet34): + pass + + +class STRes16UNet34C(STRes16UNetBase, Res16UNet34C): + pass + + +class STRes16UNet50(STRes16UNetBase, Res16UNet50): + pass + + +class STRes16UNet101(STRes16UNetBase, Res16UNet101): + pass + + +class STRes16UNet18A(STRes16UNet18): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class STResTesseract16UNetBase(STRes16UNetBase): + pass + #CONV_TYPE = ConvType.HYPERCUBE + + +class STResTesseract16UNet18A(STRes16UNet18A, STResTesseract16UNetBase): + pass diff --git a/models/resnet.py b/models/resnet.py new file mode 100644 index 0000000..5208c1f --- /dev/null +++ b/models/resnet.py @@ -0,0 +1,240 @@ +import torch.nn as nn +import MinkowskiEngine as ME + +from models.model import Model +from models.modules.common import ConvType, NormType, conv, get_norm, sum_pool +from models.modules.resnet_block import BasicBlock, Bottleneck + + +class ResNetBase(Model): + BLOCK = None + LAYERS = () + INIT_DIM = 64 + PLANES = (64, 128, 256, 512) + OUT_PIXEL_DIST = 32 + HAS_LAST_BLOCK = False + CONV_TYPE = ConvType.HYPERCUBE + + def __init__(self, in_channels, out_channels, config, D=3, **kwargs): + assert self.BLOCK is not None + assert self.OUT_PIXEL_DIST > 0 + + super().__init__(in_channels, out_channels, config, D, **kwargs) + + self.network_initialization(in_channels, out_channels, config, D) + self.weight_initialization() + + def network_initialization(self, in_channels, out_channels, config, D): + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + dilations = config.dilations + bn_momentum = config.bn_momentum + self.inplanes = self.INIT_DIM + self.conv1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + D=D, + ) + + self.bn1 = get_norm( + NormType.BATCH_NORM, self.inplanes, D=self.D, bn_momentum=bn_momentum + ) + self.relu = ME.MinkowskiReLU(inplace=True) + self.pool = sum_pool( + kernel_size=space_n_time_m(2, 1), stride=space_n_time_m(2, 1), D=D + ) + + self.layer1 
= self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[0], 1), + ) + self.layer2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[1], 1), + ) + self.layer3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[2], 1), + ) + self.layer4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[3], 1), + ) + + self.final = conv( + self.PLANES[3] * self.BLOCK.expansion, + out_channels, + kernel_size=1, + bias=True, + D=D, + ) + + def weight_initialization(self): + for m in self.modules(): + if isinstance(m, ME.MinkowskiBatchNorm): + nn.init.constant_(m.bn.weight, 1) + nn.init.constant_(m.bn.bias, 0) + + def _make_layer( + self, + block, + planes, + blocks, + stride=1, + dilation=1, + norm_type=NormType.BATCH_NORM, + bn_momentum=0.1, + ): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + D=self.D, + ), + get_norm( + norm_type, + planes * block.expansion, + D=self.D, + bn_momentum=bn_momentum, + ), + ) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride=stride, + dilation=dilation, + downsample=downsample, + conv_type=self.CONV_TYPE, + D=self.D, + ) + ) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + stride=1, + dilation=dilation, + conv_type=self.CONV_TYPE, + D=self.D, + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.pool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.final(x) + return x + + +class ResNet14(ResNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1) + + +class ResNet18(ResNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2) + + +class ResNet34(ResNetBase): + BLOCK = BasicBlock + LAYERS = (3, 4, 6, 3) + + +class ResNet50(ResNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 6, 3) + + +class ResNet101(ResNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 23, 3) + + +class STResNetBase(ResNetBase): + + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + def __init__(self, in_channels, out_channels, config, D=4, **kwargs): + super().__init__(in_channels, out_channels, config, D, **kwargs) + + +class STResNet14(STResNetBase, ResNet14): + pass + + +class STResNet18(STResNetBase, ResNet18): + pass + + +class STResNet34(STResNetBase, ResNet34): + pass + + +class STResNet50(STResNetBase, ResNet50): + pass + + +class STResNet101(STResNetBase, ResNet101): + pass + + +class STResTesseractNetBase(STResNetBase): + CONV_TYPE = ConvType.HYPERCUBE + + +class STResTesseractNet14(STResTesseractNetBase, STResNet14): + pass + + +class STResTesseractNet18(STResTesseractNetBase, STResNet18): + pass + + +class STResTesseractNet34(STResTesseractNetBase, STResNet34): + pass + + +class STResTesseractNet50(STResTesseractNetBase, STResNet50): + pass + + +class STResTesseractNet101(STResTesseractNetBase, STResNet101): + pass diff --git a/models/resunet.py b/models/resunet.py new file mode 100644 index 0000000..a4b21d9 --- /dev/null +++ b/models/resunet.py @@ -0,0 +1,593 
@@ +import torch.nn as nn +import MinkowskiEngine as ME +import MinkowskiEngine.MinkowskiOps as me +from MinkowskiEngine import MinkowskiReLU + +from models.resnet import ResNetBase, get_norm +from models.modules.common import ConvType, NormType, conv, conv_tr +from models.modules.resnet_block import BasicBlock, Bottleneck, BasicBlockINBN + + +class MinkUNetBase(ResNetBase): + BLOCK = None + PLANES = (64, 128, 256, 512, 256, 128, 128) + DILATIONS = (1, 1, 1, 1, 1, 1) + LAYERS = (2, 2, 2, 2, 2, 2) + INIT_DIM = 64 + OUT_PIXEL_DIST = 1 + NORM_TYPE = NormType.BATCH_NORM + NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + # To use the model, must call initialize_coords before forward pass. + # Once data is processed, call clear to reset the model before calling initialize_coords + def __init__(self, in_channels, out_channels, config, D=3, **kwargs): + super().__init__(in_channels, out_channels, config, D) + + def network_initialization(self, in_channels, out_channels, config, D): + # Setup net_metadata + dilations = self.DILATIONS + bn_momentum = config.bn_momentum + + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + # Output of the first conv concated to conv6 + self.inplanes = self.INIT_DIM + self.conv1p1s1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + + self.bn1 = get_norm(self.NORM_TYPE, self.PLANES[0], D, bn_momentum=bn_momentum) + self.block1 = self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + dilation=dilations[0], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv2p1s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn2 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + dilation=dilations[1], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv3p2s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn3 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + dilation=dilations[2], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv4p4s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn4 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + dilation=dilations[3], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr4p8s2 = conv_tr( + self.inplanes, + self.PLANES[4], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr4 = get_norm( + self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion + self.block5 = self._make_layer( + 
self.BLOCK, + self.PLANES[4], + self.LAYERS[4], + dilation=dilations[4], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr5p4s2 = conv_tr( + self.inplanes, + self.PLANES[5], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr5 = get_norm( + self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion + self.block6 = self._make_layer( + self.BLOCK, + self.PLANES[5], + self.LAYERS[5], + dilation=dilations[5], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr6p2s2 = conv_tr( + self.inplanes, + self.PLANES[6], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr6 = get_norm( + self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum + ) + self.relu = MinkowskiReLU(inplace=True) + + self.final = nn.Sequential( + conv( + self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion, + 512, + kernel_size=1, + stride=1, + dilation=1, + bias=False, + D=D, + ), + ME.MinkowskiBatchNorm(512), + ME.MinkowskiReLU(), + conv( + 512, out_channels, kernel_size=1, stride=1, dilation=1, bias=True, D=D + ), + ) + + def forward(self, x): + out = self.conv1p1s1(x) + out = self.bn1(out) + out = self.relu(out) + + out_b1p1 = self.block1(out) + + out = self.conv2p1s2(out_b1p1) + out = self.bn2(out) + out = self.relu(out) + + out_b2p2 = self.block2(out) + + out = self.conv3p2s2(out_b2p2) + out = self.bn3(out) + out = self.relu(out) + + out_b3p4 = self.block3(out) + + out = self.conv4p4s2(out_b3p4) + out = self.bn4(out) + out = self.relu(out) + + # pixel_dist=8 + out = self.block4(out) + + out = self.convtr4p8s2(out) + out = self.bntr4(out) + out = self.relu(out) + + out = me.cat(out, out_b3p4) + out = self.block5(out) + + out = self.convtr5p4s2(out) + out = self.bntr5(out) + out = self.relu(out) + + out = me.cat(out, out_b2p2) + out = self.block6(out) + + out = self.convtr6p2s2(out) + out = self.bntr6(out) + out = self.relu(out) + + out = me.cat(out, out_b1p1) + return self.final(out) + + +class ResUNet14(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1, 1, 1) + + +class ResUNet18(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2, 2, 2) + + +class ResUNet18INBN(ResUNet18): + NORM_TYPE = NormType.INSTANCE_BATCH_NORM + BLOCK = BasicBlockINBN + + +class ResUNet34(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (3, 4, 6, 3, 2, 2) + + +class ResUNet50(MinkUNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 6, 3, 2, 2) + + +class ResUNet101(MinkUNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 23, 3, 2, 2) + + +class ResUNet14D(ResUNet14): + PLANES = (64, 128, 256, 512, 512, 512, 512) + + +class ResUNet18D(ResUNet18): + PLANES = (64, 128, 256, 512, 512, 512, 512) + + +class ResUNet34D(ResUNet34): + PLANES = (64, 128, 256, 512, 512, 512, 512) + + +class ResUNet34E(ResUNet34): + INIT_DIM = 32 + PLANES = (32, 64, 128, 256, 128, 64, 64) + + +class ResUNet34F(ResUNet34): + INIT_DIM = 32 + PLANES = (32, 64, 128, 256, 128, 64, 32) + + +class MinkUNetHyper(MinkUNetBase): + BLOCK = None + PLANES = (64, 128, 256, 512, 256, 128, 128) + DILATIONS = (1, 1, 1, 1, 1, 1) + LAYERS = (2, 2, 2, 2, 2, 2) + INIT_DIM = 64 + OUT_PIXEL_DIST = 1 + NORM_TYPE = NormType.BATCH_NORM + NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + # 
To use the model, must call initialize_coords before forward pass. + # Once data is processed, call clear to reset the model before calling initialize_coords + def __init__(self, in_channels, out_channels, config, D=3, **kwargs): + super(MinkUNetBase, self).__init__(in_channels, out_channels, config, D) + + def network_initialization(self, in_channels, out_channels, config, D): + # Setup net_metadata + dilations = self.DILATIONS + bn_momentum = config.bn_momentum + + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + # Output of the first conv concated to conv6 + self.inplanes = self.INIT_DIM + self.conv1p1s1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + + self.bn1 = get_norm(self.NORM_TYPE, self.PLANES[0], D, bn_momentum=bn_momentum) + self.block1 = self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + dilation=dilations[0], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv2p1s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn2 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + dilation=dilations[1], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv3p2s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn3 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + dilation=dilations[2], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv4p4s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn4 = get_norm(self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum) + self.block4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + dilation=dilations[3], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.pool_tr4 = ME.MinkowskiPoolingTranspose( + kernel_size=8, stride=8, dimension=D + ) + _ = self.inplanes + self.convtr4p8s2 = conv_tr( + self.inplanes, + self.PLANES[4], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr4 = get_norm( + self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion + self.block5 = self._make_layer( + self.BLOCK, + self.PLANES[4], + self.LAYERS[4], + dilation=dilations[4], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.pool_tr5 = ME.MinkowskiPoolingTranspose( + kernel_size=4, stride=4, dimension=D + ) + out_pool5 = self.inplanes + self.convtr5p4s2 = conv_tr( + self.inplanes, + self.PLANES[5], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr5 = get_norm( + self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum + ) + + 
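+        # Channel bookkeeping for the next decoder stage: the transposed conv
+        # output is concatenated with the stride-2 encoder features (block2),
+        # so block6 sees PLANES[5] + PLANES[1] * BLOCK.expansion input channels.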
self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion + self.block6 = self._make_layer( + self.BLOCK, + self.PLANES[5], + self.LAYERS[5], + dilation=dilations[5], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.pool_tr6 = ME.MinkowskiPoolingTranspose( + kernel_size=2, stride=2, dimension=D + ) + out_pool6 = self.inplanes + self.convtr6p2s2 = conv_tr( + self.inplanes, + self.PLANES[6], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr6 = get_norm( + self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum + ) + + self.relu = MinkowskiReLU(inplace=True) + + self.final = nn.Sequential( + conv( + out_pool5 + + out_pool6 + + self.PLANES[6] + + self.PLANES[0] * self.BLOCK.expansion, + 512, + kernel_size=1, + bias=False, + D=D, + ), + ME.MinkowskiBatchNorm(512), + ME.MinkowskiReLU(), + conv(512, out_channels, kernel_size=1, bias=True, D=D), + ) + + def forward(self, x): + out = self.conv1p1s1(x) + out = self.bn1(out) + out = self.relu(out) + + out_b1p1 = self.block1(out) + + out = self.conv2p1s2(out_b1p1) + out = self.bn2(out) + out = self.relu(out) + + out_b2p2 = self.block2(out) + + out = self.conv3p2s2(out_b2p2) + out = self.bn3(out) + out = self.relu(out) + + out_b3p4 = self.block3(out) + + out = self.conv4p4s2(out_b3p4) + out = self.bn4(out) + out = self.relu(out) + + # pixel_dist=8 + out = self.block4(out) + + out = self.convtr4p8s2(out) + out = self.bntr4(out) + out = self.relu(out) + + out = me.cat(out, out_b3p4) + out = self.block5(out) + out_5 = self.pool_tr5(out) + + out = self.convtr5p4s2(out) + out = self.bntr5(out) + out = self.relu(out) + + out = me.cat(out, out_b2p2) + out = self.block6(out) + out_6 = self.pool_tr6(out) + + out = self.convtr6p2s2(out) + out = self.bntr6(out) + out = self.relu(out) + + out = me.cat(out, out_b1p1, out_6, out_5) + return self.final(out) + + +class MinkUNetHyper14INBN(MinkUNetHyper): + NORM_TYPE = NormType.INSTANCE_BATCH_NORM + BLOCK = BasicBlockINBN + + +class STMinkUNetBase(MinkUNetBase): + + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + def __init__(self, in_channels, out_channels, config, D=4, **kwargs): + super().__init__(in_channels, out_channels, config, D, **kwargs) + + +class STResUNet14(STMinkUNetBase, ResUNet14): + pass + + +class STResUNet18(STMinkUNetBase, ResUNet18): + pass + + +class STResUNet34(STMinkUNetBase, ResUNet34): + pass + + +class STResUNet50(STMinkUNetBase, ResUNet50): + pass + + +class STResUNet101(STMinkUNetBase, ResUNet101): + pass + + +class STResTesseractUNetBase(STMinkUNetBase): + CONV_TYPE = ConvType.HYPERCUBE + + +class STResTesseractUNet14(STResTesseractUNetBase, ResUNet14): + pass + + +class STResTesseractUNet18(STResTesseractUNetBase, ResUNet18): + pass + + +class STResTesseractUNet34(STResTesseractUNetBase, ResUNet34): + pass + + +class STResTesseractUNet50(STResTesseractUNetBase, ResUNet50): + pass + + +class STResTesseractUNet101(STResTesseractUNetBase, ResUNet101): + pass diff --git a/scripts/preprocess_kitti.sh b/scripts/preprocess_kitti.sh new file mode 100755 index 0000000..97927b9 --- /dev/null +++ b/scripts/preprocess_kitti.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +srun python -m datasets.preprocessing.semantic_kitti_preprocessing preprocess \ +--data_dir="/globalwork/data/SemanticKITTI/dataset" \ +--save_dir="/globalwork/yilmaz/data/processed/semantic_kitti" + +srun python -m datasets.preprocessing.semantic_kitti_preprocessing 
make_instance_database \ +--data_dir="/globalwork/data/SemanticKITTI/dataset" \ +--save_dir="/globalwork/yilmaz/data/processed/semantic_kitti" \ No newline at end of file diff --git a/scripts/test.sh b/scripts/test.sh new file mode 100755 index 0000000..394f745 --- /dev/null +++ b/scripts/test.sh @@ -0,0 +1,10 @@ +#!/bin/bash +export OMP_NUM_THREADS=12 # speeds up MinkowskiEngine +export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 + +python main_panoptic.py \ +general.project_name="kitti_semantic" \ +general.mode="test" \ +general.instance_population=0 \ +general.ckpt_path='/home/yilmaz/fix/Mask4D/saved/2023-09-30_153832/last-epoch.ckpt' \ \ No newline at end of file diff --git a/scripts/train.sh b/scripts/train.sh new file mode 100755 index 0000000..5d18fac --- /dev/null +++ b/scripts/train.sh @@ -0,0 +1,8 @@ +#!/bin/bash +export OMP_NUM_THREADS=12 # speeds up MinkowskiEngine +export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 + +# TRAIN +python main_panoptic.py \ +general.project_name="kitti_semantic" \ \ No newline at end of file diff --git a/scripts/val.sh b/scripts/val.sh new file mode 100755 index 0000000..7bbdd0d --- /dev/null +++ b/scripts/val.sh @@ -0,0 +1,10 @@ +#!/bin/bash +export OMP_NUM_THREADS=12 # speeds up MinkowskiEngine +export CUDA_LAUNCH_BLOCKING=1 +export HYDRA_FULL_ERROR=1 + +python main_panoptic.py \ +general.project_name="kitti_semantic" \ +general.mode="validate" \ +general.instance_population=0 \ +general.ckpt_path='/home/yilmaz/fix/Mask4D/saved/2023-09-30_153832/last-epoch.ckpt' \ \ No newline at end of file diff --git a/third_party/pointnet2/_ext_src/include/ball_query.h b/third_party/pointnet2/_ext_src/include/ball_query.h new file mode 100644 index 0000000..b4feff8 --- /dev/null +++ b/third_party/pointnet2/_ext_src/include/ball_query.h @@ -0,0 +1,7 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#pragma once +#include + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample); diff --git a/third_party/pointnet2/_ext_src/include/cuda_utils.h b/third_party/pointnet2/_ext_src/include/cuda_utils.h new file mode 100644 index 0000000..f746526 --- /dev/null +++ b/third_party/pointnet2/_ext_src/include/cuda_utils.h @@ -0,0 +1,43 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#ifndef _CUDA_UTILS_H +#define _CUDA_UTILS_H + +#include +#include +#include + +#include +#include + +#include + +#define TOTAL_THREADS 512 + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + + return max(min(1 << pow_2, TOTAL_THREADS), 1); +} + +inline dim3 opt_block_config(int x, int y) { + const int x_threads = opt_n_threads(x); + const int y_threads = + max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); + dim3 block_config(x_threads, y_threads, 1); + + return block_config; +} + +#define CUDA_CHECK_ERRORS() \ + do { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ + cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \ + __FILE__); \ + exit(-1); \ + } \ + } while (0) + +#endif diff --git a/third_party/pointnet2/_ext_src/include/group_points.h b/third_party/pointnet2/_ext_src/include/group_points.h new file mode 100644 index 0000000..97be802 --- /dev/null +++ b/third_party/pointnet2/_ext_src/include/group_points.h @@ -0,0 +1,8 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ + +#pragma once +#include + +at::Tensor group_points(at::Tensor points, at::Tensor idx); +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); diff --git a/third_party/pointnet2/_ext_src/include/interpolate.h b/third_party/pointnet2/_ext_src/include/interpolate.h new file mode 100644 index 0000000..e7fb792 --- /dev/null +++ b/third_party/pointnet2/_ext_src/include/interpolate.h @@ -0,0 +1,12 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#pragma once + +#include +#include + +std::vector three_nn(at::Tensor unknowns, at::Tensor knows); +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight); +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m); diff --git a/third_party/pointnet2/_ext_src/include/sampling.h b/third_party/pointnet2/_ext_src/include/sampling.h new file mode 100644 index 0000000..7de473e --- /dev/null +++ b/third_party/pointnet2/_ext_src/include/sampling.h @@ -0,0 +1,9 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#pragma once +#include + +at::Tensor gather_points(at::Tensor points, at::Tensor idx); +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); +at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); diff --git a/third_party/pointnet2/_ext_src/include/utils.h b/third_party/pointnet2/_ext_src/include/utils.h new file mode 100644 index 0000000..815dabb --- /dev/null +++ b/third_party/pointnet2/_ext_src/include/utils.h @@ -0,0 +1,28 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#pragma once +#include +#include + +#define CHECK_CUDA(x) \ + do { \ + AT_ASSERT(x.is_cuda(), #x " must be a CUDA tensor"); \ + } while (0) + +#define CHECK_CONTIGUOUS(x) \ + do { \ + AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \ + } while (0) + +#define CHECK_IS_INT(x) \ + do { \ + AT_ASSERT(x.scalar_type() == at::ScalarType::Int, \ + #x " must be an int tensor"); \ + } while (0) + +#define CHECK_IS_FLOAT(x) \ + do { \ + AT_ASSERT(x.scalar_type() == at::ScalarType::Float, \ + #x " must be a float tensor"); \ + } while (0) diff --git a/third_party/pointnet2/_ext_src/src/ball_query.cpp b/third_party/pointnet2/_ext_src/src/ball_query.cpp new file mode 100644 index 0000000..7dd77d5 --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/ball_query.cpp @@ -0,0 +1,35 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
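+// CUDA-only wrapper: ball_query(new_xyz, xyz, radius, nsample) returns an int
+// tensor of shape (B, npoint, nsample) holding, for every query center in
+// new_xyz, the indices of up to nsample points of xyz that fall within radius.
+// The CPU branch is not implemented and asserts.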
+ + +#include "ball_query.h" +#include "utils.h" + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx); + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample) { + CHECK_CONTIGUOUS(new_xyz); + CHECK_CONTIGUOUS(xyz); + CHECK_IS_FLOAT(new_xyz); + CHECK_IS_FLOAT(xyz); + + if (new_xyz.is_cuda()) { + CHECK_CUDA(xyz); + } + + at::Tensor idx = + torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, + at::device(new_xyz.device()).dtype(at::ScalarType::Int)); + + if (new_xyz.is_cuda()) { + query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), + radius, nsample, new_xyz.data(), + xyz.data(), idx.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return idx; +} diff --git a/third_party/pointnet2/_ext_src/src/ball_query_gpu.cu b/third_party/pointnet2/_ext_src/src/ball_query_gpu.cu new file mode 100644 index 0000000..cee88cb --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/ball_query_gpu.cu @@ -0,0 +1,57 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#include +#include +#include + +#include "cuda_utils.h" + +// input: new_xyz(b, m, 3) xyz(b, n, 3) +// output: idx(b, m, nsample) +__global__ void query_ball_point_kernel(int b, int n, int m, float radius, + int nsample, + const float *__restrict__ new_xyz, + const float *__restrict__ xyz, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + xyz += batch_index * n * 3; + new_xyz += batch_index * m * 3; + idx += m * nsample * batch_index; + + int index = threadIdx.x; + int stride = blockDim.x; + + float radius2 = radius * radius; + for (int j = index; j < m; j += stride) { + float new_x = new_xyz[j * 3 + 0]; + float new_y = new_xyz[j * 3 + 1]; + float new_z = new_xyz[j * 3 + 2]; + for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) { + float x = xyz[k * 3 + 0]; + float y = xyz[k * 3 + 1]; + float z = xyz[k * 3 + 2]; + float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < radius2) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[j * nsample + l] = k; + } + } + idx[j * nsample + cnt] = k; + ++cnt; + } + } + } +} + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + query_ball_point_kernel<<>>( + b, n, m, radius, nsample, new_xyz, xyz, idx); + + CUDA_CHECK_ERRORS(); +} diff --git a/third_party/pointnet2/_ext_src/src/bindings.cpp b/third_party/pointnet2/_ext_src/src/bindings.cpp new file mode 100644 index 0000000..58d6c2d --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/bindings.cpp @@ -0,0 +1,22 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ + +#include "ball_query.h" +#include "group_points.h" +#include "interpolate.h" +#include "sampling.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("gather_points", &gather_points); + m.def("gather_points_grad", &gather_points_grad); + m.def("furthest_point_sampling", &furthest_point_sampling); + + m.def("three_nn", &three_nn); + m.def("three_interpolate", &three_interpolate); + m.def("three_interpolate_grad", &three_interpolate_grad); + + m.def("ball_query", &ball_query); + + m.def("group_points", &group_points); + m.def("group_points_grad", &group_points_grad); +} diff --git a/third_party/pointnet2/_ext_src/src/group_points.cpp b/third_party/pointnet2/_ext_src/src/group_points.cpp new file mode 100644 index 0000000..22998dd --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/group_points.cpp @@ -0,0 +1,63 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#include "group_points.h" +#include "utils.h" + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out); + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points); + +at::Tensor group_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.is_cuda()) { + group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), idx.size(2), points.data(), + idx.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} + +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.is_cuda()) { + group_points_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2), + grad_out.data(), idx.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} diff --git a/third_party/pointnet2/_ext_src/src/group_points_gpu.cu b/third_party/pointnet2/_ext_src/src/group_points_gpu.cu new file mode 100644 index 0000000..e36672e --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/group_points_gpu.cu @@ -0,0 +1,78 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ + +#include +#include + +#include "cuda_utils.h" + +// input: points(b, c, n) idx(b, npoints, nsample) +// output: out(b, c, npoints, nsample) +__global__ void group_points_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ points, + const int *__restrict__ idx, + float *__restrict__ out) { + int batch_index = blockIdx.x; + points += batch_index * n * c; + idx += batch_index * npoints * nsample; + out += batch_index * npoints * nsample * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + out[(l * npoints + j) * nsample + k] = points[l * n + ii]; + } + } +} + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_kernel<<>>( + b, c, n, npoints, nsample, points, idx, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample) +// output: grad_points(b, c, n) +__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ grad_out, + const int *__restrict__ idx, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * npoints * nsample * c; + idx += batch_index * npoints * nsample; + grad_points += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + atomicAdd(grad_points + l * n + ii, + grad_out[(l * npoints + j) * nsample + k]); + } + } +} + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_grad_kernel<<>>( + b, c, n, npoints, nsample, grad_out, idx, grad_points); + + CUDA_CHECK_ERRORS(); +} diff --git a/third_party/pointnet2/_ext_src/src/interpolate.cpp b/third_party/pointnet2/_ext_src/src/interpolate.cpp new file mode 100644 index 0000000..4b680c5 --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/interpolate.cpp @@ -0,0 +1,101 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ +#include "interpolate.h" +#include "utils.h" + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx); +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out); +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points); + +std::vector three_nn(at::Tensor unknowns, at::Tensor knows) { + CHECK_CONTIGUOUS(unknowns); + CHECK_CONTIGUOUS(knows); + CHECK_IS_FLOAT(unknowns); + CHECK_IS_FLOAT(knows); + + if (unknowns.is_cuda()) { + CHECK_CUDA(knows); + } + + at::Tensor idx = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Int)); + at::Tensor dist2 = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Float)); + + if (unknowns.is_cuda()) { + three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), + unknowns.data(), knows.data(), + dist2.data(), idx.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return {dist2, idx}; +} + +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (points.is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.is_cuda()) { + three_interpolate_kernel_wrapper( + points.size(0), points.size(1), points.size(2), idx.size(1), + points.data(), idx.data(), weight.data(), + output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (grad_out.is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), m}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.is_cuda()) { + three_interpolate_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), grad_out.size(2), m, + grad_out.data(), idx.data(), weight.data(), + output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} diff --git a/third_party/pointnet2/_ext_src/src/interpolate_gpu.cu b/third_party/pointnet2/_ext_src/src/interpolate_gpu.cu new file mode 100644 index 0000000..b4c5644 --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/interpolate_gpu.cu @@ -0,0 +1,157 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ + +#include +#include +#include + +#include "cuda_utils.h" + +// input: unknown(b, n, 3) known(b, m, 3) +// output: dist2(b, n, 3), idx(b, n, 3) +__global__ void three_nn_kernel(int b, int n, int m, + const float *__restrict__ unknown, + const float *__restrict__ known, + float *__restrict__ dist2, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + unknown += batch_index * n * 3; + known += batch_index * m * 3; + dist2 += batch_index * n * 3; + idx += batch_index * n * 3; + + int index = threadIdx.x; + int stride = blockDim.x; + for (int j = index; j < n; j += stride) { + float ux = unknown[j * 3 + 0]; + float uy = unknown[j * 3 + 1]; + float uz = unknown[j * 3 + 2]; + + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + float x = known[k * 3 + 0]; + float y = known[k * 3 + 1]; + float z = known[k * 3 + 2]; + float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } + } + dist2[j * 3 + 0] = best1; + dist2[j * 3 + 1] = best2; + dist2[j * 3 + 2] = best3; + + idx[j * 3 + 0] = besti1; + idx[j * 3 + 1] = besti2; + idx[j * 3 + 2] = besti3; + } +} + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_nn_kernel<<>>(b, n, m, unknown, known, + dist2, idx); + + CUDA_CHECK_ERRORS(); +} + +// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3) +// output: out(b, c, n) +__global__ void three_interpolate_kernel(int b, int c, int m, int n, + const float *__restrict__ points, + const int *__restrict__ idx, + const float *__restrict__ weight, + float *__restrict__ out) { + int batch_index = blockIdx.x; + points += batch_index * m * c; + + idx += batch_index * n * 3; + weight += batch_index * n * 3; + + out += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + + points[l * m + i3] * w3; + } +} + +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_kernel<<>>( + b, c, m, n, points, idx, weight, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3) +// output: grad_points(b, c, m) + +__global__ void three_interpolate_grad_kernel( + int b, int c, int n, int m, const float *__restrict__ grad_out, + const int *__restrict__ idx, const float *__restrict__ weight, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * n * c; + idx += batch_index * n * 3; + weight += batch_index * n * 3; + grad_points += batch_index * m * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += 
stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + atomicAdd(grad_points + l * m + i1, grad_out[i] * w1); + atomicAdd(grad_points + l * m + i2, grad_out[i] * w2); + atomicAdd(grad_points + l * m + i3, grad_out[i] * w3); + } +} + +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_grad_kernel<<>>( + b, c, n, m, grad_out, idx, weight, grad_points); + + CUDA_CHECK_ERRORS(); +} diff --git a/third_party/pointnet2/_ext_src/src/sampling.cpp b/third_party/pointnet2/_ext_src/src/sampling.cpp new file mode 100644 index 0000000..de55822 --- /dev/null +++ b/third_party/pointnet2/_ext_src/src/sampling.cpp @@ -0,0 +1,88 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#include "sampling.h" +#include "utils.h" + +void gather_points_kernel_wrapper(int b, int c, int n, int npoints, + const float *points, const int *idx, + float *out); +void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, + float *grad_points); + +void furthest_point_sampling_kernel_wrapper(int b, int n, int m, + const float *dataset, float *temp, + int *idxs); + +at::Tensor gather_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.is_cuda()) { + gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), points.data(), + idx.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} + +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, + const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.is_cuda()) { + gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n, + idx.size(1), grad_out.data(), + idx.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} +at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) { + CHECK_CONTIGUOUS(points); + CHECK_IS_FLOAT(points); + + at::Tensor output = + torch::zeros({points.size(0), nsamples}, + at::device(points.device()).dtype(at::ScalarType::Int)); + + at::Tensor tmp = + torch::full({points.size(0), points.size(1)}, 1e10, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.is_cuda()) { + furthest_point_sampling_kernel_wrapper( + points.size(0), points.size(1), nsamples, points.data(), + tmp.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} diff --git a/third_party/pointnet2/_ext_src/src/sampling_gpu.cu b/third_party/pointnet2/_ext_src/src/sampling_gpu.cu new file mode 100644 index 0000000..d2b3707 --- /dev/null +++ 
b/third_party/pointnet2/_ext_src/src/sampling_gpu.cu @@ -0,0 +1,232 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#include +#include + +#include "cuda_utils.h" + +// input: points(b, c, n) idx(b, m) +// output: out(b, c, m) +__global__ void gather_points_kernel(int b, int c, int n, int m, + const float *__restrict__ points, + const int *__restrict__ idx, + float *__restrict__ out) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int l = blockIdx.y; l < c; l += gridDim.y) { + for (int j = threadIdx.x; j < m; j += blockDim.x) { + int a = idx[i * m + j]; + out[(i * c + l) * m + j] = points[(i * c + l) * n + a]; + } + } + } +} + +void gather_points_kernel_wrapper(int b, int c, int n, int npoints, + const float *points, const int *idx, + float *out) { + gather_points_kernel<<>>(b, c, n, npoints, + points, idx, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, m) idx(b, m) +// output: grad_points(b, c, n) +__global__ void gather_points_grad_kernel(int b, int c, int n, int m, + const float *__restrict__ grad_out, + const int *__restrict__ idx, + float *__restrict__ grad_points) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int l = blockIdx.y; l < c; l += gridDim.y) { + for (int j = threadIdx.x; j < m; j += blockDim.x) { + int a = idx[i * m + j]; + atomicAdd(grad_points + (i * c + l) * n + a, + grad_out[(i * c + l) * m + j]); + } + } + } +} + +void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, + float *grad_points) { + gather_points_grad_kernel<<>>( + b, c, n, npoints, grad_out, idx, grad_points); + + CUDA_CHECK_ERRORS(); +} + +__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, + int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? i2 : i1; +} + +// Input dataset: (b, n, 3), tmp: (b, n) +// Ouput idxs (b, m) +template +__global__ void furthest_point_sampling_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * 3; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + float x1 = dataset[old * 3 + 0]; + float y1 = dataset[old * 3 + 1]; + float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + float x2, y2, z2; + x2 = dataset[k * 3 + 0]; + y2 = dataset[k * 3 + 1]; + z2 = dataset[k * 3 + 2]; + float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); + if (mag <= 1e-3) continue; + + float d = + (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? 
d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + + if (block_size >= 512) { + if (tid < 256) { + __update(dists, dists_i, tid, tid + 256); + } + __syncthreads(); + } + if (block_size >= 256) { + if (tid < 128) { + __update(dists, dists_i, tid, tid + 128); + } + __syncthreads(); + } + if (block_size >= 128) { + if (tid < 64) { + __update(dists, dists_i, tid, tid + 64); + } + __syncthreads(); + } + if (block_size >= 64) { + if (tid < 32) { + __update(dists, dists_i, tid, tid + 32); + } + __syncthreads(); + } + if (block_size >= 32) { + if (tid < 16) { + __update(dists, dists_i, tid, tid + 16); + } + __syncthreads(); + } + if (block_size >= 16) { + if (tid < 8) { + __update(dists, dists_i, tid, tid + 8); + } + __syncthreads(); + } + if (block_size >= 8) { + if (tid < 4) { + __update(dists, dists_i, tid, tid + 4); + } + __syncthreads(); + } + if (block_size >= 4) { + if (tid < 2) { + __update(dists, dists_i, tid, tid + 2); + } + __syncthreads(); + } + if (block_size >= 2) { + if (tid < 1) { + __update(dists, dists_i, tid, tid + 1); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +void furthest_point_sampling_kernel_wrapper(int b, int n, int m, + const float *dataset, float *temp, + int *idxs) { + unsigned int n_threads = opt_n_threads(n); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (n_threads) { + case 512: + furthest_point_sampling_kernel<512> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 256: + furthest_point_sampling_kernel<256> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 128: + furthest_point_sampling_kernel<128> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 64: + furthest_point_sampling_kernel<64> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 32: + furthest_point_sampling_kernel<32> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 16: + furthest_point_sampling_kernel<16> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 8: + furthest_point_sampling_kernel<8> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 4: + furthest_point_sampling_kernel<4> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 2: + furthest_point_sampling_kernel<2> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 1: + furthest_point_sampling_kernel<1> + <<>>(b, n, m, dataset, temp, idxs); + break; + default: + furthest_point_sampling_kernel<512> + <<>>(b, n, m, dataset, temp, idxs); + } + + CUDA_CHECK_ERRORS(); +} diff --git a/third_party/pointnet2/pointnet2_modules.py b/third_party/pointnet2/pointnet2_modules.py new file mode 100644 index 0000000..c525504 --- /dev/null +++ b/third_party/pointnet2/pointnet2_modules.py @@ -0,0 +1,514 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +''' Pointnet2 layers. +Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch +Extended with the following: +1. Uniform sampling in each local region (sample_uniformly) +2. Return sampled points indices to support votenet. 
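+
+Tensor layout used throughout this file: xyz coordinates are (B, N, 3) and
+feature tensors are channel-first (B, C, N); set-abstraction modules return the
+sampled centers together with pooled per-center features.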
+''' +import torch +import torch.nn as nn +import torch.nn.functional as F + +import os +import sys +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) + +import pointnet2_utils +import pytorch_utils as pt_utils +from typing import List + + +class _PointnetSAModuleBase(nn.Module): + + def __init__(self): + super().__init__() + self.npoint = None + self.groupers = None + self.mlps = None + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, N, C) tensor of the descriptors of the the features + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors + """ + + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, + pointnet2_utils.furthest_point_sample(xyz, self.npoint) + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], npoint, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1) + + +class PointnetSAModuleMSG(_PointnetSAModuleBase): + r"""Pointnet set abstrction layer with multiscale grouping + + Parameters + ---------- + npoint : int + Number of features + radii : list of float32 + list of radii to group with + nsamples : list of int32 + Number of samples in each ball query + mlps : list of list of int32 + Spec of the pointnet before the global max_pool for each scale + bn : bool + Use batchnorm + """ + + def __init__( + self, + *, + npoint: int, + radii: List[float], + nsamples: List[int], + mlps: List[List[int]], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert len(radii) == len(nsamples) == len(mlps) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) + if npoint is not None else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + +class PointnetSAModule(PointnetSAModuleMSG): + r"""Pointnet set abstrction layer + + Parameters + ---------- + npoint : int + Number of features + radius : float + Radius of ball + nsample : int + Number of samples in the ball query + mlp : list + Spec of the pointnet before the global max_pool + bn : bool + Use batchnorm + """ + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True + ): + super().__init__( + mlps=[mlp], + npoint=npoint, + radii=[radius], + nsamples=[nsample], + bn=bn, + use_xyz=use_xyz + ) + + +class PointnetSAModuleVotes(nn.Module): + ''' Modified based on 
_PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes ''' + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True, + pooling: str = 'max', + sigma: float = None, # for RBF pooling + normalize_xyz: bool = False, # noramlize local XYZ with radius + sample_uniformly: bool = False, + ret_unique_cnt: bool = False + ): + super().__init__() + self.npoint = npoint + self.radius = radius + self.nsample = nsample + self.pooling = pooling + self.mlp_module = None + self.use_xyz = use_xyz + self.sigma = sigma + if self.sigma is None: + self.sigma = self.radius/2 + self.normalize_xyz = normalize_xyz + self.ret_unique_cnt = ret_unique_cnt + + if npoint is not None: + self.grouper = pointnet2_utils.QueryAndGroup(radius, nsample, + use_xyz=use_xyz, ret_grouped_xyz=True, normalize_xyz=normalize_xyz, + sample_uniformly=sample_uniformly, ret_unique_cnt=ret_unique_cnt) + else: + self.grouper = pointnet2_utils.GroupAll(use_xyz, ret_grouped_xyz=True) + + mlp_spec = mlp + if use_xyz and len(mlp_spec)>0: + mlp_spec[0] += 3 + self.mlp_module = pt_utils.SharedMLP(mlp_spec, bn=bn) + + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None, + inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, C, N) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + else: + assert(inds.shape[1] == self.npoint) + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, inds + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + if not self.ret_unique_cnt: + grouped_features, grouped_xyz = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + else: + grouped_features, grouped_xyz, unique_cnt = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample), (B,3,npoint,nsample), (B,npoint) + + new_features = self.mlp_module( + grouped_features + ) # (B, mlp[-1], npoint, nsample) + if self.pooling == 'max': + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pooling == 'avg': + new_features = F.avg_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pooling == 'rbf': + # Use radial basis function kernel for weighted sum of features (normalized by nsample and sigma) + # Ref: https://en.wikipedia.org/wiki/Radial_basis_function_kernel + rbf = torch.exp(-1 * grouped_xyz.pow(2).sum(1,keepdim=False) / (self.sigma**2) / 2) # (B, npoint, nsample) + new_features = torch.sum(new_features * rbf.unsqueeze(1), -1, keepdim=True) / float(self.nsample) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + if not self.ret_unique_cnt: + return new_xyz, new_features, inds + else: + return new_xyz, new_features, inds, unique_cnt + 
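+
+# A minimal call sketch for PointnetSAModuleVotes above; the concrete sizes are
+# illustrative only, not values used elsewhere in this repository:
+#
+#   sa = PointnetSAModuleVotes(npoint=2048, radius=0.3, nsample=32,
+#                              mlp=[1, 64, 128], use_xyz=True)
+#   new_xyz, new_features, fps_inds = sa(xyz, features)
+#   # xyz: (B, N, 3), features: (B, 1, N) -> new_xyz: (B, 2048, 3),
+#   # new_features: (B, 128, 2048), fps_inds: (B, 2048)
+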
+class PointnetSAModuleMSGVotes(nn.Module): + ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes ''' + + def __init__( + self, + *, + mlps: List[List[int]], + npoint: int, + radii: List[float], + nsamples: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert(len(mlps) == len(nsamples) == len(radii)) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) + if npoint is not None else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None, inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, C, C) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, inds + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], npoint, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1), inds + + +class PointnetFPModule(nn.Module): + r"""Propigates the features of one set to another + + Parameters + ---------- + mlp : list + Pointnet module parameters + bn : bool + Use batchnorm + """ + + def __init__(self, *, mlp: List[int], bn: bool = True): + super().__init__() + self.mlp = pt_utils.SharedMLP(mlp, bn=bn) + + def forward( + self, unknown: torch.Tensor, known: torch.Tensor, + unknow_feats: torch.Tensor, known_feats: torch.Tensor + ) -> torch.Tensor: + r""" + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of the xyz positions of the unknown features + known : torch.Tensor + (B, m, 3) tensor of the xyz positions of the known features + unknow_feats : torch.Tensor + (B, C1, n) tensor of the features to be propigated to + known_feats : torch.Tensor + (B, C2, m) tensor of features to be propigated + + Returns + ------- + new_features : torch.Tensor + (B, mlp[-1], n) tensor of the features of the unknown features + """ + + if known is not None: + dist, idx = pointnet2_utils.three_nn(unknown, known) + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = 
dist_recip / norm + + interpolated_feats = pointnet2_utils.three_interpolate( + known_feats, idx, weight + ) + else: + interpolated_feats = known_feats.expand( + *known_feats.size()[0:2], unknown.size(1) + ) + + if unknow_feats is not None: + new_features = torch.cat([interpolated_feats, unknow_feats], + dim=1) #(B, C2 + C1, n) + else: + new_features = interpolated_feats + + new_features = new_features.unsqueeze(-1) + new_features = self.mlp(new_features) + + return new_features.squeeze(-1) + +class PointnetLFPModuleMSG(nn.Module): + ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + learnable feature propagation layer.''' + + def __init__( + self, + *, + mlps: List[List[int]], + radii: List[float], + nsamples: List[int], + post_mlp: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert(len(mlps) == len(nsamples) == len(radii)) + + self.post_mlp = pt_utils.SharedMLP(post_mlp, bn=bn) + + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, + sample_uniformly=sample_uniformly) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward(self, xyz2: torch.Tensor, xyz1: torch.Tensor, + features2: torch.Tensor, features1: torch.Tensor) -> torch.Tensor: + r""" Propagate features from xyz1 to xyz2. + Parameters + ---------- + xyz2 : torch.Tensor + (B, N2, 3) tensor of the xyz coordinates of the features + xyz1 : torch.Tensor + (B, N1, 3) tensor of the xyz coordinates of the features + features2 : torch.Tensor + (B, C2, N2) tensor of the descriptors of the the features + features1 : torch.Tensor + (B, C1, N1) tensor of the descriptors of the the features + + Returns + ------- + new_features1 : torch.Tensor + (B, \sum_k(mlps[k][-1]), N1) tensor of the new_features descriptors + """ + new_features_list = [] + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz1, xyz2, features1 + ) # (B, C1, N2, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], N2, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], N2, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], N2) + + if features2 is not None: + new_features = torch.cat([new_features, features2], + dim=1) #(B, mlp[-1] + C2, N2) + + new_features = new_features.unsqueeze(-1) + new_features = self.post_mlp(new_features) + + new_features_list.append(new_features) + + return torch.cat(new_features_list, dim=1).squeeze(-1) + + +if __name__ == "__main__": + from torch.autograd import Variable + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + xyz = Variable(torch.randn(2, 9, 3).cuda(), requires_grad=True) + xyz_feats = Variable(torch.randn(2, 9, 6).cuda(), requires_grad=True) + + test_module = PointnetSAModuleMSG( + npoint=2, radii=[5.0, 10.0], nsamples=[6, 3], mlps=[[9, 3], [9, 6]] + ) + test_module.cuda() + print(test_module(xyz, xyz_feats)) + + for _ in range(1): + _, new_features = test_module(xyz, xyz_feats) + new_features.backward( + torch.cuda.FloatTensor(*new_features.size()).fill_(1) + ) + print(new_features) + print(xyz.grad) diff --git a/third_party/pointnet2/pointnet2_test.py b/third_party/pointnet2/pointnet2_test.py new file mode 100644 index 0000000..0ab26ce --- /dev/null +++ 
b/third_party/pointnet2/pointnet2_test.py @@ -0,0 +1,30 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +''' Testing customized ops. ''' + +import torch +from torch.autograd import gradcheck +import numpy as np + +import os +import sys +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) +import pointnet2_utils + +def test_interpolation_grad(): + batch_size = 1 + feat_dim = 2 + m = 4 + feats = torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda() + + def interpolate_func(inputs): + idx = torch.from_numpy(np.array([[[0,1,2],[1,2,3]]])).int().cuda() + weight = torch.from_numpy(np.array([[[1,1,1],[2,2,2]]])).float().cuda() + interpolated_feats = pointnet2_utils.three_interpolate(inputs, idx, weight) + return interpolated_feats + + assert (gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1)) + +if __name__=='__main__': + test_interpolation_grad() diff --git a/third_party/pointnet2/pointnet2_utils.py b/third_party/pointnet2/pointnet2_utils.py new file mode 100644 index 0000000..3227f21 --- /dev/null +++ b/third_party/pointnet2/pointnet2_utils.py @@ -0,0 +1,422 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +''' Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch ''' +from __future__ import ( + division, + absolute_import, + with_statement, + print_function, + unicode_literals, +) +import torch +from torch.autograd import Function +import torch.nn as nn +import third_party.pointnet2.pytorch_utils as pt_utils +import sys + +try: + import builtins +except: + import __builtin__ as builtins + +try: + import pointnet2._ext as _ext +except ImportError: + if not getattr(builtins, "__POINTNET2_SETUP__", False): + raise ImportError( + "Could not import _ext module.\n" + "Please see the setup instructions in the README: " + "https://github.com/erikwijmans/Pointnet2_PyTorch/blob/master/README.rst" + ) + +if False: + # Workaround for type hints without depending on the `typing` module + from typing import * + + +class RandomDropout(nn.Module): + def __init__(self, p=0.5, inplace=False): + super(RandomDropout, self).__init__() + self.p = p + self.inplace = inplace + + def forward(self, X): + theta = torch.Tensor(1).uniform_(0, self.p)[0] + return pt_utils.feature_dropout_no_scaling(X, theta, self.train, self.inplace) + + +class FurthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz, npoint): + # type: (Any, torch.Tensor, int) -> torch.Tensor + r""" + Uses iterative furthest point sampling to select a set of npoint features that have the largest + minimum distance + + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor where N > npoint + npoint : int32 + number of features in the sampled set + + Returns + ------- + torch.Tensor + (B, npoint) tensor containing the set + """ + fps_inds = _ext.furthest_point_sampling(xyz, npoint) + ctx.mark_non_differentiable(fps_inds) + return fps_inds + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply + + +class GatherOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor + + idx : torch.Tensor + (B, npoint) tensor of the features to gather + + Returns + ------- + torch.Tensor + (B, C, npoint) tensor + """ + + _, C, N = features.size() + + ctx.for_backwards = (idx, C, N) + + return _ext.gather_points(features, idx) + + @staticmethod + 
def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + + grad_features = _ext.gather_points_grad(grad_out.contiguous(), idx, N) + return grad_features, None + + +gather_operation = GatherOperation.apply + + +class ThreeNN(Function): + @staticmethod + def forward(ctx, unknown, known): + # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + Find the three nearest neighbors of unknown in known + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of known features + known : torch.Tensor + (B, m, 3) tensor of unknown features + + Returns + ------- + dist : torch.Tensor + (B, n, 3) l2 distance to the three nearest neighbors + idx : torch.Tensor + (B, n, 3) index of 3 nearest neighbors + """ + dist2, idx = _ext.three_nn(unknown, known) + + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None + + +three_nn = ThreeNN.apply + + +class ThreeInterpolate(Function): + @staticmethod + def forward(ctx, features, idx, weight): + # type(Any, torch.Tensor, torch.Tensor, torch.Tensor) -> Torch.Tensor + r""" + Performs weight linear interpolation on 3 features + Parameters + ---------- + features : torch.Tensor + (B, c, m) Features descriptors to be interpolated from + idx : torch.Tensor + (B, n, 3) three nearest neighbors of the target features in features + weight : torch.Tensor + (B, n, 3) weights + + Returns + ------- + torch.Tensor + (B, c, n) tensor of the interpolated features + """ + B, c, m = features.size() + n = idx.size(1) + + ctx.three_interpolate_for_backward = (idx, weight, m) + + return _ext.three_interpolate(features, idx, weight) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + r""" + Parameters + ---------- + grad_out : torch.Tensor + (B, c, n) tensor with gradients of ouputs + + Returns + ------- + grad_features : torch.Tensor + (B, c, m) tensor with gradients of features + + None + + None + """ + idx, weight, m = ctx.three_interpolate_for_backward + + grad_features = _ext.three_interpolate_grad( + grad_out.contiguous(), idx, weight, m + ) + + return grad_features, None, None + + +three_interpolate = ThreeInterpolate.apply + + +class GroupingOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor of features to group + idx : torch.Tensor + (B, npoint, nsample) tensor containing the indicies of features to group with + + Returns + ------- + torch.Tensor + (B, C, npoint, nsample) tensor + """ + B, nfeatures, nsample = idx.size() + _, C, N = features.size() + + ctx.for_backwards = (idx, N) + + return _ext.group_points(features, idx) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + + Parameters + ---------- + grad_out : torch.Tensor + (B, C, npoint, nsample) tensor of the gradients of the output from forward + + Returns + ------- + torch.Tensor + (B, C, N) gradient of the features + None + """ + idx, N = ctx.for_backwards + + grad_features = _ext.group_points_grad(grad_out.contiguous(), idx, N) + + return grad_features, None + + +grouping_operation = GroupingOperation.apply + + +class BallQuery(Function): + @staticmethod + def forward(ctx, radius, nsample, xyz, new_xyz): + # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- 
+        radius : float
+            radius of the balls
+        nsample : int
+            maximum number of features in the balls
+        xyz : torch.Tensor
+            (B, N, 3) xyz coordinates of the features
+        new_xyz : torch.Tensor
+            (B, npoint, 3) centers of the ball query
+
+        Returns
+        -------
+        torch.Tensor
+            (B, npoint, nsample) tensor with the indices of the features that form the query balls
+        """
+        inds = _ext.ball_query(new_xyz, xyz, radius, nsample)
+        ctx.mark_non_differentiable(inds)
+        return inds
+
+    @staticmethod
+    def backward(ctx, a=None):
+        return None, None, None, None
+
+
+ball_query = BallQuery.apply
+
+
+class QueryAndGroup(nn.Module):
+    r"""
+    Groups with a ball query of radius
+
+    Parameters
+    ---------
+    radius : float32
+        Radius of ball
+    nsample : int32
+        Maximum number of features to gather in the ball
+    """
+
+    def __init__(self, radius, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, sample_uniformly=False, ret_unique_cnt=False):
+        # type: (QueryAndGroup, float, int, bool, bool, bool, bool, bool) -> None
+        super(QueryAndGroup, self).__init__()
+        self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz
+        self.ret_grouped_xyz = ret_grouped_xyz
+        self.normalize_xyz = normalize_xyz
+        self.sample_uniformly = sample_uniformly
+        self.ret_unique_cnt = ret_unique_cnt
+        if self.ret_unique_cnt:
+            assert(self.sample_uniformly)
+
+    def forward(self, xyz, new_xyz, features=None):
+        # type: (QueryAndGroup, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor]
+        r"""
+        Parameters
+        ----------
+        xyz : torch.Tensor
+            xyz coordinates of the features (B, N, 3)
+        new_xyz : torch.Tensor
+            centroids (B, npoint, 3)
+        features : torch.Tensor
+            Descriptors of the features (B, C, N)
+
+        Returns
+        -------
+        new_features : torch.Tensor
+            (B, 3 + C, npoint, nsample) tensor
+        """
+        idx = ball_query(self.radius, self.nsample, xyz, new_xyz)
+
+        if self.sample_uniformly:
+            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
+            for i_batch in range(idx.shape[0]):
+                for i_region in range(idx.shape[1]):
+                    unique_ind = torch.unique(idx[i_batch, i_region, :])
+                    num_unique = unique_ind.shape[0]
+                    unique_cnt[i_batch, i_region] = num_unique
+                    sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long)
+                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
+                    idx[i_batch, i_region, :] = all_ind
+
+
+        xyz_trans = xyz.transpose(1, 2).contiguous()
+        grouped_xyz = grouping_operation(xyz_trans, idx)  # (B, 3, npoint, nsample)
+        grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1)
+        if self.normalize_xyz:
+            grouped_xyz /= self.radius
+
+        if features is not None:
+            grouped_features = grouping_operation(features, idx)
+            if self.use_xyz:
+                new_features = torch.cat(
+                    [grouped_xyz, grouped_features], dim=1
+                )  # (B, C + 3, npoint, nsample)
+            else:
+                new_features = grouped_features
+        else:
+            assert (
+                self.use_xyz
+            ), "Cannot have features=None and use_xyz=False at the same time!"
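+            # no point features were given, so the grouped (relative) xyz
+            # coordinates themselves serve as the output features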
+            new_features = grouped_xyz
+
+        ret = [new_features]
+        if self.ret_grouped_xyz:
+            ret.append(grouped_xyz)
+        if self.ret_unique_cnt:
+            ret.append(unique_cnt)
+        if len(ret) == 1:
+            return ret[0]
+        else:
+            return tuple(ret)
+
+
+class GroupAll(nn.Module):
+    r"""
+    Groups all features
+
+    Parameters
+    ---------
+    """
+
+    def __init__(self, use_xyz=True, ret_grouped_xyz=False):
+        # type: (GroupAll, bool, bool) -> None
+        super(GroupAll, self).__init__()
+        self.use_xyz = use_xyz
+        self.ret_grouped_xyz = ret_grouped_xyz
+
+    def forward(self, xyz, new_xyz, features=None):
+        # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor]
+        r"""
+        Parameters
+        ----------
+        xyz : torch.Tensor
+            xyz coordinates of the features (B, N, 3)
+        new_xyz : torch.Tensor
+            Ignored
+        features : torch.Tensor
+            Descriptors of the features (B, C, N)
+
+        Returns
+        -------
+        new_features : torch.Tensor
+            (B, C + 3, 1, N) tensor
+        """
+
+        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
+        if features is not None:
+            grouped_features = features.unsqueeze(2)
+            if self.use_xyz:
+                new_features = torch.cat(
+                    [grouped_xyz, grouped_features], dim=1
+                )  # (B, 3 + C, 1, N)
+            else:
+                new_features = grouped_features
+        else:
+            new_features = grouped_xyz
+
+        if self.ret_grouped_xyz:
+            return new_features, grouped_xyz
+        else:
+            return new_features
diff --git a/third_party/pointnet2/pytorch_utils.py b/third_party/pointnet2/pytorch_utils.py
new file mode 100644
index 0000000..4eb8572
--- /dev/null
+++ b/third_party/pointnet2/pytorch_utils.py
@@ -0,0 +1,295 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+''' Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch '''
+import torch
+import torch.nn as nn
+from typing import List, Tuple
+
+class SharedMLP(nn.Sequential):
+
+    def __init__(
+        self,
+        args: List[int],
+        *,
+        bn: bool = False,
+        activation=nn.ReLU(inplace=True),
+        preact: bool = False,
+        first: bool = False,
+        name: str = ""
+    ):
+        super().__init__()
+
+        for i in range(len(args) - 1):
+            self.add_module(
+                name + 'layer{}'.format(i),
+                Conv2d(
+                    args[i],
+                    args[i + 1],
+                    bn=(not first or not preact or (i != 0)) and bn,
+                    activation=activation
+                    if (not first or not preact or (i != 0)) else None,
+                    preact=preact
+                )
+            )
+
+
+class _BNBase(nn.Sequential):
+
+    def __init__(self, in_size, batch_norm=None, name=""):
+        super().__init__()
+        self.add_module(name + "bn", batch_norm(in_size))
+
+        nn.init.constant_(self[0].weight, 1.0)
+        nn.init.constant_(self[0].bias, 0)
+
+
+class BatchNorm1d(_BNBase):
+
+    def __init__(self, in_size: int, *, name: str = ""):
+        super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name)
+
+
+class BatchNorm2d(_BNBase):
+
+    def __init__(self, in_size: int, name: str = ""):
+        super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name)
+
+
+class BatchNorm3d(_BNBase):
+
+    def __init__(self, in_size: int, name: str = ""):
+        super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name)
+
+
+class _ConvBase(nn.Sequential):
+
+    def __init__(
+        self,
+        in_size,
+        out_size,
+        kernel_size,
+        stride,
+        padding,
+        activation,
+        bn,
+        init,
+        conv=None,
+        batch_norm=None,
+        bias=True,
+        preact=False,
+        name=""
+    ):
+        super().__init__()
+
+        bias = bias and (not bn)
+        conv_unit = conv(
+            in_size,
+            out_size,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=bias
+        )
+        init(conv_unit.weight)
+        if bias:
+            nn.init.constant_(conv_unit.bias, 0)
+
+        if bn:
+            if not preact:
+                bn_unit = batch_norm(out_size)
+            else:
+                bn_unit = batch_norm(in_size)
+
+        if preact:
+            if bn:
+                self.add_module(name +
'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + self.add_module(name + 'conv', conv_unit) + + if not preact: + if bn: + self.add_module(name + 'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + +class Conv1d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv1d, + batch_norm=BatchNorm1d, + bias=bias, + preact=preact, + name=name + ) + + +class Conv2d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int] = (1, 1), + stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv2d, + batch_norm=BatchNorm2d, + bias=bias, + preact=preact, + name=name + ) + + +class Conv3d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int, int] = (1, 1, 1), + stride: Tuple[int, int, int] = (1, 1, 1), + padding: Tuple[int, int, int] = (0, 0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv3d, + batch_norm=BatchNorm3d, + bias=bias, + preact=preact, + name=name + ) + + +class FC(nn.Sequential): + + def __init__( + self, + in_size: int, + out_size: int, + *, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=None, + preact: bool = False, + name: str = "" + ): + super().__init__() + + fc = nn.Linear(in_size, out_size, bias=not bn) + if init is not None: + init(fc.weight) + if not bn: + nn.init.constant_(fc.bias, 0) + + if preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(in_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + + self.add_module(name + 'fc', fc) + + if not preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(out_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + +def set_bn_momentum_default(bn_momentum): + + def fn(m): + if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): + m.momentum = bn_momentum + + return fn + + +class BNMomentumScheduler(object): + + def __init__( + self, model, bn_lambda, last_epoch=-1, + setter=set_bn_momentum_default + ): + if not isinstance(model, nn.Module): + raise RuntimeError( + "Class '{}' is not a PyTorch nn Module".format( + type(model).__name__ + ) + ) + + self.model = model + self.setter = setter + self.lmbd = bn_lambda + + self.step(last_epoch + 1) + self.last_epoch = last_epoch + + def step(self, epoch=None): + if epoch is None: + epoch = self.last_epoch + 1 + + self.last_epoch = epoch + self.model.apply(self.setter(self.lmbd(epoch))) + + diff --git a/third_party/pointnet2/setup.py b/third_party/pointnet2/setup.py new file mode 100644 index 0000000..ae3b5bf --- /dev/null 
+++ b/third_party/pointnet2/setup.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import glob +import os.path as osp + +this_dir = osp.dirname(osp.abspath(__file__)) + +_ext_src_root = "_ext_src" +_ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( + "{}/src/*.cu".format(_ext_src_root) +) +_ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) + +setup( + name='pointnet2', + ext_modules=[ + CUDAExtension( + name='pointnet2._ext', + sources=_ext_sources, + extra_compile_args={ + "cxx": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], + "nvcc": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], + }, + include_dirs=[osp.join(this_dir, _ext_src_root, "include")], + ) + ], + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/trainer/trainer.py b/trainer/trainer.py new file mode 100644 index 0000000..87baabf --- /dev/null +++ b/trainer/trainer.py @@ -0,0 +1,305 @@ +import statistics +import hydra +import MinkowskiEngine as ME +import numpy as np +import pytorch_lightning as pl +import torch +from sklearn.cluster import DBSCAN +from contextlib import nullcontext +from collections import defaultdict +from utils.utils import associate_instances, save_predictions + + +class PanopticSegmentation(pl.LightningModule): + def __init__(self, config): + super().__init__() + + self.config = config + self.save_hyperparameters() + # model + self.model = hydra.utils.instantiate(config.model) + self.optional_freeze = nullcontext + + matcher = hydra.utils.instantiate(config.matcher) + weight_dict = {"loss_ce": matcher.cost_class, + "loss_mask": matcher.cost_mask, + "loss_dice": matcher.cost_dice, + "loss_box": matcher.cost_box} + + aux_weight_dict = {} + for i in range(self.model.num_levels * self.model.num_decoders): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + self.criterion = hydra.utils.instantiate(config.loss, matcher=matcher, weight_dict=weight_dict) + # metrics + self.class_evaluator = hydra.utils.instantiate(config.metric) + self.last_seq = None + + def forward(self, x, raw_coordinates=None, is_eval=False): + with self.optional_freeze(): + x = self.model(x, raw_coordinates=raw_coordinates, is_eval=is_eval) + return x + + def training_step(self, batch, batch_idx): + data, target = batch + raw_coordinates = data.raw_coordinates + data = ME.SparseTensor(coordinates=data.coordinates, features=data.features, device=self.device) + + output = self.forward(data, raw_coordinates=raw_coordinates) + losses = self.criterion(output, target) + + for k in list(losses.keys()): + if k in self.criterion.weight_dict: + losses[k] *= self.criterion.weight_dict[k] + else: + # remove this loss if not specified in `weight_dict` + losses.pop(k) + + logs = {f"train_{k}": v.detach().cpu().item() for k,v in losses.items()} + + logs['train_mean_loss_ce'] = statistics.mean( + [item for item in [v for k, v in logs.items() if "loss_ce" in k]]) + + logs['train_mean_loss_mask'] = statistics.mean( + [item for item in [v for k, v in logs.items() if "loss_mask" in k]]) + + logs['train_mean_loss_dice'] = statistics.mean( + [item for item in [v for k, v in logs.items() if "loss_dice" in k]]) + + logs['train_mean_loss_box'] = statistics.mean( + [item for item in 
[v for k, v in logs.items() if "loss_box" in k]]) + + self.log_dict(logs) + return sum(losses.values()) + + def validation_step(self, batch, batch_idx): + data, target = batch + inverse_maps = data.inverse_maps + original_labels = data.original_labels + raw_coordinates = data.raw_coordinates + num_points = data.num_points + sequences = data.sequences + + data = ME.SparseTensor(coordinates=data.coordinates, features=data.features, device=self.device) + output = self.forward(data, raw_coordinates=raw_coordinates, is_eval=True) + losses = self.criterion(output, target) + + for k in list(losses.keys()): + if k in self.criterion.weight_dict: + losses[k] *= self.criterion.weight_dict[k] + else: + # remove this loss if not specified in `weight_dict` + losses.pop(k) + + pred_logits = output['pred_logits'] + pred_logits = torch.functional.F.softmax(pred_logits, dim=-1)[..., :-1] + pred_masks= output['pred_masks'] + offset_coords_idx = 0 + + for logit, mask, map, label, n_point, seq in zip(pred_logits, pred_masks, inverse_maps, original_labels, num_points, sequences): + if seq != self.last_seq: + self.last_seq = seq + self.previous_instances = None + self.max_instance_id = self.config.model.num_queries + self.scene = 0 + + class_confidence, classes = torch.max(logit.detach().cpu(), dim=1) + foreground_confidence = mask.detach().cpu().float().sigmoid() + confidence = class_confidence[None, ...] * foreground_confidence + confidence = confidence[map].numpy() + + ins_preds = np.argmax(confidence, axis=1) + sem_preds = classes[ins_preds].numpy() + 1 + ins_preds += 1 + ins_preds[np.isin(sem_preds, range(1, self.config.data.min_stuff_cls_id), invert=True)] = 0 + sem_labels = self.validation_dataset._remap_model_output(label[:, 0]) + ins_labels = label[:, 1] >> 16 + + db_max_instance_id = self.config.model.num_queries + if self.config.general.dbscan_eps is not None: + curr_coords_idx = mask.shape[0] + curr_coords = raw_coordinates[offset_coords_idx:curr_coords_idx + offset_coords_idx, :3] + curr_coords = curr_coords[map].detach().cpu().numpy() + offset_coords_idx += curr_coords_idx + + ins_ids = np.unique(ins_preds) + for ins_id in ins_ids: + if ins_id != 0: + instance_mask = ins_preds == ins_id + clusters = DBSCAN(eps=self.config.general.dbscan_eps, min_samples=1, n_jobs=-1).fit(curr_coords[instance_mask]).labels_ + new_mask = np.zeros(ins_preds.shape, dtype=np.int64) + new_mask[instance_mask] = clusters + 1 + for cluster_id in np.unique(new_mask): + if cluster_id != 0: + db_max_instance_id += 1 + ins_preds[new_mask == cluster_id] = db_max_instance_id + + self.max_instance_id = max(db_max_instance_id, self.max_instance_id) + for i in range(len(n_point) - 1): + indices = range(n_point[i], n_point[i+1]) + if i == 0 and self.previous_instances is not None: + current_instances = ins_preds[indices] + associations = associate_instances(self.previous_instances, current_instances) + for id in np.unique(ins_preds): + if associations.get(id) is None: + self.max_instance_id += 1 + associations[id] = self.max_instance_id + ins_preds = np.vectorize(associations.__getitem__)(ins_preds) + else: + self.class_evaluator.addBatch(sem_preds, ins_preds, sem_labels, ins_labels, indices, seq) + if i > 0: + self.previous_instances = ins_preds[indices] + + return {f"val_{k}": v.detach().cpu().item() for k, v in losses.items()} + + def test_step(self, batch, batch_idx): + data, _ = batch + inverse_maps = data.inverse_maps + raw_coordinates = data.raw_coordinates + num_points = data.num_points + sequences = data.sequences + + data = 
ME.SparseTensor(coordinates=data.coordinates, features=data.features, device=self.device) + output = self.forward(data, raw_coordinates=raw_coordinates, is_eval=True) + + pred_logits = output['pred_logits'] + pred_logits = torch.functional.F.softmax(pred_logits, dim=-1)[..., :-1] + pred_masks= output['pred_masks'] + + offset_coords_idx = 0 + + for logit, mask, map, n_point, seq in zip(pred_logits, pred_masks, inverse_maps, num_points, sequences): + if seq != self.last_seq: + self.last_seq = seq + self.previous_instances = None + self.max_instance_id = self.config.model.num_queries + self.scene = 0 + class_confidence, classes = torch.max(logit.detach().cpu(), dim=1) + foreground_confidence = mask.detach().cpu().float().sigmoid() + confidence = class_confidence[None, ...] * foreground_confidence + confidence = confidence[map].numpy() + + ins_preds = np.argmax(confidence, axis=1) + sem_preds = classes[ins_preds].numpy() + 1 + ins_preds += 1 + ins_preds[np.isin(sem_preds, range(1, self.config.data.min_stuff_cls_id), invert=True)] = 0 + + db_max_instance_id = self.config.model.num_queries + if self.config.general.dbscan_eps is not None: + curr_coords_idx = mask.shape[0] + curr_coords = raw_coordinates[offset_coords_idx:curr_coords_idx + offset_coords_idx, :3] + curr_coords = curr_coords[map].detach().cpu().numpy() + offset_coords_idx += curr_coords_idx + + ins_ids = np.unique(ins_preds) + for ins_id in ins_ids: + if ins_id != 0: + instance_mask = ins_preds == ins_id + clusters = DBSCAN(eps=self.config.general.dbscan_eps, min_samples=1, n_jobs=-1).fit(curr_coords[instance_mask]).labels_ + new_mask = np.zeros(ins_preds.shape, dtype=np.int64) + new_mask[instance_mask] = clusters + 1 + for cluster_id in np.unique(new_mask): + if cluster_id != 0: + db_max_instance_id += 1 + ins_preds[new_mask == cluster_id] = db_max_instance_id + + self.max_instance_id = max(db_max_instance_id, self.max_instance_id) + for i in range(len(n_point) - 1): + indices = range(n_point[i], n_point[i+1]) + if i == 0 and self.previous_instances is not None: + current_instances = ins_preds[indices] + associations = associate_instances(self.previous_instances, current_instances) + for id in np.unique(ins_preds): + if associations.get(id) is None: + self.max_instance_id += 1 + associations[id] = self.max_instance_id + ins_preds = np.vectorize(associations.__getitem__)(ins_preds) + else: + save_predictions(sem_preds[indices], ins_preds[indices], f"{seq:02}", f"{self.scene:06}") + self.scene += 1 + if i > 0: + self.previous_instances = ins_preds[indices] + + return {} + + def training_epoch_end(self, outputs): + train_loss = sum([out["loss"].cpu().item() for out in outputs]) / len(outputs) + results = {"train_loss_mean": train_loss} + self.log_dict(results) + + def validation_epoch_end(self, outputs): + self.last_seq = None + class_names = self.config.data.class_names + lstq, aq, all_aq, iou, all_iou = self.class_evaluator.getPQ4D() + self.class_evaluator.reset() + results = {} + results["val_mean_aq"] = aq + results["val_mean_iou"] = iou + results["val_mean_lstq"] = lstq + for i, (aq, iou) in enumerate(zip(all_aq, all_iou)): + results[f"val_{class_names[i]}_aq"] = aq.item() + results[f"val_{class_names[i]}_iou"] = iou.item() + self.log_dict(results) + + dd = defaultdict(list) + for output in outputs: + for key, val in output.items(): + dd[key].append(val) + + dd = {k: statistics.mean(v) for k, v in dd.items()} + + dd['val_mean_loss_ce'] = statistics.mean([item for item in [v for k,v in dd.items() if "loss_ce" in k]]) + 
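# the per-layer CE losses are averaged for logging; the same is done for the mask and dice losses below +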
dd['val_mean_loss_mask'] = statistics.mean([item for item in [v for k,v in dd.items() if "loss_mask" in k]]) + dd['val_mean_loss_dice'] = statistics.mean([item for item in [v for k,v in dd.items() if "loss_dice" in k]]) + + self.log_dict(dd) + + def test_epoch_end(self, outputs): + return {} + + def configure_optimizers(self): + optimizer = hydra.utils.instantiate( + self.config.optimizer, params=self.parameters() + ) + if "steps_per_epoch" in self.config.scheduler.scheduler.keys(): + self.config.scheduler.scheduler.steps_per_epoch = len( + self.train_dataloader() + ) + lr_scheduler = hydra.utils.instantiate( + self.config.scheduler.scheduler, optimizer=optimizer + ) + scheduler_config = {"scheduler": lr_scheduler} + scheduler_config.update(self.config.scheduler.pytorch_lightning_params) + return [optimizer], [scheduler_config] + + def prepare_data(self): + self.train_dataset = hydra.utils.instantiate(self.config.data.train_dataset) + self.validation_dataset = hydra.utils.instantiate( + self.config.data.validation_dataset + ) + self.test_dataset = hydra.utils.instantiate(self.config.data.test_dataset) + + def train_dataloader(self): + c_fn = hydra.utils.instantiate(self.config.data.train_collation) + return hydra.utils.instantiate( + self.config.data.train_dataloader, + self.train_dataset, + collate_fn=c_fn, + ) + + def val_dataloader(self): + c_fn = hydra.utils.instantiate(self.config.data.validation_collation) + return hydra.utils.instantiate( + self.config.data.validation_dataloader, + self.validation_dataset, + collate_fn=c_fn, + ) + + def test_dataloader(self): + c_fn = hydra.utils.instantiate(self.config.data.test_collation) + return hydra.utils.instantiate( + self.config.data.test_dataloader, + self.test_dataset, + collate_fn=c_fn, + ) diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/pointops2/__init__.py b/utils/pointops2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/pointops2/functions/__init__.py b/utils/pointops2/functions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/pointops2/functions/pointops.py b/utils/pointops2/functions/pointops.py new file mode 100644 index 0000000..52aaa2c --- /dev/null +++ b/utils/pointops2/functions/pointops.py @@ -0,0 +1,829 @@ +from typing import Tuple + +import torch +from torch.autograd import Function +import torch.nn as nn + +import pointops2_cuda as pointops_cuda +import time + +class FurthestSampling(Function): + @staticmethod + def forward(ctx, xyz, offset, new_offset): + """ + input: xyz: (n, 3), offset: (b), new_offset: (b) + output: idx: (m) + """ + assert xyz.is_contiguous() + n, b, n_max = xyz.shape[0], offset.shape[0], offset[0] + for i in range(1, b): + n_max = max(offset[i] - offset[i-1], n_max) + idx = torch.cuda.IntTensor(new_offset[b-1].item()).zero_() + tmp = torch.cuda.FloatTensor(n).fill_(1e10) + pointops_cuda.furthestsampling_cuda(b, n_max, xyz, offset, new_offset, tmp, idx) + del tmp + return idx + +furthestsampling = FurthestSampling.apply + + +class KNNQuery(Function): + @staticmethod + def forward(ctx, nsample, xyz, new_xyz, offset, new_offset): + """ + input: xyz: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample), dist2: (m, nsample) + """ + if new_xyz is None: new_xyz = xyz + assert xyz.is_contiguous() and new_xyz.is_contiguous() + m = new_xyz.shape[0] + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + 
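# the CUDA kernel fills idx with the nsample nearest neighbours of each query point and dist2 with the corresponding squared distances +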
pointops_cuda.knnquery_cuda(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2) + return idx, torch.sqrt(dist2) + +knnquery = KNNQuery.apply + + +class Grouping(Function): + @staticmethod + def forward(ctx, input, idx): + """ + input: input: (n, c), idx : (m, nsample) + output: (m, nsample, c) + """ + assert input.is_contiguous() and idx.is_contiguous() + m, nsample, n, c = idx.shape[0], idx.shape[1], input.shape[0], input.shape[1] + output = torch.cuda.FloatTensor(m, nsample, c) + pointops_cuda.grouping_forward_cuda(m, nsample, c, input, idx, output) + ctx.n = n + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (m, c, nsample) + output: (n, c), None + """ + n = ctx.n + idx, = ctx.saved_tensors + m, nsample, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.grouping_backward_cuda(m, nsample, c, grad_output, idx, grad_input) + return grad_input, None + +grouping = Grouping.apply + +class AttentionStep1(Function): + @staticmethod + def forward(ctx, q, k, index0, index1): + """ + input: q: (N, h, C//h), k: (N, h, C//h), index0: (M), index1: (M) + output: output: [N, h, C//h] + """ + assert q.is_contiguous() and k.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() + + N_q, h, C_div_h = q.shape + N_k = k.shape[0] + M = index0.shape[0] + C = int(C_div_h * h) + + output = torch.cuda.FloatTensor(M, h).zero_() + pointops_cuda.attention_step1_forward_cuda(N_k, M, h, C, q, k, index0, index1, output) + ctx.N_q = N_q + ctx.N_k = N_k + ctx.C = C + ctx.save_for_backward(q, k, index0, index1) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (N, h, C//h) + output: (M, h), (N, h, C//h), None, None + """ + + N_q = ctx.N_q + N_k = ctx.N_k + C = ctx.C + q, k, index0, index1 = ctx.saved_tensors + M, h = grad_output.shape + + grad_output = grad_output.contiguous() + # print("grad_output.is_contiguous(): ", grad_output.is_contiguous()) + assert q.is_contiguous() and k.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() and grad_output.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_q = torch.cuda.FloatTensor(N_q, h, C//h).zero_() + grad_k = torch.cuda.FloatTensor(N_k, h, C//h).zero_() + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.attention_step1_backward_cuda(N_q, M, h, C, grad_output, index0, index1, q, k, grad_q, grad_k) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v7: {}".format(end - start)) + # # input() + + return grad_q, grad_k, None, None + +attention_step1 = AttentionStep1.apply + +class AttentionStep1_v2(Function): + @staticmethod + def forward(ctx, q, k, index1, index0_offsets, n_max): + """ + input: q: (N, h, C//h), k: (N, h, C//h), index0: (M), index1: (M) + output: output: [N, h, C//h] + """ + assert q.is_contiguous() and k.is_contiguous() and index0_offsets.is_contiguous() and index1.is_contiguous() + assert n_max <= 1024 + + N_q, h, C_div_h = q.shape + N_k = k.shape[0] + M = index1.shape[0] + C = int(C_div_h * h) + + output = torch.cuda.FloatTensor(M, h).zero_() + pointops_cuda.attention_step1_forward_cuda_v2(N_k, M, h, C, n_max, q, k, index0_offsets, index1, output) + ctx.N_q = N_q + ctx.N_k = N_k + ctx.C = C + ctx.n_max = n_max + ctx.save_for_backward(q, k, index0_offsets, index1) + return 
output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (N, h, C//h) + output: (M, h), (N, h, C//h), None, None + """ + + N_q = ctx.N_q + N_k = ctx.N_k + C = ctx.C + n_max = ctx.n_max + q, k, index0_offsets, index1 = ctx.saved_tensors + M, h = grad_output.shape + + grad_output = grad_output.contiguous() + # print("grad_output.is_contiguous(): ", grad_output.is_contiguous()) + assert q.is_contiguous() and k.is_contiguous() and index0_offsets.is_contiguous() and index1.is_contiguous() and grad_output.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_q = torch.cuda.FloatTensor(N_q, h, C//h).zero_() + grad_k = torch.cuda.FloatTensor(N_k, h, C//h).zero_() + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.attention_step1_backward_cuda_v2(N_q, M, h, C, n_max, grad_output, index0_offsets, index1, q, k, grad_q, grad_k) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v7: {}".format(end - start)) + # # input() + + return grad_q, grad_k, None, None, None + +attention_step1_v2 = AttentionStep1_v2.apply + + + +class AttentionStep2(Function): + @staticmethod + def forward(ctx, attn, v, index0, index1): + """ + input: attn: (M, h), v: (N, h, C//h), index0: (M), index1: (M) + output: output: [N, h, C//h] + """ + assert attn.is_contiguous() and v.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() + + M, h = attn.shape + N_q = index0.max().item() + 1 + N_v, h, C_div_h = v.shape + C = int(C_div_h * h) + + output = torch.cuda.FloatTensor(N_q, h, C//h).zero_() + pointops_cuda.attention_step2_forward_cuda(N_q, M, h, C, attn, v, index0, index1, output) + ctx.M = M + + # print("attn[:5,:5]: ", attn[:5, :5]) + + ctx.save_for_backward(attn, v, index0, index1) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (N, h, C//h) + output: (M, h), (N, h, C//h), None, None + """ + M = ctx.M + attn, v, index0, index1 = ctx.saved_tensors + N_v = v.shape[0] + N_q, h, C_div_h = grad_output.shape + C = h * C_div_h + + grad_output = grad_output.contiguous() + # print("grad_output.is_contiguous(): ", grad_output.is_contiguous()) + assert attn.is_contiguous() and v.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() and grad_output.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_attn = torch.cuda.FloatTensor(M, h).zero_() + grad_v = torch.cuda.FloatTensor(N_v, h, C//h).zero_() + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.attention_step2_backward_cuda(N_q, M, h, C, grad_output, index0, index1, attn, v, grad_attn, grad_v) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v8: {}".format(end - start)) + # # input() + + return grad_attn, grad_v, None, None + +attention_step2 = AttentionStep2.apply + + +class AttentionStep2_v2(Function): + @staticmethod + def forward(ctx, attn, v, index0, index1): + """ + input: attn: (M, h), v: (N, h, C//h), index0: (M), index1: (M) + output: output: [L, h, C//h] + """ + assert attn.is_contiguous() and v.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() + + L = int(index0.max().item()) + 1 + + M, h = attn.shape + N, h, C_div_h = v.shape + C = int(C_div_h * h) + + output = 
torch.cuda.FloatTensor(L, h, C//h).zero_() + pointops_cuda.attention_step2_forward_cuda(N, M, h, C, attn, v, index0, index1, output) + ctx.M = M + + # print("attn[:5,:5]: ", attn[:5, :5]) + + ctx.save_for_backward(attn, v, index0, index1) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (L, h, C//h) + output: (M, h), (N, h, C//h), None, None + """ + M = ctx.M + attn, v, index0, index1 = ctx.saved_tensors + L, h, C_div_h = grad_output.shape + N = v.shape[0] + C = h * C_div_h + + grad_output = grad_output.contiguous() + # print("grad_output.is_contiguous(): ", grad_output.is_contiguous()) + assert attn.is_contiguous() and v.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() and grad_output.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_attn = torch.cuda.FloatTensor(M, h).zero_() + grad_v = torch.cuda.FloatTensor(N, h, C//h).zero_() + + pointops_cuda.attention_step2_backward_cuda(N, M, h, C, grad_output, index0, index1, attn, v, grad_attn, grad_v) + return grad_attn, grad_v, None, None + +attention_step2_v2 = AttentionStep2_v2.apply + +class DotProdWithIdx(Function): + @staticmethod + def forward(ctx, q, index, table, rel_idx): + """ + input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) + output: output: [M, h] + """ + assert q.is_contiguous() and index.is_contiguous() and table.is_contiguous() and rel_idx.is_contiguous() + + N, h, hdim = q.shape + M = index.shape[0] + + output = torch.cuda.FloatTensor(M, h).zero_() + pointops_cuda.dot_prod_with_idx_forward_cuda(N, M, h, hdim, q, index, table, rel_idx, output) + ctx.save_for_backward(q, index, table, rel_idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: [M, h] + output: (N, h, hdim), None, (L, h, hdim, 3), None + """ + q, index, table, rel_idx = ctx.saved_tensors + M, h = grad_output.shape + N, _, hdim = q.shape + L = table.shape[0] + + grad_output = grad_output.contiguous() + assert q.is_contiguous() and index.is_contiguous() and table.is_contiguous() and rel_idx.is_contiguous() and grad_output.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_q = torch.cuda.FloatTensor(N, h, hdim).zero_() + grad_table = torch.cuda.FloatTensor(L, h, hdim, 3).zero_() + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.dot_prod_with_idx_backward_cuda(N, M, h, hdim, grad_output, q, index, table, rel_idx, grad_q, grad_table) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v9: {}".format(end - start)) + # # input() + + return grad_q, None, grad_table, None + +dot_prod_with_idx = DotProdWithIdx.apply + +class DotProdWithIdx_v2(Function): + @staticmethod + def forward(ctx, q, index_q, k, index_k, table_q, table_k, rel_idx): + """ + input: q: (N, h, hdim), index_q: (M), k: (N, h, hdim), index_k: (M), table_q: (L, h, hdim, 3), table_k: (L, h, hdim, 3), rel_idx: (M, 3) + output: output: [M, h] + """ + assert q.is_contiguous() and index_q.is_contiguous() and k.is_contiguous() and index_k.is_contiguous() and table_q.is_contiguous() and table_k.is_contiguous() and rel_idx.is_contiguous() + + N, h, hdim = q.shape + M = index_q.shape[0] + L = table_q.shape[0] + assert 
table_k.shape[0] == L and index_k.shape[0] == M + + # obtain the mapping from block_idx to m_idx + rel_idx_merge = rel_idx[:, 0] + rel_idx[:, 1] * L + rel_idx[:, 2] * (L ** 2) #[M, ] + sorted_values, sort_indices = torch.sort(rel_idx_merge) + _, counts = torch.unique_consecutive(sorted_values, return_counts=True) + rel_idx_offsets = torch.cumsum(counts, dim=-1) #[T,] + rel_idx_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), rel_idx_offsets], 0) #[T+1,] + n_max = counts.max() + T = counts.shape[0] + + # print("M: {}, L: {}, n_max: {}, T: {}".format(M, L, n_max, T)) + # print("rel_idx_merge.shape: {}, sorted_values.shape: {}".format(rel_idx_merge.shape, sorted_values.shape)) + # print("counts.shape: {}".format(counts.shape)) + + output = torch.cuda.FloatTensor(M, h).zero_() + # pointops_cuda.dot_prod_with_idx_forward_cuda(N, M, h, hdim, q, index, table, rel_idx, output) + pointops_cuda.dot_prod_with_idx_forward_cuda_v2(N, M, h, hdim, n_max, T, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets.int(), sort_indices.int(), output) + + ctx.n_max = n_max + ctx.T = T + ctx.save_for_backward(q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: [M, h] + output: (N, h, hdim), None, (L, h, hdim, 3), None + """ + q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices = ctx.saved_tensors + M, h = grad_output.shape + N, _, hdim = q.shape + L = table_q.shape[0] + T, n_max = ctx.T, ctx.n_max + + grad_output = grad_output.contiguous() + assert q.is_contiguous() and index_q.is_contiguous() and k.is_contiguous() and index_k.is_contiguous() and table_q.is_contiguous() and table_k.is_contiguous() and rel_idx.is_contiguous() and rel_idx_offsets.is_contiguous() and sort_indices.is_contiguous() and grad_output.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_q = torch.cuda.FloatTensor(N, h, hdim).zero_() + grad_table_q = torch.cuda.FloatTensor(L, h, hdim, 3).zero_() + grad_k = torch.cuda.FloatTensor(N, h, hdim).zero_() + grad_table_k = torch.cuda.FloatTensor(L, h, hdim, 3).zero_() + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.dot_prod_with_idx_backward_cuda_v2(N, M, h, hdim, n_max, T, grad_output, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets.int(), sort_indices.int(), grad_q, grad_k, grad_table_q, grad_table_k) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v9: {}".format(end - start)) + # # input() + return grad_q, None, grad_k, None, grad_table_q, grad_table_k, None + +dot_prod_with_idx_v2 = DotProdWithIdx_v2.apply + + +class DotProdWithIdx_v3(Function): + @staticmethod + def forward(ctx, q, index_q_offsets, n_max, k, index_k, table_q, table_k, rel_idx): + """ + input: q: (N, h, hdim), index_q: (M), k: (N, h, hdim), index_k: (M), table_q: (L, h, hdim, 3), table_k: (L, h, hdim, 3), rel_idx: (M, 3) + output: output: [M, h] + """ + assert q.is_contiguous() and index_q_offsets.is_contiguous() and k.is_contiguous() and index_k.is_contiguous() and table_q.is_contiguous() and table_k.is_contiguous() and rel_idx.is_contiguous() + + N, h, hdim = q.shape + M = index_k.shape[0] + L = table_q.shape[0] + assert table_k.shape[0] == L + + # # obtain the mapping from block_idx to m_idx + # rel_idx_merge = rel_idx[:, 0] + 
rel_idx[:, 1] * L + rel_idx[:, 2] * (L ** 2) #[M, ] + # sorted_values, sort_indices = torch.sort(rel_idx_merge) + # _, counts = torch.unique_consecutive(sorted_values, return_counts=True) + # rel_idx_offsets = torch.cumsum(counts, dim=-1) #[T,] + # rel_idx_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), rel_idx_offsets], 0) #[T+1,] + # n_max = counts.max() + # T = counts.shape[0] + + # print("M: {}, L: {}, n_max: {}, T: {}".format(M, L, n_max, T)) + # print("rel_idx_merge.shape: {}, sorted_values.shape: {}".format(rel_idx_merge.shape, sorted_values.shape)) + # print("counts.shape: {}".format(counts.shape)) + + # print("M: {}, L: {}, n_max: {}".format(M, L, n_max)) + + output = torch.cuda.FloatTensor(M, h).zero_() + # pointops_cuda.dot_prod_with_idx_forward_cuda(N, M, h, hdim, q, index, table, rel_idx, output) + pointops_cuda.dot_prod_with_idx_forward_cuda_v3(N, M, h, hdim, n_max, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, output) + + ctx.n_max = n_max + # ctx.T = T + ctx.save_for_backward(q, index_q_offsets, k, index_k, table_q, table_k, rel_idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: [M, h] + output: (N, h, hdim), None, (L, h, hdim, 3), None + """ + q, index_q_offsets, k, index_k, table_q, table_k, rel_idx = ctx.saved_tensors + M, h = grad_output.shape + N, _, hdim = q.shape + L = table_q.shape[0] + n_max = ctx.n_max + + grad_output = grad_output.contiguous() + assert q.is_contiguous() and index_q_offsets.is_contiguous() and k.is_contiguous() and index_k.is_contiguous() and table_q.is_contiguous() and table_k.is_contiguous() and rel_idx.is_contiguous() and grad_output.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_q = torch.cuda.FloatTensor(N, h, hdim).zero_() + grad_table_q = torch.cuda.FloatTensor(L, h, hdim, 3).zero_() + grad_k = torch.cuda.FloatTensor(N, h, hdim).zero_() + grad_table_k = torch.cuda.FloatTensor(L, h, hdim, 3).zero_() + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.dot_prod_with_idx_backward_cuda_v3(N, M, h, hdim, n_max, grad_output, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, grad_q, grad_k, grad_table_q, grad_table_k) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v9: {}".format(end - start)) + # # input() + return grad_q, None, None, grad_k, None, grad_table_q, grad_table_k, None + +dot_prod_with_idx_v3 = DotProdWithIdx_v3.apply + +class AttentionStep2WithRelPosValue(Function): + @staticmethod + def forward(ctx, attn, v, index0, index1, table, rel_idx): + """ + input: attn: (M, h), v: (N, h, hdim), index0: (M), index1: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) + output: output: [N, h, hdim] + """ + assert attn.is_contiguous() and v.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() and table.is_contiguous() and rel_idx.is_contiguous() + + M, h = attn.shape + N_v, h, hdim = v.shape + N_q = index0.max().item() + 1 + + output = torch.cuda.FloatTensor(N_q, h, hdim).zero_() + pointops_cuda.attention_step2_with_rel_pos_value_forward_cuda(N_q, M, h, hdim, attn, v, index0, index1, table, rel_idx, output) + + # print("attn[:5,:5]: ", attn[:5, :5]) + + ctx.save_for_backward(attn, v, index0, index1, table, rel_idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (N, h, C//h) + output: (M, h), (N, h, 
C//h), None, None, (L, h, hdim, 3), None + """ + attn, v, index0, index1, table, rel_idx = ctx.saved_tensors + N_q, h, hdim = grad_output.shape + N_v = v.shape[0] + M = attn.shape[0] + L = table.shape[0] + + grad_output = grad_output.contiguous() + # print("grad_output.is_contiguous(): ", grad_output.is_contiguous()) + assert attn.is_contiguous() and v.is_contiguous() and index0.is_contiguous() and index1.is_contiguous() and grad_output.is_contiguous() and table.is_contiguous() and rel_idx.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0.shape, index1.shape)) + + grad_attn = torch.cuda.FloatTensor(M, h).zero_() + grad_v = torch.cuda.FloatTensor(N_v, h, hdim).zero_() + grad_table = torch.cuda.FloatTensor(L, h, hdim, 3).zero_() + + # print("attn.shape: {}, grad_attn.shape: {}".format(attn.shape, grad_attn.shape)) + # print("v.shape: {}, grad_v.shape: {}".format(v.shape, grad_v.shape)) + # print("table.shape: {}, grad_table.shape: {}".format(table.shape, grad_table.shape)) + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.attention_step2_with_rel_pos_value_backward_cuda(N_q, M, h, hdim, grad_output, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v10: {}".format(end - start)) + # # input() + return grad_attn, grad_v, None, None, grad_table, None + +attention_step2_with_rel_pos_value = AttentionStep2WithRelPosValue.apply + + +class AttentionStep2WithRelPosValue_v2(Function): + @staticmethod + def forward(ctx, attn, v, index0_offsets, n_max, index1, table, rel_idx): + """ + input: attn: (M, h), v: (N, h, hdim), index0_offsets: (M), index1: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) + output: output: [N, h, hdim] + """ + assert attn.is_contiguous() and v.is_contiguous() and index0_offsets.is_contiguous() and index1.is_contiguous() and table.is_contiguous() and rel_idx.is_contiguous() + + M, h = attn.shape + N, h, hdim = v.shape + # N_q = int(index0_offsets.max().item()) + 1 + + output = torch.cuda.FloatTensor(N, h, hdim).zero_() + pointops_cuda.attention_step2_with_rel_pos_value_forward_cuda_v2(N, M, h, hdim, n_max, attn, v, index0_offsets, index1, table, rel_idx, output) + + # print("attn[:5,:5]: ", attn[:5, :5]) + + ctx.n_max = n_max + ctx.save_for_backward(attn, v, index0_offsets, index1, table, rel_idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_output: (N, h, C//h) + output: (M, h), (N, h, C//h), None, None, (L, h, hdim, 3), None + """ + n_max = ctx.n_max + attn, v, index0_offsets, index1, table, rel_idx = ctx.saved_tensors + N, h, hdim = grad_output.shape + N = v.shape[0] + M = attn.shape[0] + L = table.shape[0] + + # grad_output = grad_output.contiguous() + # print("grad_output.is_contiguous(): ", grad_output.is_contiguous()) + assert attn.is_contiguous() and v.is_contiguous() and index0_offsets.is_contiguous() and index1.is_contiguous() and grad_output.is_contiguous() and table.is_contiguous() and rel_idx.is_contiguous() + + # print("back: attn[:5,:5]: ", attn[:5, :5]) + + # print("attn.shape: {} v.shape: {}, index0_offsets.shape: {}, index1.shape: {}".format(attn.shape, v.shape, index0_offsets.shape, index1.shape)) + + grad_attn = torch.cuda.FloatTensor(M, h).zero_() + grad_v = torch.cuda.FloatTensor(N, h, hdim).zero_() + grad_table = torch.cuda.FloatTensor(L, h, hdim, 3).zero_() + + # 
print("attn.shape: {}, grad_attn.shape: {}".format(attn.shape, grad_attn.shape)) + # print("v.shape: {}, grad_v.shape: {}".format(v.shape, grad_v.shape)) + # print("table.shape: {}, grad_table.shape: {}".format(table.shape, grad_table.shape)) + + # torch.cuda.synchronize() + # start = time.time() + + pointops_cuda.attention_step2_with_rel_pos_value_backward_cuda_v2(N, M, h, hdim, n_max, grad_output, index0_offsets, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table) + + # torch.cuda.synchronize() + # end = time.time() + # print("time v10: {}".format(end - start)) + + return grad_attn, grad_v, None, None, None, grad_table, None + +attention_step2_with_rel_pos_value_v2 = AttentionStep2WithRelPosValue_v2.apply + +def queryandgroup(nsample, xyz, new_xyz, feat, idx, offset, new_offset, use_xyz=True, return_indx=False): + """ + input: xyz: (n, 3), new_xyz: (m, 3), feat: (n, c), idx: (m, nsample), offset: (b), new_offset: (b) + output: new_feat: (m, c+3, nsample), grouped_idx: (m, nsample) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + if new_xyz is None: + new_xyz = xyz + if idx is None: + idx, _ = knnquery(nsample, xyz, new_xyz, offset, new_offset) # (m, nsample) + + n, m, c = xyz.shape[0], new_xyz.shape[0], feat.shape[1] + grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3) # (m, nsample, 3) + #grouped_xyz = grouping(xyz, idx) # (m, nsample, 3) + # 相对位置 + grouped_xyz -= new_xyz.unsqueeze(1) # (m, nsample, 3) + grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c) # (m, nsample, c) + #grouped_feat = grouping(feat, idx) # (m, nsample, c) + if use_xyz: + if return_indx: + return torch.cat((grouped_xyz, grouped_feat), -1), idx # (m, nsample, 3+c) + else: + return torch.cat((grouped_xyz, grouped_feat), -1) + else: + if return_indx: + return grouped_feat, idx + else: + return grouped_feat + + +def Divide2Patch(nsample, xyz, offset, return_offset=False, anchor_scale=None): + # nsample: 16 xyz: (n, 3) offset: (b) + downsample_scale = anchor_scale or nsample + new_offset, count = [offset[0].item() // downsample_scale], offset[0].item() // downsample_scale + for i in range(1, offset.shape[0]): + count += (offset[i].item() - offset[i-1].item()) // downsample_scale + new_offset.append(count) + # print("donw sample scale:", downsample_scale,"offset:", offset, "newoffset:", new_offset) + new_offset = torch.cuda.IntTensor(new_offset) + idx = furthestsampling(xyz, offset, new_offset) # (m) + new_xyz = xyz[idx.long()] + p_idx, _ = knnquery(nsample, xyz, new_xyz, offset, new_offset) # (m, nsample) + if return_offset: + return p_idx, new_offset + else: + return p_idx + +class Subtraction(Function): + @staticmethod + def forward(ctx, input1, input2, idx): + """ + input: input1: (n, c), input2: (n, c), idx: (n, nsample) + output: (n, nsample, c) + """ + assert input1.is_contiguous() and input2.is_contiguous() + n, c = input1.shape; nsample = idx.shape[-1] + output = torch.cuda.FloatTensor(n, nsample, c).zero_() + pointops_cuda.subtraction_forward_cuda(n, nsample, c, input1, input2, idx, output) + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, nsample, c) + output: grad_input1: (n, c), grad_input2: (n, c) + """ + idx, = ctx.saved_tensors + n, nsample, c = grad_output.shape + grad_input1 = torch.cuda.FloatTensor(n, c).zero_() + grad_input2 = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.subtraction_backward_cuda(n, nsample, c, idx, grad_output, grad_input1, 
grad_input2) + return grad_input1, grad_input2, None + +subtraction = Subtraction.apply + + +class Aggregation(Function): + @staticmethod + def forward(ctx, input, position, weight, idx): + """ + input: input: (n, c), position: (n, nsample, c), weight : (n, nsample, c'), idx: (n, nsample) + output: (n, c) + """ + assert input.is_contiguous() and position.is_contiguous() and weight.is_contiguous() + n, nsample, c = position.shape; w_c = weight.shape[-1] + output = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.aggregation_forward_cuda(n, nsample, c, w_c, input, position, weight, idx, output) + ctx.save_for_backward(input, position, weight, idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, c) + output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight : (n, nsample, c') + """ + input, position, weight, idx = ctx.saved_tensors + n, nsample, c = position.shape; w_c = weight.shape[-1] + grad_input = torch.cuda.FloatTensor(n, c).zero_() + grad_position = torch.cuda.FloatTensor(n, nsample, c).zero_() + grad_weight = torch.cuda.FloatTensor(n, nsample, w_c).zero_() + pointops_cuda.aggregation_backward_cuda(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight) + return grad_input, grad_position, grad_weight, None + +aggregation = Aggregation.apply + + +def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3): + """ + input: xyz: (m, 3), new_xyz: (n, 3), feat: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, 3), (n, 3) + dist_recip = 1.0 / (dist + 1e-8) # (n, 3) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, 3) + + new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_() + for i in range(k): + new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1) + return new_feat + + +def interpolation_v2(xyz, new_xyz, feat, offset, new_offset, k=3): + """ + input: xyz: (m, 3), new_xyz: (n, 3), feat: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + + idx, _ = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, 3), (n, 3) + + # print("e3: idx.shape: {}, idx[:5]: {}".format(idx.shape, idx[:5])) + + dist = torch.sqrt(((new_xyz.unsqueeze(1) - xyz[idx.long()]) ** 2).sum(-1) + 1e-8) + + # print("e4: dist.shape: {}, dist[:5]: {}".format(dist.shape, dist[:5])) + # print("((_-dist)**2).max(): ", ((_-dist)**2).max()) + # input() + + dist_recip = 1.0 / (dist + 1e-8) # (n, 3) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, 3) + + new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_() + for i in range(k): + new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1) + return new_feat + + +class Interpolation(Function): + @staticmethod + def forward(ctx, xyz, new_xyz, input, offset, new_offset, k=3): + """ + input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and input.is_contiguous() + idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, k), (n, k) + dist_recip = 1.0 / (dist + 1e-8) # (n, k) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, k) + + n, c, m = 
new_xyz.shape[0], input.shape[1], input.shape[0] + output = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.interpolation_forward_cuda(n, c, k, input, idx, weight, output) + ctx.m, ctx.k = m, k + ctx.save_for_backward(idx, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + m, k = ctx.m, ctx.k + idx, weight = ctx.saved_tensors + n, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(m, c).zero_() + pointops_cuda.interpolation_backward_cuda(n, c, k, grad_output, idx, weight, grad_input) + return None, None, grad_input, None, None, None + +interpolation2 = Interpolation.apply diff --git a/utils/pointops2/functions/pointops2.py b/utils/pointops2/functions/pointops2.py new file mode 100644 index 0000000..33af8e7 --- /dev/null +++ b/utils/pointops2/functions/pointops2.py @@ -0,0 +1,214 @@ +from typing import Tuple + +import torch +from torch.autograd import Function +import torch.nn as nn + +import pointops2_cuda as pointops_cuda + + +class FurthestSampling(Function): + @staticmethod + def forward(ctx, xyz, offset, new_offset): + """ + input: xyz: (n, 3), offset: (b), new_offset: (b) + output: idx: (m) + """ + assert xyz.is_contiguous() + n, b, n_max = xyz.shape[0], offset.shape[0], offset[0] + for i in range(1, b): + n_max = max(offset[i] - offset[i-1], n_max) + idx = torch.cuda.IntTensor(new_offset[b-1].item()).zero_() + tmp = torch.cuda.FloatTensor(n).fill_(1e10) + pointops_cuda.furthestsampling_cuda(b, n_max, xyz, offset, new_offset, tmp, idx) + del tmp + return idx + +furthestsampling = FurthestSampling.apply + + +class KNNQuery(Function): + @staticmethod + def forward(ctx, nsample, xyz, new_xyz, offset, new_offset): + """ + input: xyz: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample), dist2: (m, nsample) + """ + if new_xyz is None: new_xyz = xyz + assert xyz.is_contiguous() and new_xyz.is_contiguous() + m = new_xyz.shape[0] + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + pointops_cuda.knnquery_cuda(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2) + return idx, torch.sqrt(dist2) + +knnquery = KNNQuery.apply + + +class Grouping(Function): + @staticmethod + def forward(ctx, input, idx): + """ + input: input: (n, c), idx : (m, nsample) + output: (m, nsample, c) + """ + assert input.is_contiguous() and idx.is_contiguous() + m, nsample, n, c = idx.shape[0], idx.shape[1], input.shape[0], input.shape[1] + output = torch.cuda.FloatTensor(m, nsample, c) + pointops_cuda.grouping_forward_cuda(m, nsample, c, input, idx, output) + ctx.n = n + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (m, c, nsample) + output: (n, c), None + """ + n = ctx.n + idx, = ctx.saved_tensors + m, nsample, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.grouping_backward_cuda(m, nsample, c, grad_output, idx, grad_input) + return grad_input, None + +grouping = Grouping.apply + + +def queryandgroup(nsample, xyz, new_xyz, feat, idx, offset, new_offset, use_xyz=True): + """ + input: xyz: (n, 3), new_xyz: (m, 3), feat: (n, c), idx: (m, nsample), offset: (b), new_offset: (b) + output: new_feat: (m, c+3, nsample), grouped_idx: (m, nsample) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + if new_xyz is None: + new_xyz = xyz + if idx 
is None: + idx, _ = knnquery(nsample, xyz, new_xyz, offset, new_offset) # (m, nsample) + + n, m, c = xyz.shape[0], new_xyz.shape[0], feat.shape[1] + grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3) # (m, nsample, 3) + #grouped_xyz = grouping(xyz, idx) # (m, nsample, 3) + grouped_xyz -= new_xyz.unsqueeze(1) # (m, nsample, 3) + grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c) # (m, nsample, c) + #grouped_feat = grouping(feat, idx) # (m, nsample, c) + + if use_xyz: + return torch.cat((grouped_xyz, grouped_feat), -1) # (m, nsample, 3+c) + else: + return grouped_feat + + +class Subtraction(Function): + @staticmethod + def forward(ctx, input1, input2, idx): + """ + input: input1: (n, c), input2: (n, c), idx: (n, nsample) + output: (n, nsample, c) + """ + assert input1.is_contiguous() and input2.is_contiguous() + n, c = input1.shape; nsample = idx.shape[-1] + output = torch.cuda.FloatTensor(n, nsample, c).zero_() + pointops_cuda.subtraction_forward_cuda(n, nsample, c, input1, input2, idx, output) + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, nsample, c) + output: grad_input1: (n, c), grad_input2: (n, c) + """ + idx, = ctx.saved_tensors + n, nsample, c = grad_output.shape + grad_input1 = torch.cuda.FloatTensor(n, c).zero_() + grad_input2 = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.subtraction_backward_cuda(n, nsample, c, idx, grad_output, grad_input1, grad_input2) + return grad_input1, grad_input2, None + +subtraction = Subtraction.apply + + +class Aggregation(Function): + @staticmethod + def forward(ctx, input, position, weight, idx): + """ + input: input: (n, c), position: (n, nsample, c), weight : (n, nsample, c'), idx: (n, nsample) + output: (n, c) + """ + assert input.is_contiguous() and position.is_contiguous() and weight.is_contiguous() + n, nsample, c = position.shape; w_c = weight.shape[-1] + output = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.aggregation_forward_cuda(n, nsample, c, w_c, input, position, weight, idx, output) + ctx.save_for_backward(input, position, weight, idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, c) + output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight : (n, nsample, c') + """ + input, position, weight, idx = ctx.saved_tensors + n, nsample, c = position.shape; w_c = weight.shape[-1] + grad_input = torch.cuda.FloatTensor(n, c).zero_() + grad_position = torch.cuda.FloatTensor(n, nsample, c).zero_() + grad_weight = torch.cuda.FloatTensor(n, nsample, w_c).zero_() + pointops_cuda.aggregation_backward_cuda(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight) + return grad_input, grad_position, grad_weight, None + +aggregation = Aggregation.apply + + +def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3): + """ + input: xyz: (m, 3), new_xyz: (n, 3), feat: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, 3), (n, 3) + dist_recip = 1.0 / (dist + 1e-8) # (n, 3) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, 3) + + new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_() + for i in range(k): + new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1) + return new_feat + + +class 
Interpolation(Function): + @staticmethod + def forward(ctx, xyz, new_xyz, input, offset, new_offset, k=3): + """ + input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and input.is_contiguous() + idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, k), (n, k) + dist_recip = 1.0 / (dist + 1e-8) # (n, k) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, k) + + n, c, m = new_xyz.shape[0], input.shape[1], input.shape[0] + output = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.interpolation_forward_cuda(n, c, k, input, idx, weight, output) + ctx.m, ctx.k = m, k + ctx.save_for_backward(idx, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + m, k = ctx.m, ctx.k + idx, weight = ctx.saved_tensors + n, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(m, c).zero_() + pointops_cuda.interpolation_backward_cuda(n, c, k, grad_output, idx, weight, grad_input) + return None, None, grad_input, None, None, None + +interpolation2 = Interpolation.apply diff --git a/utils/pointops2/functions/pointops_ablation.py b/utils/pointops2/functions/pointops_ablation.py new file mode 100644 index 0000000..ac01326 --- /dev/null +++ b/utils/pointops2/functions/pointops_ablation.py @@ -0,0 +1,215 @@ +from typing import Tuple + +import torch +from torch.autograd import Function +import torch.nn as nn + +import pointops2_cuda as pointops_cuda + + +class FurthestSampling(Function): + @staticmethod + def forward(ctx, xyz, offset, new_offset): + """ + input: xyz: (n, 3), offset: (b), new_offset: (b) + output: idx: (m) + """ + assert xyz.is_contiguous() + n, b, n_max = xyz.shape[0], offset.shape[0], offset[0] + for i in range(1, b): + n_max = max(offset[i] - offset[i-1], n_max) + idx = torch.cuda.IntTensor(new_offset[b-1].item()).zero_() + tmp = torch.cuda.FloatTensor(n).fill_(1e10) + pointops_cuda.furthestsampling_cuda(b, n_max, xyz, offset, new_offset, tmp, idx) + del tmp + return idx + +furthestsampling = FurthestSampling.apply + + +class KNNQuery(Function): + @staticmethod + def forward(ctx, nsample, xyz, new_xyz, offset, new_offset): + """ + input: xyz: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) + output: idx: (m, nsample), dist2: (m, nsample) + """ + if new_xyz is None: new_xyz = xyz + assert xyz.is_contiguous() and new_xyz.is_contiguous() + m = new_xyz.shape[0] + idx = torch.cuda.IntTensor(m, nsample).zero_() + dist2 = torch.cuda.FloatTensor(m, nsample).zero_() + pointops_cuda.knnquery_cuda(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2) + return idx, torch.sqrt(dist2) + +knnquery = KNNQuery.apply + + +class Grouping(Function): + @staticmethod + def forward(ctx, input, idx): + """ + input: input: (n, c), idx : (m, nsample) + output: (m, nsample, c) + """ + assert input.is_contiguous() and idx.is_contiguous() + m, nsample, n, c = idx.shape[0], idx.shape[1], input.shape[0], input.shape[1] + output = torch.cuda.FloatTensor(m, nsample, c) + pointops_cuda.grouping_forward_cuda(m, nsample, c, input, idx, output) + ctx.n = n + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (m, c, nsample) + output: (n, c), None + """ + n = ctx.n + idx, = ctx.saved_tensors + m, nsample, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(n, 
c).zero_() + pointops_cuda.grouping_backward_cuda(m, nsample, c, grad_output, idx, grad_input) + return grad_input, None + +grouping = Grouping.apply + + +def queryandgroup(nsample, xyz, new_xyz, feat, idx, offset, new_offset, use_xyz=True, relative=True): + """ + input: xyz: (n, 3), new_xyz: (m, 3), feat: (n, c), idx: (m, nsample), offset: (b), new_offset: (b) + output: new_feat: (m, c+3, nsample), grouped_idx: (m, nsample) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + if new_xyz is None: + new_xyz = xyz + if idx is None: + idx, _ = knnquery(nsample, xyz, new_xyz, offset, new_offset) # (m, nsample) + + n, m, c = xyz.shape[0], new_xyz.shape[0], feat.shape[1] + grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3) # (m, nsample, 3) + #grouped_xyz = grouping(xyz, idx) # (m, nsample, 3) + if relative: + grouped_xyz -= new_xyz.unsqueeze(1) # (m, nsample, 3) + grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c) # (m, nsample, c) + #grouped_feat = grouping(feat, idx) # (m, nsample, c) + + if use_xyz: + return torch.cat((grouped_xyz, grouped_feat), -1) # (m, nsample, 3+c) + else: + return grouped_feat + + +class Subtraction(Function): + @staticmethod + def forward(ctx, input1, input2, idx): + """ + input: input1: (n, c), input2: (n, c), idx: (n, nsample) + output: (n, nsample, c) + """ + assert input1.is_contiguous() and input2.is_contiguous() + n, c = input1.shape; nsample = idx.shape[-1] + output = torch.cuda.FloatTensor(n, nsample, c).zero_() + pointops_cuda.subtraction_forward_cuda(n, nsample, c, input1, input2, idx, output) + ctx.save_for_backward(idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, nsample, c) + output: grad_input1: (n, c), grad_input2: (n, c) + """ + idx, = ctx.saved_tensors + n, nsample, c = grad_output.shape + grad_input1 = torch.cuda.FloatTensor(n, c).zero_() + grad_input2 = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.subtraction_backward_cuda(n, nsample, c, idx, grad_output, grad_input1, grad_input2) + return grad_input1, grad_input2, None + +subtraction = Subtraction.apply + + +class Aggregation(Function): + @staticmethod + def forward(ctx, input, position, weight, idx): + """ + input: input: (n, c), position: (n, nsample, c), weight : (n, nsample, c'), idx: (n, nsample) + output: (n, c) + """ + assert input.is_contiguous() and position.is_contiguous() and weight.is_contiguous() + n, nsample, c = position.shape; w_c = weight.shape[-1] + output = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.aggregation_forward_cuda(n, nsample, c, w_c, input, position, weight, idx, output) + ctx.save_for_backward(input, position, weight, idx) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: grad_out: (n, c) + output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight : (n, nsample, c') + """ + input, position, weight, idx = ctx.saved_tensors + n, nsample, c = position.shape; w_c = weight.shape[-1] + grad_input = torch.cuda.FloatTensor(n, c).zero_() + grad_position = torch.cuda.FloatTensor(n, nsample, c).zero_() + grad_weight = torch.cuda.FloatTensor(n, nsample, w_c).zero_() + pointops_cuda.aggregation_backward_cuda(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight) + return grad_input, grad_position, grad_weight, None + +aggregation = Aggregation.apply + + +def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3): + """ + input: xyz: (m, 3), new_xyz: (n, 
3), feat: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous() + idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, 3), (n, 3) + dist_recip = 1.0 / (dist + 1e-8) # (n, 3) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, 3) + + new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_() + for i in range(k): + new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1) + return new_feat + + +class Interpolation(Function): + @staticmethod + def forward(ctx, xyz, new_xyz, input, offset, new_offset, k=3): + """ + input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + assert xyz.is_contiguous() and new_xyz.is_contiguous() and input.is_contiguous() + idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, k), (n, k) + dist_recip = 1.0 / (dist + 1e-8) # (n, k) + norm = torch.sum(dist_recip, dim=1, keepdim=True) + weight = dist_recip / norm # (n, k) + + n, c, m = new_xyz.shape[0], input.shape[1], input.shape[0] + output = torch.cuda.FloatTensor(n, c).zero_() + pointops_cuda.interpolation_forward_cuda(n, c, k, input, idx, weight, output) + ctx.m, ctx.k = m, k + ctx.save_for_backward(idx, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b) + output: (n, c) + """ + m, k = ctx.m, ctx.k + idx, weight = ctx.saved_tensors + n, c = grad_output.shape + grad_input = torch.cuda.FloatTensor(m, c).zero_() + pointops_cuda.interpolation_backward_cuda(n, c, k, grad_output, idx, weight, grad_input) + return None, None, grad_input, None, None, None + +interpolation2 = Interpolation.apply diff --git a/utils/pointops2/functions/test_attention_op_step1.py b/utils/pointops2/functions/test_attention_op_step1.py new file mode 100644 index 0000000..6515fdc --- /dev/null +++ b/utils/pointops2/functions/test_attention_op_step1.py @@ -0,0 +1,78 @@ +import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 800000 +N = 35000 +C = 96 +h = 6 +query = torch.rand(N, h, C//h).cuda() +key = torch.rand(N, h, C//h).cuda() + +index_0 = torch.rand(M) +index_0[index_0 < 0] = 0 +index_0 = (index_0*N).long().cuda() + +index_1 = torch.rand(M) +index_1[index_1 < 0] = 0 +index_1 = (index_1*N).long().cuda() + +query.requires_grad = True +key.requires_grad = True + +# rearrange index for acceleration +index_0, indices = torch.sort(index_0) #[M,] +index_1 = index_1[indices] #[M,] +index_0_counts = index_0.bincount() + +print("index_0_counts.shape: ", index_0_counts.shape) + +n_max = index_0_counts.max() +index_0_offsets = index_0_counts.cumsum(dim=-1) #[N] + +print("v1 index_0_offsets.shape: ", index_0_offsets.shape) + +index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] + +# print("index_0[:100]: ", index_0[:100]) +print("n_max: ", n_max) +print("index_0_offsets.shape: ", index_0_offsets.shape) +# input() + +print("index_0_offsets[:100]: ", index_0_offsets[:100]) +print("index_1[300:320]: ", index_1[300:320]) + + +attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) +# loss = attn_flat.sum() +# loss.backward() +print("attn_flat.shape: {}, attn_flat[300:320,:10]: {}".format(attn_flat.shape, attn_flat[300:320,:10])) +# print("query.grad[:5, 
:3, :5]: ", query.grad[:5, :3, :5]) +# print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +# input() + +print("query.is_contiguous(): ", query.is_contiguous()) +print("key.is_contiguous(): ", key.is_contiguous()) +print("index_0.is_contiguous(): ", index_0.is_contiguous()) +print("index_1.is_contiguous(): ", index_1.is_contiguous()) + +attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) +# loss = attn_flat_v2.sum() +# loss.backward() +print("attn_flat_v2.shape: {}, attn_flat_v2[300:320,:10]: {}".format(attn_flat_v2.shape, attn_flat_v2[300:320,:10])) +# print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +# print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +# input() + +mask = attn_flat_v2.sum(-1) != 0 +print("mask.sum(): ", mask.sum()) +print("attn_flat_v2[mask] - attn_flat[mask]: ", ((attn_flat_v2[mask] - attn_flat[mask])**2).max()) + + +print("((attn_flat-attn_flat_v2)**2 < 1e-8).all(): ", ((attn_flat-attn_flat_v2)**2 < 1e-8).all()) + +selected = 10000 +print("torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0): ", torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0)) + diff --git a/utils/pointops2/functions/test_attention_op_step1_v2.py b/utils/pointops2/functions/test_attention_op_step1_v2.py new file mode 100644 index 0000000..b8bd582 --- /dev/null +++ b/utils/pointops2/functions/test_attention_op_step1_v2.py @@ -0,0 +1,97 @@ +import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 800000 +N = 35000 +C = 96 +h = 6 +query = torch.rand(N, h, C//h).cuda() +key = torch.rand(N, h, C//h).cuda() + +index_0 = torch.rand(M) +index_0[index_0 < 0] = 0 +index_0 = (index_0*N).long().cuda() + +index_1 = torch.rand(M) +index_1[index_1 < 0] = 0 +index_1 = (index_1*N).long().cuda() + +query.requires_grad = True +key.requires_grad = True + + +attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) +loss = attn_flat.sum() +loss.backward() +print("attn_flat.shape: {}, attn_flat[:20,:10]: {}".format(attn_flat.shape, attn_flat[:20,:10])) +print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +input() + + + +# rearrange index for acceleration +index_0, indices = torch.sort(index_0) #[M,] +index_1 = index_1[indices] #[M,] +index_0_counts = index_0.bincount() + +print("index_0_counts.shape: ", index_0_counts.shape) + +n_max = index_0_counts.max() +index_0_offsets = index_0_counts.cumsum(dim=-1) #[N] + +print("v1 index_0_offsets.shape: ", index_0_offsets.shape) + +index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] + +# print("index_0[:100]: ", index_0[:100]) +print("n_max: ", n_max) +print("index_0_offsets.shape: ", index_0_offsets.shape) +# input() + +print("index_0_offsets[:100]: ", index_0_offsets[:100]) +print("index_1[:20]: ", index_1[:20]) + + +attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) +# loss = attn_flat.sum() +# loss.backward() +# # attn_flat = pointops.attention_step1(query.float(), key.float(), index_0.int(), index_1.int()) +# # loss = attn_flat.sum() +# # loss.backward() +# print("attn_flat.shape: {}, attn_flat[:20,:10]: {}".format(attn_flat.shape, attn_flat[:20,:10])) +# print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +# print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +# input() + 
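+# NOTE: attention_step1 computes one scalar per (pair m, head h):
+#     attn[m, h] = sum_c query[index_0[m], h, c] * key[index_1[m], h, c]
+# The v2 variant consumes the same pairs in CSR form: index_0 is sorted, and
+# index_0_offsets[i]:index_0_offsets[i + 1] delimits the pairs whose query index is i.
+# A dense reference check (sketch, plain PyTorch) would be:
+#     attn_ref = (query[index_0] * key[index_1]).sum(-1)   # [M, h]
+#     assert torch.allclose(attn_flat, attn_ref, atol=1e-6)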
+print("query.is_contiguous(): ", query.is_contiguous()) +print("key.is_contiguous(): ", key.is_contiguous()) +print("index_0.is_contiguous(): ", index_0.is_contiguous()) +print("index_1.is_contiguous(): ", index_1.is_contiguous()) + +attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) +loss = attn_flat_v2.sum() +loss.backward() + +# attn_flat_v2 = pointops.attention_step1_v2(query.float(), key.float(), index_1.int(), index_0_offsets.int(), n_max) +# loss = attn_flat_v2.sum() +# loss.backward() + +print("attn_flat_v2.shape: {}, attn_flat_v2[:20,:10]: {}".format(attn_flat_v2.shape, attn_flat_v2[:20,:10])) +print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +# input() + +# mask = attn_flat_v2.sum(-1) != 0 +# print("mask.sum(): ", mask.sum()) +# print("attn_flat_v2[mask] - attn_flat[mask]: ", ((attn_flat_v2[mask] - attn_flat[mask])**2).max()) + + +print("((attn_flat-attn_flat_v2)**2 < 1e-8).all(): ", ((attn_flat-attn_flat_v2)**2 < 1e-8).all()) + +selected = 10000 +print("torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0): ", torch.max((attn_flat[:selected]-attn_flat_v2[:selected])**2, 0)) + diff --git a/utils/pointops2/functions/test_attention_op_step2.py b/utils/pointops2/functions/test_attention_op_step2.py new file mode 100644 index 0000000..2146d5a --- /dev/null +++ b/utils/pointops2/functions/test_attention_op_step2.py @@ -0,0 +1,55 @@ +import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 800000 +N = 35000 +C = 96 +h = 6 +softmax_attn_flat = torch.rand(M, h).cuda() +value = torch.rand(N, h, C//h).cuda() + +index_0 = torch.rand(M) +index_0[index_0 < 0] = 0 +index_0 = (index_0*N).long().cuda() + +index_1 = torch.rand(M) +index_1[index_1 < 0] = 0 +index_1 = (index_1*N).long().cuda() + +softmax_attn_flat.requires_grad = True +value.requires_grad = True + +# value_flat = value[index_1] #[M, num_heads, C // num_heads] +# x = (softmax_attn_flat.unsqueeze(-1) * value_flat).reshape(M, C) +# x = scatter_sum(src=x, index=index_0, dim=0, dim_size=N) #[N, C] +# loss = x.sum() +# loss.backward() + +# print("x.shape: {}, x[:5,:10]: {}".format(x.shape, x[:5,:10])) +# print("softmax_attn_flat.grad[:5, :10]: ", softmax_attn_flat.grad[:5, :10]) +# print("value.grad[:5, :3, :5]: ", value.grad[:5, :3, :5]) +# input() + +print("softmax_attn_flat.is_contiguous(): ", softmax_attn_flat.is_contiguous()) +print("value.is_contiguous(): ", value.is_contiguous()) +print("index_0.is_contiguous(): ", index_0.is_contiguous()) +print("index_1.is_contiguous(): ", index_1.is_contiguous()) + +x_v2 = pointops.attention_step2(softmax_attn_flat.float(), value.float(), index_0.int(), index_1.int()) +x_v2 = x_v2.view(N, C) +loss = x_v2.sum() +loss.backward() + +print("x_v2.shape: {}, x_v2[:5,:10]: {}".format(x_v2.shape, x_v2[:5,:10])) + +print("softmax_attn_flat.grad[:5, :10]: ", softmax_attn_flat.grad[:5, :10]) +print("value.grad[:5, :3, :5]: ", value.grad[:5, :3, :5]) +input() + +print("((x-x_v2)**2 < 1e-8).all(): ", ((x-x_v2)**2 < 1e-8).all()) + +print("torch.max((x-x_v2)**2): ", torch.max((x-x_v2)**2)) + diff --git a/utils/pointops2/functions/test_relative_pos_encoding_op_step1.py b/utils/pointops2/functions/test_relative_pos_encoding_op_step1.py new file mode 100644 index 0000000..2ff8b34 --- /dev/null +++ b/utils/pointops2/functions/test_relative_pos_encoding_op_step1.py @@ -0,0 +1,56 @@ 
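+# Sanity check for pointops.dot_prod_with_idx: for each of the M pairs, gather a query
+# vector via `index` and a relative positional encoding from `table` via `rel_index`
+# (one lookup per x/y/z component), then reduce over the channel dimension per head.
+# Equivalent dense computation (sketch, mirrors the commented-out reference below):
+#     rel_pe = table[..., 0][rel_index[:, 0]] + table[..., 1][rel_index[:, 1]] + table[..., 2][rel_index[:, 2]]
+#     out_ref = (query[index] * rel_pe).sum(-1)   # [M, h]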
+import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 80000 +N = 3500 +hdim = 16 +h = 6 +L = 31 +query = torch.rand(N, h, hdim).cuda() +table = torch.rand(L, h, hdim, 3).cuda() + +index = torch.rand(M) +index[index < 0] = 0 +index = (index*N).long().cuda() + +rel_index = torch.rand(M, 3) +rel_index[rel_index < 0] = 0 +rel_index = (rel_index*L).long().cuda() + +query.requires_grad = True +table.requires_grad = True + +# query_flat = query[index] #[M, h, hdim] +# table_x, table_y, table_z = table[:,:,:,0], table[:,:,:,1], table[:,:,:,2] #[L, h, hdim] +# rel_index_x, rel_index_y, rel_index_z = rel_index[:,0], rel_index[:,1], rel_index[:,2] #[M] +# rel_pos_encoding = table_x[rel_index_x] + table_y[rel_index_y] + table_z[rel_index_z] #[M, h, hdim] +# output = (query_flat * rel_pos_encoding).sum(-1) #[M, h] +# loss = output.mean() +# loss.backward() + +# print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10])) +# print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +# print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) +# input() + +# print("query.is_contiguous(): ", query.is_contiguous()) +# print("key.is_contiguous(): ", key.is_contiguous()) +# print("index_0.is_contiguous(): ", index_0.is_contiguous()) +# print("index_1.is_contiguous(): ", index_1.is_contiguous()) + +output_v2 = pointops.dot_prod_with_idx(query, index.int(), table, rel_index.int()) +loss = output_v2.mean() +loss.backward() + +print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10])) +print("v2: query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +print("v2: table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) +input() + +# print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) + +# print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2)) + diff --git a/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v2.py b/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v2.py new file mode 100644 index 0000000..3117274 --- /dev/null +++ b/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v2.py @@ -0,0 +1,64 @@ +import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 80000 +N = 3500 +hdim = 16 +h = 6 +L = 31 +query = torch.rand(N, h, hdim).cuda() +table_q = torch.rand(L, h, hdim, 3).cuda() +key = torch.rand(N, h, hdim).cuda() +table_k = torch.rand(L, h, hdim, 3).cuda() + +index_q = torch.rand(M) +index_q[index_q < 0] = 0 +index_q = (index_q*N).long().cuda() + +index_k = torch.rand(M) +index_k[index_k < 0] = 0 +index_k = (index_k*N).long().cuda() + +rel_index = torch.rand(M, 3) +rel_index[rel_index < 0] = 0 +rel_index = (rel_index*L).long().cuda() + +query.requires_grad = True +table_q.requires_grad = True +key.requires_grad = True +table_k.requires_grad = True + +output1 = pointops.dot_prod_with_idx(query, index_q.int(), table_q, rel_index.int()) +output2 = pointops.dot_prod_with_idx(key, index_k.int(), table_k, rel_index.int()) +output = output1 + output2 +# loss = output.mean() +# loss.backward() + +# print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10])) +# print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +# print("table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) +# print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +# 
print("table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) +# input() + +# print("query.is_contiguous(): ", query.is_contiguous()) +# print("key.is_contiguous(): ", key.is_contiguous()) +# print("index_0.is_contiguous(): ", index_0.is_contiguous()) +# print("index_1.is_contiguous(): ", index_1.is_contiguous()) + +output_v2 = pointops.dot_prod_with_idx_v2(query, index_q.int(), key, index_k.int(), table_q, table_k, rel_index.int()) +loss = output_v2.mean() +loss.backward() + +print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10])) +print("v2 query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +print("v2 table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) +print("v2 key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +print("v2 table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) +# input() + +print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) + diff --git a/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v3.py b/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v3.py new file mode 100644 index 0000000..ea68061 --- /dev/null +++ b/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v3.py @@ -0,0 +1,90 @@ +import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 80000 +N = 3500 +# M = 80 +# N = 5 +hdim = 16 +h = 6 +L = 31 +query = torch.rand(N, h, hdim).cuda() +table_q = torch.rand(L, h, hdim, 3).cuda() +key = torch.rand(N, h, hdim).cuda() +table_k = torch.rand(L, h, hdim, 3).cuda() + +index_q = torch.rand(M) +index_q[index_q < 0] = 0 +index_q = (index_q*N).long().cuda() + +index_k = torch.rand(M) +index_k[index_k < 0] = 0 +index_k = (index_k*N).long().cuda() + +rel_index = torch.rand(M, 3) +rel_index[rel_index < 0] = 0 +rel_index = (rel_index*L).long().cuda() + + +# rearrange index for acceleration +index_q, indices = torch.sort(index_q) #[M,] +index_k = index_k[indices] #[M,] +rel_index = rel_index[indices] +index_q_counts = index_q.bincount() + +print("index_q_counts.shape: ", index_q_counts.shape) + +n_max = index_q_counts.max() +index_q_offsets = index_q_counts.cumsum(dim=-1) #[N] + +print("v1 index_q_offsets.shape: ", index_q_offsets.shape) + +index_q_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_q_offsets], 0) #[N+1] + +# print("index_q[:100]: ", index_q[:100]) +print("n_max: ", n_max) +print("index_q_offsets.shape: ", index_q_offsets.shape) +# input() + +print("index_q_offsets[:100]: ", index_q_offsets[:100]) +print("index_k[:20]: ", index_k[:20]) + +query.requires_grad = True +table_q.requires_grad = True +key.requires_grad = True +table_k.requires_grad = True + +output1 = pointops.dot_prod_with_idx(query, index_q.int(), table_q, rel_index.int()) +output2 = pointops.dot_prod_with_idx(key, index_k.int(), table_k, rel_index.int()) +output = output1 + output2 +loss = output.mean() +loss.backward() + +# print("output.shape: {}, output[:5,:10]: {}".format(output.shape, output[:5,:10])) +# print("query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +# print("table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) +# print("key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +# print("table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) +# input() + +# print("query.is_contiguous(): ", query.is_contiguous()) +# print("key.is_contiguous(): ", key.is_contiguous()) +# print("index_q.is_contiguous(): ", index_q.is_contiguous()) +# 
print("index_k.is_contiguous(): ", index_k.is_contiguous()) + +output_v2 = pointops.dot_prod_with_idx_v3(query, index_q_offsets.int(), n_max, key, index_k.int(), table_q, table_k, rel_index.int()) +# loss = output_v2.mean() +# loss.backward() + +# print("output_v2.shape: {}, output_v2[:5,:10]: {}".format(output_v2.shape, output_v2[:5,:10])) +# print("v2 query.grad[:5, :3, :5]: ", query.grad[:5, :3, :5]) +# print("v2 table_q.grad[:5, :3, :5, :2]: ", table_q.grad[:5, :3, :5, :2]) +# print("v2 key.grad[:5, :3, :5]: ", key.grad[:5, :3, :5]) +# print("v2 table_k.grad[:5, :3, :5, :2]: ", table_k.grad[:5, :3, :5, :2]) +# input() + +print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) + diff --git a/utils/pointops2/functions/test_relative_pos_encoding_op_step2.py b/utils/pointops2/functions/test_relative_pos_encoding_op_step2.py new file mode 100644 index 0000000..eae7d9f --- /dev/null +++ b/utils/pointops2/functions/test_relative_pos_encoding_op_step2.py @@ -0,0 +1,66 @@ +import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 80000 +N = 3500 +hdim = 16 +h = 6 +L = 31 +attn = torch.rand(M, h).cuda() +v = torch.rand(N, h, hdim).cuda() +table = torch.rand(L, h, hdim, 3).cuda() + +index_0 = torch.rand(M) +index_0[index_0 < 0] = 0 +index_0 = (index_0*N).long().cuda() + +index_1 = torch.rand(M) +index_1[index_1 < 0] = 0 +index_1 = (index_1*N).long().cuda() + +rel_index = torch.rand(M, 3) +rel_index[rel_index < 0] = 0 +rel_index = (rel_index*L).long().cuda() + +attn.requires_grad = True +v.requires_grad = True +table.requires_grad = True + +v_flat = v[index_1] #[M, h, hdim] +table_x, table_y, table_z = table[:,:,:,0], table[:,:,:,1], table[:,:,:,2] #[L, h, hdim] +rel_index_x, rel_index_y, rel_index_z = rel_index[:,0], rel_index[:,1], rel_index[:,2] #[M] +rel_pos_encoding = table_x[rel_index_x] + table_y[rel_index_y] + table_z[rel_index_z] #[M, h, hdim] +v_flat_new = v_flat + rel_pos_encoding #[M, h, hdim] +output = attn.unsqueeze(-1) * v_flat_new #[M, h, hdim] +output = scatter_sum(src=output, index=index_0, dim=0, dim_size=N) #[N, h, hdim] +loss = output.mean() +loss.backward() + +print("output.shape: {}, output[:5,:10,:5]: {}".format(output.shape, output[:5,:10, :5])) +print("attn.grad[:5, :3]: ", attn.grad[:5, :3]) +print("v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) +print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) +input() + +# print("query.is_contiguous(): ", query.is_contiguous()) +# print("key.is_contiguous(): ", key.is_contiguous()) +# print("index_0.is_contiguous(): ", index_0.is_contiguous()) +# print("index_1.is_contiguous(): ", index_1.is_contiguous()) + +# output_v2 = pointops.attention_step2_with_rel_pos_value(attn, v, index_0.int(), index_1.int(), table, rel_index.int()) +# loss = output_v2.mean() +# loss.backward() + +# print("output_v2.shape: {}, output_v2[:5,:10,:5]: {}".format(output_v2.shape, output_v2[:5,:10,:5])) +# print("v2 attn.grad[:5, :3]: ", attn.grad[:5, :3]) +# print("v2 v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) +# print("v2 table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) +# input() + +# print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) + +# print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2)) + diff --git a/utils/pointops2/functions/test_relative_pos_encoding_op_step2_v2.py b/utils/pointops2/functions/test_relative_pos_encoding_op_step2_v2.py new file mode 100644 index 
0000000..9c2ee11 --- /dev/null +++ b/utils/pointops2/functions/test_relative_pos_encoding_op_step2_v2.py @@ -0,0 +1,92 @@ +import torch +import pointops +from torch_scatter import scatter_max, scatter_mean, scatter_add, scatter_min, scatter_sum + +torch.manual_seed(1) + +M = 80000 +N = 3500 +hdim = 16 +h = 6 +L = 31 +attn = torch.rand(M, h).cuda() +v = torch.rand(N, h, hdim).cuda() +table = torch.rand(L, h, hdim, 3).cuda() + +index_0 = torch.rand(M) +index_0[index_0 < 0] = 0 +index_0 = (index_0*N).long().cuda() + +index_1 = torch.rand(M) +index_1[index_1 < 0] = 0 +index_1 = (index_1*N).long().cuda() + +rel_index = torch.rand(M, 3) +rel_index[rel_index < 0] = 0 +rel_index = (rel_index*L).long().cuda() + + +# rearrange index for acceleration +index_0, indices = torch.sort(index_0) #[M,] +index_1 = index_1[indices] #[M,] +rel_index = rel_index[indices] +index_0_counts = index_0.bincount() + +print("index_0_counts.shape: ", index_0_counts.shape) + +n_max = index_0_counts.max() +index_0_offsets = index_0_counts.cumsum(dim=-1) #[N] + +print("v1 index_0_offsets.shape: ", index_0_offsets.shape) + +index_0_offsets = torch.cat([torch.zeros(1, dtype=torch.long).cuda(), index_0_offsets], 0) #[N+1] + + +attn.requires_grad = True +v.requires_grad = True +table.requires_grad = True + + +output = pointops.attention_step2_with_rel_pos_value(attn, v, index_0.int(), index_1.int(), table, rel_index.int()) +loss = output.mean() +loss.backward() + +print("output.shape: {}, output[:5,:10,:5]: {}".format(output.shape, output[:5,:10, :5])) +print("attn.grad[:5, :3]: ", attn.grad[:5, :3]) +print("v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) +print("table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) +# input() + +attn_grad = attn.grad.clone() +v_grad = v.grad.clone() +table_grad = table.grad.clone() + +attn.grad.zero_() +v.grad.zero_() +table.grad.zero_() + +# print("query.is_contiguous(): ", query.is_contiguous()) +# print("key.is_contiguous(): ", key.is_contiguous()) +# print("index_0.is_contiguous(): ", index_0.is_contiguous()) +# print("index_1.is_contiguous(): ", index_1.is_contiguous()) + +output_v2 = pointops.attention_step2_with_rel_pos_value_v2(attn, v, index_0_offsets.int(), n_max, index_1.int(), table, rel_index.int()) +loss = output_v2.mean() +loss.backward() + +print("output_v2.shape: {}, output_v2[:5,:10,:5]: {}".format(output_v2.shape, output_v2[:5,:10,:5])) +print("v2 attn.grad[:5, :3]: ", attn.grad[:5, :3]) +print("v2 v.grad[:5, :3, :5]: ", v.grad[:5, :3, :5]) +print("v2 table.grad[:5, :3, :5, :2]: ", table.grad[:5, :3, :5, :2]) +# input() + +print("((output-output_v2)**2).max(): ", ((output-output_v2)**2).max()) + +print("((attn_grad-attn.grad)**2).max(): ", ((attn_grad-attn.grad)**2).max()) + +print("((v_grad-v.grad)**2).max(): ", ((v_grad-v.grad)**2).max()) + +print("((table_grad-table.grad)**2).max(): ", ((table_grad-table.grad)**2).max()) + +# print("torch.max((attn_flat-attn_flat_v2)**2): ", torch.max((attn_flat-attn_flat_v2)**2)) + diff --git a/utils/pointops2/setup.py b/utils/pointops2/setup.py new file mode 100644 index 0000000..03f5d6e --- /dev/null +++ b/utils/pointops2/setup.py @@ -0,0 +1,42 @@ +#python3 setup.py install +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import os +from distutils.sysconfig import get_config_vars + +(opt,) = get_config_vars('OPT') +os.environ['OPT'] = " ".join( + flag for flag in opt.split() if flag != '-Wstrict-prototypes' +) + +setup( + name='pointops2', + ext_modules=[ + 
CUDAExtension('pointops2_cuda', [ + 'src/pointops_api.cpp', + 'src/knnquery/knnquery_cuda.cpp', + 'src/knnquery/knnquery_cuda_kernel.cu', + 'src/sampling/sampling_cuda.cpp', + 'src/sampling/sampling_cuda_kernel.cu', + 'src/grouping/grouping_cuda.cpp', + 'src/grouping/grouping_cuda_kernel.cu', + 'src/interpolation/interpolation_cuda.cpp', + 'src/interpolation/interpolation_cuda_kernel.cu', + 'src/subtraction/subtraction_cuda.cpp', + 'src/subtraction/subtraction_cuda_kernel.cu', + 'src/aggregation/aggregation_cuda.cpp', + 'src/aggregation/aggregation_cuda_kernel.cu', + 'src/attention/attention_cuda.cpp', + 'src/attention/attention_cuda_kernel.cu', + 'src/rpe/relative_pos_encoding_cuda.cpp', + 'src/rpe/relative_pos_encoding_cuda_kernel.cu', + 'src/attention_v2/attention_cuda_v2.cpp', + 'src/attention_v2/attention_cuda_kernel_v2.cu', + 'src/rpe_v2/relative_pos_encoding_cuda_v2.cpp', + 'src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu', + ], + extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']} + ) + ], + cmdclass={'build_ext': BuildExtension} +) diff --git a/utils/pointops2/src/__init__.py b/utils/pointops2/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/pointops2/src/aggregation/aggregation_cuda.cpp b/utils/pointops2/src/aggregation/aggregation_cuda.cpp new file mode 100644 index 0000000..d5ad9cd --- /dev/null +++ b/utils/pointops2/src/aggregation/aggregation_cuda.cpp @@ -0,0 +1,29 @@ +#include +#include +#include +#include +#include "aggregation_cuda_kernel.h" + + +void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor) +{ + const float *input = input_tensor.data_ptr(); + const float *position = position_tensor.data_ptr(); + const float *weight = weight_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + aggregation_forward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, output); +} + +void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor) +{ + const float *input = input_tensor.data_ptr(); + const float *position = position_tensor.data_ptr(); + const float *weight = weight_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + const float *grad_output = grad_output_tensor.data_ptr(); + float *grad_input = grad_input_tensor.data_ptr(); + float *grad_position = grad_position_tensor.data_ptr(); + float *grad_weight = grad_weight_tensor.data_ptr(); + aggregation_backward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight); +} diff --git a/utils/pointops2/src/aggregation/aggregation_cuda_kernel.cu b/utils/pointops2/src/aggregation/aggregation_cuda_kernel.cu new file mode 100644 index 0000000..8339bb7 --- /dev/null +++ b/utils/pointops2/src/aggregation/aggregation_cuda_kernel.cu @@ -0,0 +1,53 @@ +#include "../cuda_utils.h" +#include "aggregation_cuda_kernel.h" + + +__global__ void aggregation_forward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) { + // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, 
c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * c) return; + const int c_idx = index % c; + const int n_idx = index / c; + const int w_c_idx = c_idx % w_c; + for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++) + { + int idx_idx = n_idx * nsample + nsample_idx; + int input_idx = idx[idx_idx] * c + c_idx; + int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx; + int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx; + output[index] += (input[input_idx] + position[position_idx]) * weight[weight_idx]; + } +} + +__global__ void aggregation_backward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) { + // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * c) return; + const int c_idx = index % c; + const int n_idx = index / c; + const int w_c_idx = c_idx % w_c; + for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++) + { + int idx_idx = n_idx * nsample + nsample_idx; + int input_idx = idx[idx_idx] * c + c_idx; + int position_idx = n_idx * nsample * c + nsample_idx * c + c_idx; + int weight_idx = n_idx * nsample * w_c + nsample_idx * w_c + w_c_idx; + atomicAdd(grad_input + input_idx, grad_output[index] * weight[weight_idx]); + grad_position[position_idx] = grad_output[index] * weight[weight_idx]; + atomicAdd(grad_weight + weight_idx, grad_output[index] * (input[input_idx] + position[position_idx])); + } +} + +void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) { + // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c) + dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + aggregation_forward_cuda_kernel<<>>(n, nsample, c, w_c, input, position, weight, idx, output); +} + +void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) { + // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c) + dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + aggregation_backward_cuda_kernel<<>>(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight); +} diff --git a/utils/pointops2/src/aggregation/aggregation_cuda_kernel.h b/utils/pointops2/src/aggregation/aggregation_cuda_kernel.h new file mode 100644 index 0000000..5211a96 --- /dev/null +++ b/utils/pointops2/src/aggregation/aggregation_cuda_kernel.h @@ -0,0 +1,20 @@ +#ifndef _AGGREGATION_CUDA_KERNEL +#define _AGGREGATION_CUDA_KERNEL +#include +#include +#include + +void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor); +void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor 
grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output);
+void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/utils/pointops2/src/attention/attention_cuda.cpp b/utils/pointops2/src/attention/attention_cuda.cpp
new file mode 100644
index 0000000..8d2c725
--- /dev/null
+++ b/utils/pointops2/src/attention/attention_cuda.cpp
@@ -0,0 +1,56 @@
+#include
+#include
+#include
+#include
+#include "attention_cuda_kernel.h"
+
+void attention_step1_forward_cuda(int N, int M, int h, int C, at::Tensor q_tensor, at::Tensor k_tensor,
+    at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor)
+{
+    const float *q = q_tensor.data_ptr<float>();
+    const float *k = k_tensor.data_ptr<float>();
+    const int *index0 = index0_tensor.data_ptr<int>();
+    const int *index1 = index1_tensor.data_ptr<int>();
+    float *attn = attn_tensor.data_ptr<float>();
+    attention_step1_forward_cuda_launcher(N, M, h, C, q, k, index0, index1, attn);
+}
+
+void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor,
+    at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor,
+    at::Tensor grad_q_tensor, at::Tensor grad_k_tensor)
+{
+    const float *grad_out = grad_out_tensor.data_ptr<float>();
+    const int *index0 = index0_tensor.data_ptr<int>();
+    const int *index1 = index1_tensor.data_ptr<int>();
+    const float *q = q_tensor.data_ptr<float>();
+    const float *k = k_tensor.data_ptr<float>();
+    float *grad_q = grad_q_tensor.data_ptr<float>();
+    float *grad_k = grad_k_tensor.data_ptr<float>();
+    attention_step1_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k);
+}
+
+void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor,
+    at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor)
+{
+    const float *attn = attn_tensor.data_ptr<float>();
+    const float *v = v_tensor.data_ptr<float>();
+    const int *index0 = index0_tensor.data_ptr<int>();
+    const int *index1 = index1_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    attention_step2_forward_cuda_launcher(N, M, h, C, attn, v, index0, index1, output);
+}
+
+
+void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor,
+    at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor,
+    at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor)
+{
+    const float *grad_out = grad_out_tensor.data_ptr<float>();
+    const int *index0 = index0_tensor.data_ptr<int>();
+    const int *index1 = index1_tensor.data_ptr<int>();
+    const float *attn = attn_tensor.data_ptr<float>();
+    const float *v = v_tensor.data_ptr<float>();
+    float *grad_attn = grad_attn_tensor.data_ptr<float>();
+    float *grad_v = grad_v_tensor.data_ptr<float>();
+    attention_step2_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v);
+}
diff --git a/utils/pointops2/src/attention/attention_cuda_kernel.cu b/utils/pointops2/src/attention/attention_cuda_kernel.cu
new file mode 100644
index 0000000..f71ad62
--- /dev/null
+++ b/utils/pointops2/src/attention/attention_cuda_kernel.cu
@@ -0,0 +1,103 @@
+#include 
"../cuda_utils.h" +#include "attention_cuda_kernel.h" + + +__global__ void attention_step1_forward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *q, const float *k, + const int *index0, const int *index1, float *attn) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + float val = q[idx0*C+h_idx*C/h+c_idx] * k[idx1*C+h_idx*C/h+c_idx]; + atomicAdd(attn+m_idx*h+h_idx, val); +} + +__global__ void attention_step1_backward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, + float *grad_q, float *grad_k) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + int grad_out_idx = m_idx*h+h_idx; + int q_idx = idx0*C+h_idx*C/h+c_idx; + int k_idx = idx1*C+h_idx*C/h+c_idx; + atomicAdd(grad_q+q_idx, grad_out[grad_out_idx] * k[k_idx]); + atomicAdd(grad_k+k_idx, grad_out[grad_out_idx] * q[q_idx]); +} + +void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, + const int *index0, const int *index1, float *attn) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step1_forward_cuda_kernel<<>>(N, M, h, C, q, k, index0, index1, attn); +} + +void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, + const float *q, const float *k, float *grad_q, float *grad_k) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step1_backward_cuda_kernel<<>>(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k); +} + +__global__ void attention_step2_forward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx1 = index1[m_idx]; + float val = attn[m_idx*h+h_idx] * v[idx1*C+h_idx*C/h+c_idx]; + int idx0 = index0[m_idx]; + atomicAdd(output+idx0*C+h_idx*C/h+c_idx, val); +} + +__global__ void attention_step2_backward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, + float *grad_attn, float *grad_v) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + int grad_out_idx = idx0*C+h_idx*C/h+c_idx; + atomicAdd(grad_attn+m_idx*h+h_idx, grad_out[grad_out_idx] * v[idx1*C+h_idx*C/h+c_idx]); + atomicAdd(grad_v+idx1*C+h_idx*C/h+c_idx, grad_out[grad_out_idx] * attn[m_idx*h+h_idx]); +} + +void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, 
const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_forward_cuda_kernel<<>>(N, M, h, C, attn, v, index0, index1, output); +} + +void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, + const float *attn, const float *v, float *grad_attn, float *grad_v) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_backward_cuda_kernel<<>>(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); +} diff --git a/utils/pointops2/src/attention/attention_cuda_kernel.h b/utils/pointops2/src/attention/attention_cuda_kernel.h new file mode 100644 index 0000000..cbd99b9 --- /dev/null +++ b/utils/pointops2/src/attention/attention_cuda_kernel.h @@ -0,0 +1,26 @@ +#ifndef _ATTENTION_CUDA_KERNEL +#define _ATTENTION_CUDA_KERNEL +#include +#include +#include + +void attention_step1_forward_cuda(int N, int M, int h, int C, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor); +void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor); + +void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor); +void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, const int *index0, const int *index1, float *attn); +void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k); + +void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output); +void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.cu b/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.cu new file mode 100644 index 0000000..2e5343f --- /dev/null +++ b/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.cu @@ -0,0 +1,193 @@ +#include "../cuda_utils.h" +#include "attention_cuda_kernel_v2.h" + + +template +__global__ void attention_step1_forward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, const float *q, const float *k, + const int *index0_offsets, const int *index1, float *attn) { + + int h_idx = blockIdx.y; + int q_idx = 
blockIdx.x; + int n_idx = threadIdx.x; + int C = h * d; + // if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + __shared__ float query_vec[d]; + __shared__ int start, end; + + // if(n_idx == 0){ + // printf("blockDim.x: %d\n", blockDim.x); + // } + + if (n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + // printf("start: %d, end: %d, blockDim.x: %d\n", start, end, blockDim.x); + } + for(int i = n_idx; i < d; i += blockDim.x) + query_vec[i] = q[q_idx*C + h_idx*d + i]; + + __syncthreads(); + + int m_idx = start + n_idx; + if(m_idx >= end) + return; + + float sum = 0; + for(int i = 0; i < d; i++){ + int k_idx = index1[m_idx]; + float key = k[k_idx * C + h_idx * d + i]; + sum += query_vec[i] * key; + } + attn[m_idx*h + h_idx] = sum; + // int idx0 = index0[m_idx]; + // int idx1 = index1[m_idx]; + // float val = q[idx0*C+h_idx*C/h+c_idx] * k[idx1*C+h_idx*C/h+c_idx]; + // atomicAdd(attn+m_idx*h+h_idx, val); +} + +template +__global__ void attention_step1_backward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, + float *grad_q, float *grad_k) { + + int h_idx = blockIdx.y; + int q_idx = blockIdx.x; + int n_idx = threadIdx.x; + int C = d * h; + + __shared__ float query_vec[d]; + __shared__ int start, end; + + if (n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + } + for(int i = n_idx; i < d; i += blockDim.x) + query_vec[i] = q[q_idx*C + h_idx*d + i]; + + __shared__ float gradient_new[d]; + for(int i = n_idx; i < d; i += blockDim.x) + gradient_new[i] = 0; + + __syncthreads(); + + int m_idx = start + n_idx; + if(m_idx < end){ + float gradient = grad_out[m_idx*h + h_idx]; + for(int i = 0; i < d; i++){ + int k_idx = index1[m_idx]; + atomicAdd(&gradient_new[i], gradient * k[k_idx*C + h_idx*d + i]); + atomicAdd(grad_k + k_idx*C + h_idx*d + i, gradient * query_vec[i]); + } + } + __syncthreads(); + + for(int i = n_idx; i < d; i += blockDim.x) + grad_q[q_idx*C + h_idx*d + i] = gradient_new[i]; +} + +void attention_step1_forward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, + const float *q, const float *k, const int *index0_offsets, const int *index1, float *attn) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + // n_threads = n_threads > 1024 ? 
512 : n_threads; + + // printf("n_max: %d, n_threads: %d\n", n_max, n_threads); + + // dim3 threads(THREADS_PER_BLOCK); + // attention_step1_forward_cuda_kernel_v2<<>>(N, M, h, C, q, k, index0, index1, attn); + + switch (C / h) { + case 16: + attention_step1_forward_cuda_kernel_v2<16><<>>(N, M, h, q, k, index0_offsets, index1, attn); + break; + case 32: + attention_step1_forward_cuda_kernel_v2<32><<>>(N, M, h, q, k, index0_offsets, index1, attn); + break; + default: + throw "d != 16 and d != 32"; + } +} + +void attention_step1_backward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, + const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + // dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + // dim3 threads(THREADS_PER_BLOCK); + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + // attention_step1_backward_cuda_kernel_v2<<>>(N, M, h, C/h, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); + + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + // n_threads = n_threads > 1024 ? 512 : n_threads; + + // printf("n_max: %d, n_threads: %d\n", n_max, n_threads); + + switch (C / h) { + case 16: + attention_step1_backward_cuda_kernel_v2<16><<>>(N, M, h, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); + break; + case 32: + attention_step1_backward_cuda_kernel_v2<32><<>>(N, M, h, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); + break; + default: + throw "d != 16 and d != 32"; + } + +} + +__global__ void attention_step2_forward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, int C, const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx1 = index1[m_idx]; + float val = attn[m_idx*h+h_idx] * v[idx1*C+h_idx*C/h+c_idx]; + int idx0 = index0[m_idx]; + atomicAdd(output+idx0*C+h_idx*C/h+c_idx, val); +} + +__global__ void attention_step2_backward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, + float *grad_attn, float *grad_v) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + int grad_out_idx = idx0*C+h_idx*C/h+c_idx; + atomicAdd(grad_attn+m_idx*h+h_idx, grad_out[grad_out_idx] * v[idx1*C+h_idx*C/h+c_idx]); + atomicAdd(grad_v+idx1*C+h_idx*C/h+c_idx, grad_out[grad_out_idx] * attn[m_idx*h+h_idx]); +} + +void attention_step2_forward_cuda_launcher_v2(int N, int M, int h, int C, const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_forward_cuda_kernel_v2<<>>(N, M, h, C, attn, v, index0, index1, output); +} + +void attention_step2_backward_cuda_launcher_v2(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, + const float 
*attn, const float *v, float *grad_attn, float *grad_v) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_backward_cuda_kernel_v2<<>>(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); +} diff --git a/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.h b/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.h new file mode 100644 index 0000000..d7e7f04 --- /dev/null +++ b/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.h @@ -0,0 +1,26 @@ +#ifndef _ATTENTION_V2_CUDA_KERNEL +#define _ATTENTION_V2_CUDA_KERNEL +#include +#include +#include + +void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor); +void attention_step1_backward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor); + +void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor); +void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void attention_step1_forward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *q, const float *k, const int *index0_offsets, const int *index1, float *attn); +void attention_step1_backward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k); + +void attention_step2_forward_cuda_launcher_v2(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output); +void attention_step2_backward_cuda_launcher_v2(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/attention_v2/attention_cuda_v2.cpp b/utils/pointops2/src/attention_v2/attention_cuda_v2.cpp new file mode 100644 index 0000000..311adaf --- /dev/null +++ b/utils/pointops2/src/attention_v2/attention_cuda_v2.cpp @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include "attention_cuda_kernel_v2.h" + +void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, + at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor) +{ + const float *q = q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index0_offsets = index0_tensor_offsets.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + float *attn = attn_tensor.data_ptr(); + attention_step1_forward_cuda_launcher_v2(N, M, h, C, n_max, q, k, index0_offsets, index1, attn); +} + +void attention_step1_backward_cuda_v2(int N, 
int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, + at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, + at::Tensor grad_q_tensor, at::Tensor grad_k_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0_offsets = index0_tensor_offsets.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_k = grad_k_tensor.data_ptr(); + attention_step1_backward_cuda_launcher_v2(N, M, h, C, n_max, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); +} + +void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor) +{ + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + attention_step2_forward_cuda_launcher_v2(N, M, h, C, attn, v, index0, index1, output); +} + + +void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + float *grad_attn = grad_attn_tensor.data_ptr(); + float *grad_v = grad_v_tensor.data_ptr(); + attention_step2_backward_cuda_launcher_v2(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); +} diff --git a/utils/pointops2/src/cuda_utils.h b/utils/pointops2/src/cuda_utils.h new file mode 100644 index 0000000..e67749c --- /dev/null +++ b/utils/pointops2/src/cuda_utils.h @@ -0,0 +1,23 @@ +#ifndef _CUDA_UTILS_H +#define _CUDA_UTILS_H + +#include +#include + +#define TOTAL_THREADS 1024 +#define THREADS_PER_BLOCK 256 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + return std::max(std::min(1 << pow_2, TOTAL_THREADS), 1); +} + +inline dim3 opt_block_config(int x, int y) { + const int x_threads = opt_n_threads(x); + const int y_threads = std::max(std::min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); + dim3 block_config(x_threads, y_threads, 1); + return block_config; +} + +#endif diff --git a/utils/pointops2/src/grouping/grouping_cuda.cpp b/utils/pointops2/src/grouping/grouping_cuda.cpp new file mode 100644 index 0000000..a00d313 --- /dev/null +++ b/utils/pointops2/src/grouping/grouping_cuda.cpp @@ -0,0 +1,22 @@ +#include +#include +#include +#include +#include "grouping_cuda_kernel.h" + + +void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor) +{ + const float *input = input_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + grouping_forward_cuda_launcher(m, nsample, c, input, idx, output); +} + +void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor) +{ + const float *grad_output = 
grad_output_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + float *grad_input = grad_input_tensor.data_ptr(); + grouping_backward_cuda_launcher(m, nsample, c, grad_output, idx, grad_input); +} diff --git a/utils/pointops2/src/grouping/grouping_cuda_kernel.cu b/utils/pointops2/src/grouping/grouping_cuda_kernel.cu new file mode 100644 index 0000000..58ec0a2 --- /dev/null +++ b/utils/pointops2/src/grouping/grouping_cuda_kernel.cu @@ -0,0 +1,40 @@ +#include "../cuda_utils.h" +#include "grouping_cuda_kernel.h" + + +__global__ void grouping_forward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ input, const int *__restrict__ idx, float *__restrict__ output) { + // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= m * nsample * c) return; + const int c_idx = index % c; + const int nsample_idx = (index / c) % nsample; + const int m_idx = index / nsample / c; + const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; + output[index] = input[input_idx]; +} + +__global__ void grouping_backward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ grad_output, const int *__restrict__ idx, float *__restrict__ grad_input) { + // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= m * nsample * c) return; + const int c_idx = index % c; + const int nsample_idx = (index / c) % nsample; + const int m_idx = index / nsample / c; + const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; + atomicAdd(grad_input + input_idx, grad_output[index]); +} + +void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output) { + // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) + dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + grouping_forward_cuda_kernel<<>>(m, nsample, c, input, idx, output); +} + +void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input) +{ + // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) + dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + grouping_backward_cuda_kernel<<>>(m, nsample, c, grad_output, idx, grad_input); +} diff --git a/utils/pointops2/src/grouping/grouping_cuda_kernel.h b/utils/pointops2/src/grouping/grouping_cuda_kernel.h new file mode 100644 index 0000000..3db4aaa --- /dev/null +++ b/utils/pointops2/src/grouping/grouping_cuda_kernel.h @@ -0,0 +1,20 @@ +#ifndef _GROUPING_CUDA_KERNEL +#define _GROUPING_CUDA_KERNEL +#include +#include +#include + +void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor); +void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output); +void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/interpolation/interpolation_cuda.cpp b/utils/pointops2/src/interpolation/interpolation_cuda.cpp new file mode 100644 
index 0000000..a73c02b --- /dev/null +++ b/utils/pointops2/src/interpolation/interpolation_cuda.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include +#include "interpolation_cuda_kernel.h" + + +void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor) +{ + const float *input = input_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + const float *weight = weight_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + interpolation_forward_cuda_launcher(n, c, k, input, idx, weight, output); +} + +void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor) +{ + const float *grad_output = grad_output_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + const float *weight = weight_tensor.data_ptr(); + float *grad_input = grad_input_tensor.data_ptr(); + interpolation_backward_cuda_launcher(n, c, k, grad_output, idx, weight, grad_input); +} diff --git a/utils/pointops2/src/interpolation/interpolation_cuda_kernel.cu b/utils/pointops2/src/interpolation/interpolation_cuda_kernel.cu new file mode 100644 index 0000000..f560d8c --- /dev/null +++ b/utils/pointops2/src/interpolation/interpolation_cuda_kernel.cu @@ -0,0 +1,47 @@ +#include "../cuda_utils.h" +#include "interpolation_cuda_kernel.h" + + +__global__ void interpolation_forward_cuda_kernel(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) +{ + // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * c) return; + int c_idx = index % c; + int n_idx = index / c; + for (int i = 0; i < k; i++) + { + int idx_idx = n_idx * k + i; + int input_idx = idx[idx_idx] * c + c_idx; + output[index] += input[input_idx] * weight[idx_idx]; + } +} + +__global__ void interpolation_backward_cuda_kernel(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) +{ + // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * c) return; + int c_idx = index % c; + int n_idx = index / c; + for (int i = 0; i < k; i++) + { + int idx_idx = n_idx * k + i; + int input_idx = idx[idx_idx] * c + c_idx; + atomicAdd(grad_input + input_idx, grad_output[index] * weight[idx_idx]); + } +} + +void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) { + // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c) + dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + interpolation_forward_cuda_kernel<<>>(n, c, k, input, idx, weight, output); +} + +void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) { + // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c) + dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + interpolation_backward_cuda_kernel<<>>(n, c, k, grad_output, idx, weight, grad_input); +} diff --git a/utils/pointops2/src/interpolation/interpolation_cuda_kernel.h b/utils/pointops2/src/interpolation/interpolation_cuda_kernel.h new file mode 100644 index 0000000..309e5dd --- /dev/null +++ 
b/utils/pointops2/src/interpolation/interpolation_cuda_kernel.h @@ -0,0 +1,20 @@ +#ifndef _INTERPOLATION_CUDA_KERNEL +#define _INTERPOLATION_CUDA_KERNEL +#include +#include +#include + +void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor); +void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output); +void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/knnquery/knnquery_cuda.cpp b/utils/pointops2/src/knnquery/knnquery_cuda.cpp new file mode 100644 index 0000000..568f136 --- /dev/null +++ b/utils/pointops2/src/knnquery/knnquery_cuda.cpp @@ -0,0 +1,17 @@ +#include +#include +#include +#include +#include "knnquery_cuda_kernel.h" + + +void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor) +{ + const float *xyz = xyz_tensor.data_ptr(); + const float *new_xyz = new_xyz_tensor.data_ptr(); + const int *offset = offset_tensor.data_ptr(); + const int *new_offset = new_offset_tensor.data_ptr(); + int *idx = idx_tensor.data_ptr(); + float *dist2 = dist2_tensor.data_ptr(); + knnquery_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2); +} diff --git a/utils/pointops2/src/knnquery/knnquery_cuda_kernel.cu b/utils/pointops2/src/knnquery/knnquery_cuda_kernel.cu new file mode 100644 index 0000000..83762bc --- /dev/null +++ b/utils/pointops2/src/knnquery/knnquery_cuda_kernel.cu @@ -0,0 +1,116 @@ +#include "../cuda_utils.h" +#include "knnquery_cuda_kernel.h" + + +__device__ void swap_float(float *x, float *y) +{ + float tmp = *x; + *x = *y; + *y = tmp; +} + + +__device__ void swap_int(int *x, int *y) +{ + int tmp = *x; + *x = *y; + *y = tmp; +} + + +__device__ void reheap(float *dist, int *idx, int k) +{ + int root = 0; + int child = root * 2 + 1; + while (child < k) + { + if(child + 1 < k && dist[child+1] > dist[child]) + child++; + if(dist[root] > dist[child]) + return; + swap_float(&dist[root], &dist[child]); + swap_int(&idx[root], &idx[child]); + root = child; + child = root * 2 + 1; + } +} + + +__device__ void heap_sort(float *dist, int *idx, int k) +{ + int i; + for (i = k - 1; i > 0; i--) + { + swap_float(&dist[0], &dist[i]); + swap_int(&idx[0], &idx[i]); + reheap(dist, idx, i); + } +} + + +__device__ int get_bt_idx(int idx, const int *offset) +{ + int i = 0; + while (1) + { + if (idx < offset[i]) + break; + else + i++; + } + return i; +} + + +__global__ void knnquery_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, const int *__restrict__ new_offset, int *__restrict__ idx, float *__restrict__ dist2) { + // input: xyz (n, 3) new_xyz (m, 3) + // output: idx (m, nsample) dist2 (m, nsample) + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (pt_idx >= m) return; + + new_xyz += pt_idx * 3; + idx += pt_idx * nsample; + dist2 += pt_idx * nsample; + int bt_idx = get_bt_idx(pt_idx, new_offset); + int start; + if (bt_idx == 
0) + start = 0; + else + start = offset[bt_idx - 1]; + int end = offset[bt_idx]; + + float new_x = new_xyz[0]; + float new_y = new_xyz[1]; + float new_z = new_xyz[2]; + + float best_dist[100]; + int best_idx[100]; + for(int i = 0; i < nsample; i++){ + best_dist[i] = 1e10; + best_idx[i] = start; + } + for(int i = start; i < end; i++){ + float x = xyz[i * 3 + 0]; + float y = xyz[i * 3 + 1]; + float z = xyz[i * 3 + 2]; + float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); + if (d2 < best_dist[0]){ + best_dist[0] = d2; + best_idx[0] = i; + reheap(best_dist, best_idx, nsample); + } + } + heap_sort(best_dist, best_idx, nsample); + for(int i = 0; i < nsample; i++){ + idx[i] = best_idx[i]; + dist2[i] = best_dist[i]; + } +} + + +void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2) { + // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample) + dim3 blocks(DIVUP(m, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + knnquery_cuda_kernel<<>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2); +} diff --git a/utils/pointops2/src/knnquery/knnquery_cuda_kernel.h b/utils/pointops2/src/knnquery/knnquery_cuda_kernel.h new file mode 100644 index 0000000..3c0aedf --- /dev/null +++ b/utils/pointops2/src/knnquery/knnquery_cuda_kernel.h @@ -0,0 +1,18 @@ +#ifndef _KNNQUERY_CUDA_KERNEL +#define _KNNQUERY_CUDA_KERNEL +#include +#include +#include + +void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/pointops_api.cpp b/utils/pointops2/src/pointops_api.cpp new file mode 100644 index 0000000..812789f --- /dev/null +++ b/utils/pointops2/src/pointops_api.cpp @@ -0,0 +1,45 @@ +#include +#include + +#include "knnquery/knnquery_cuda_kernel.h" +#include "sampling/sampling_cuda_kernel.h" +#include "grouping/grouping_cuda_kernel.h" +#include "interpolation/interpolation_cuda_kernel.h" +#include "aggregation/aggregation_cuda_kernel.h" +#include "subtraction/subtraction_cuda_kernel.h" +#include "attention/attention_cuda_kernel.h" +#include "rpe/relative_pos_encoding_cuda_kernel.h" +#include "attention_v2/attention_cuda_kernel_v2.h" +#include "rpe_v2/relative_pos_encoding_cuda_kernel_v2.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda"); + m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda"); + m.def("grouping_forward_cuda", &grouping_forward_cuda, "grouping_forward_cuda"); + m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda"); + m.def("interpolation_forward_cuda", &interpolation_forward_cuda, "interpolation_forward_cuda"); + m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda"); + m.def("subtraction_forward_cuda", &subtraction_forward_cuda, "subtraction_forward_cuda"); + m.def("subtraction_backward_cuda", &subtraction_backward_cuda, "subtraction_backward_cuda"); + m.def("aggregation_forward_cuda", &aggregation_forward_cuda, "aggregation_forward_cuda"); + m.def("aggregation_backward_cuda", 
&aggregation_backward_cuda, "aggregation_backward_cuda"); + m.def("attention_step1_forward_cuda", &attention_step1_forward_cuda, "attention_step1_forward_cuda"); + m.def("attention_step1_backward_cuda", &attention_step1_backward_cuda, "attention_step1_backward_cuda"); + m.def("attention_step2_forward_cuda", &attention_step2_forward_cuda, "attention_step2_forward_cuda"); + m.def("attention_step2_backward_cuda", &attention_step2_backward_cuda, "attention_step2_backward_cuda"); + m.def("dot_prod_with_idx_forward_cuda", &dot_prod_with_idx_forward_cuda, "dot_prod_with_idx_forward_cuda"); + m.def("dot_prod_with_idx_backward_cuda", &dot_prod_with_idx_backward_cuda, "dot_prod_with_idx_backward_cuda"); + m.def("attention_step2_with_rel_pos_value_forward_cuda", &attention_step2_with_rel_pos_value_forward_cuda, "attention_step2_with_rel_pos_value_forward_cuda"); + m.def("attention_step2_with_rel_pos_value_backward_cuda", &attention_step2_with_rel_pos_value_backward_cuda, "attention_step2_with_rel_pos_value_backward_cuda"); + m.def("attention_step1_forward_cuda_v2", &attention_step1_forward_cuda_v2, "attention_step1_forward_cuda_v2"); + m.def("attention_step1_backward_cuda_v2", &attention_step1_backward_cuda_v2, "attention_step1_backward_cuda_v2"); + m.def("attention_step2_forward_cuda_v2", &attention_step2_forward_cuda_v2, "attention_step2_forward_cuda_v2"); + m.def("attention_step2_backward_cuda_v2", &attention_step2_backward_cuda_v2, "attention_step2_backward_cuda_v2"); + m.def("dot_prod_with_idx_forward_cuda_v2", &dot_prod_with_idx_forward_cuda_v2, "dot_prod_with_idx_forward_cuda_v2"); + m.def("dot_prod_with_idx_backward_cuda_v2", &dot_prod_with_idx_backward_cuda_v2, "dot_prod_with_idx_backward_cuda_v2"); + m.def("attention_step2_with_rel_pos_value_forward_cuda_v2", &attention_step2_with_rel_pos_value_forward_cuda_v2, "attention_step2_with_rel_pos_value_forward_cuda_v2"); + m.def("attention_step2_with_rel_pos_value_backward_cuda_v2", &attention_step2_with_rel_pos_value_backward_cuda_v2, "attention_step2_with_rel_pos_value_backward_cuda_v2"); + m.def("dot_prod_with_idx_forward_cuda_v3", &dot_prod_with_idx_forward_cuda_v3, "dot_prod_with_idx_forward_cuda_v3"); + m.def("dot_prod_with_idx_backward_cuda_v3", &dot_prod_with_idx_backward_cuda_v3, "dot_prod_with_idx_backward_cuda_v3"); + } diff --git a/utils/pointops2/src/rpe/relative_pos_encoding_cuda.cpp b/utils/pointops2/src/rpe/relative_pos_encoding_cuda.cpp new file mode 100644 index 0000000..634ebb0 --- /dev/null +++ b/utils/pointops2/src/rpe/relative_pos_encoding_cuda.cpp @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include "relative_pos_encoding_cuda_kernel.h" + +void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor, + at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) +{ + const float *q = q_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *index = index_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + dot_prod_with_idx_forward_cuda_launcher(N, M, h, hdim, q, index, table, rel_idx, output); +} + +void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, + at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, + at::Tensor grad_q_tensor, at::Tensor grad_table_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const int 
*index = index_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_table = grad_table_tensor.data_ptr(); + dot_prod_with_idx_backward_cuda_launcher(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table); +} + +void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) +{ + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + attention_step2_with_rel_pos_value_forward_cuda_launcher(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output); +} + +void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, + at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *grad_attn = grad_attn_tensor.data_ptr(); + float *grad_v = grad_v_tensor.data_ptr(); + float *grad_table = grad_table_tensor.data_ptr(); + attention_step2_with_rel_pos_value_backward_cuda_launcher(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); +} diff --git a/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu b/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu new file mode 100644 index 0000000..b8fd8f4 --- /dev/null +++ b/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu @@ -0,0 +1,134 @@ +#include "../cuda_utils.h" +#include "relative_pos_encoding_cuda_kernel.h" + + +__global__ void dot_prod_with_idx_forward_cuda_kernel( // M, h, hdim + int N, int M, int h, int hdim, const float *q, const int *index, + const float *table, const int *rel_idx, float *output) { + // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h) + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; + + int dim = thread_idx % 3; + int m_idx = thread_idx / 3; + + int q_idx = index[m_idx]; + int rel_idx_dim = rel_idx[thread_idx]; + float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; + float val = q[q_idx*h*hdim+h_idx*hdim+c_idx] * rel_table_val; + atomicAdd(output+m_idx*h+h_idx, val); +} + +__global__ void dot_prod_with_idx_backward_cuda_kernel( // M, h, hdim + int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index, + const float *table, const int *rel_idx, float *grad_q, float *grad_table) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int thread_idx = blockIdx.x * blockDim.x + 
threadIdx.x; + if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; + + int dim = thread_idx % 3; + int m_idx = thread_idx / 3; + + int q_idx = index[m_idx]; + int rel_idx_dim = rel_idx[thread_idx]; + int grad_out_idx = m_idx*h+h_idx; + float grad_out_value = grad_out[grad_out_idx]; + + float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; + atomicAdd(grad_q+q_idx*h*hdim+h_idx*hdim+c_idx, grad_out_value * rel_table_val); + + float q_value = q[q_idx*h*hdim+h_idx*hdim+c_idx]; + atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * q_value); +} + +void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index, + const float *table, const int *rel_idx, float *output) { + // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + dim3 threads(THREADS_PER_BLOCK); + dot_prod_with_idx_forward_cuda_kernel<<>>(N, M, h, hdim, q, index, table, rel_idx, output); +} + +void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, + const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table) { + // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + dim3 threads(THREADS_PER_BLOCK); + dot_prod_with_idx_backward_cuda_kernel<<>>(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table); +} + +__global__ void attention_step2_with_rel_pos_value_forward_cuda_kernel( // M, h, hdim + int N, int M, int h, int hdim, const float *attn, const float *v, + const int *index0, const int *index1, const float *table, const int *rel_idx, float *output) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; + + int dim = thread_idx % 3; + int m_idx = thread_idx / 3; + + int idx1 = index1[m_idx]; + + int rel_idx_dim = rel_idx[thread_idx]; + float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; + + float val = attn[m_idx*h+h_idx] * (v[idx1*h*hdim+h_idx*hdim+c_idx] / 3.0 + table_val); + + int idx0 = index0[m_idx]; + atomicAdd(output+idx0*h*hdim+h_idx*hdim+c_idx, val); +} + + +__global__ void attention_step2_with_rel_pos_value_backward_cuda_kernel( // M, h, hdim + int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table, + const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; + + int dim = thread_idx % 3; + int m_idx = thread_idx / 3; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + int grad_out_idx = idx0*h*hdim+h_idx*hdim+c_idx; + + int rel_idx_dim = rel_idx[thread_idx]; + float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; + float grad_out_value = grad_out[grad_out_idx]; + + 
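// note: each thread owns one (m, h, c, dim) tuple. grad_attn accumulates
 + // grad_out * (v/3 + table_val); the /3 compensates for v entering once per
 + // spatial dim in the forward pass. grad_v accumulates grad_out * attn / 3, and
 + // grad_table accumulates grad_out * attn for its own dim.
 +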
atomicAdd(grad_attn+m_idx*h+h_idx, grad_out_value * (v[idx1*h*hdim+h_idx*hdim+c_idx]/3 + table_val)); + atomicAdd(grad_v+idx1*h*hdim+h_idx*hdim+c_idx, grad_out_value * attn[m_idx*h+h_idx]/3); + atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * attn[m_idx*h+h_idx]); +} + +void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float *attn, const float *v, const int *index0, + const int *index1, const float *table, const int *rel_idx, float *output) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_with_rel_pos_value_forward_cuda_kernel<<>>(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output); +} + +void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0, + const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_with_rel_pos_value_backward_cuda_kernel<<>>(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); +} diff --git a/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h b/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h new file mode 100644 index 0000000..cafc7b6 --- /dev/null +++ b/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h @@ -0,0 +1,26 @@ +#ifndef _RPE_CUDA_KERNEL +#define _RPE_CUDA_KERNEL +#include +#include +#include + +void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); +void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_table_tensor); + +void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); +void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index, const float *table, const int *rel_idx, float *output); +void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table); + +void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float 
*attn, const float *v, const int *index0, const int *index1, const float *table, const int *rel_idx, float *output); +void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu b/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu new file mode 100644 index 0000000..628d8e3 --- /dev/null +++ b/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu @@ -0,0 +1,525 @@ +#include "../cuda_utils.h" +#include "relative_pos_encoding_cuda_kernel_v2.h" + + +// N, M, h, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, output + +template +__global__ void dot_prod_with_idx_forward_cuda_kernel_v2( // M, h, hdim + int N, int M, int h, const float *q, const int *index_q, const float *k, const int *index_k, + const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, + const int *sort_indices, float *output) { + // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h) + + int h_idx = blockIdx.y; + int t_idx = blockIdx.x; + int n_idx = threadIdx.x; + int C = h*d; + + __shared__ int start, end; + if(n_idx == 0){ + start = rel_idx_offsets[t_idx]; + end = rel_idx_offsets[t_idx+1]; + // printf("e2: start: %d, end: %d\n", start, end); + } + + __syncthreads(); + + int m_idx_prev = start + n_idx; + // if(m_idx_prev >= end) + // return; + + __shared__ int m_idx; + if(n_idx == 0) + m_idx = sort_indices[m_idx_prev]; + + __syncthreads(); + + __shared__ int rel_idx_vec[3]; + if(n_idx < 3) + rel_idx_vec[n_idx] = rel_idx[m_idx*3 + n_idx]; + + __syncthreads(); + + __shared__ float table_q_vec[d]; + __shared__ float table_k_vec[d]; + + for(int i = n_idx; i < 2*d; i += blockDim.x){ + if (i < d){ + int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + i * 3 + 0; + int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + i * 3 + 1; + int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + i * 3 + 2; + table_q_vec[i] = table_q[ind0] + table_q[ind1] + table_q[ind2]; + } else{ + int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 0; + int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 1; + int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 2; + table_k_vec[i-d] = table_k[ind0] + table_k[ind1] + table_k[ind2]; + } + } + + __syncthreads(); + + for(int i = m_idx_prev; i < end; i += blockDim.x){ + float sum = 0; + int m_idx_i = sort_indices[i]; + int q_idx = index_q[m_idx_i]; + int k_idx = index_k[m_idx_i]; + for(int j = 0; j < d; j++){ + sum += q[q_idx*C + h_idx*d + j] * table_q_vec[j]; + sum += k[k_idx*C + h_idx*d + j] * table_k_vec[j]; + } + output[m_idx_i*h + h_idx] = sum; + } +} + +// N, M, h, hdim, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k + +template +__global__ void dot_prod_with_idx_backward_cuda_kernel_v2( // M, h, hdim + int N, int M, int h, const float *grad_out, const float *q, const int *index_q, + const float *k, const int *index_k, const float *table_q, const float *table_k, + const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *grad_q, + float *grad_k, float *grad_table_q, float *grad_table_k) { + + int h_idx = 
blockIdx.y; + int t_idx = blockIdx.x; + int n_idx = threadIdx.x; + int C = h*d; + + __shared__ int start, end; + if(n_idx == 0){ + start = rel_idx_offsets[t_idx]; + end = rel_idx_offsets[t_idx+1]; + } + + __syncthreads(); + + int m_idx_prev = start + n_idx; + // if(m_idx_prev >= end) + // return; + + __shared__ int m_idx; + if(n_idx == 0) + m_idx = sort_indices[m_idx_prev]; + + __syncthreads(); + + __shared__ int rel_idx_vec[3]; + if(n_idx < 3) + rel_idx_vec[n_idx] = rel_idx[m_idx*3 + n_idx]; + + __syncthreads(); + + __shared__ float table_q_vec[d]; + __shared__ float table_k_vec[d]; + + for(int i = n_idx; i < 2*d; i += blockDim.x){ + if (i < d){ + int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + i * 3 + 0; + int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + i * 3 + 1; + int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + i * 3 + 2; + table_q_vec[i] = table_q[ind0] + table_q[ind1] + table_q[ind2]; + } else{ + int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 0; + int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 1; + int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 2; + table_k_vec[i-d] = table_k[ind0] + table_k[ind1] + table_k[ind2]; + } + } + + __shared__ float gradient_q[d]; + __shared__ float gradient_k[d]; + for(int i = n_idx; i < d; i += blockDim.x){ + gradient_q[i] = 0; + gradient_k[i] = 0; + } + + __syncthreads(); + + for(int i = m_idx_prev; i < end; i += blockDim.x){ + int m_idx_i = sort_indices[i]; + int q_idx = index_q[m_idx_i]; + int k_idx = index_k[m_idx_i]; + float grad_out_i = grad_out[m_idx_i*h+h_idx]; + for(int j = 0; j < d; j++){ + atomicAdd(&gradient_q[j], q[q_idx*C + h_idx*d + j] * grad_out_i); + atomicAdd(&gradient_k[j], k[k_idx*C + h_idx*d + j] * grad_out_i); + atomicAdd(grad_q + q_idx*C + h_idx*d + j, table_q_vec[j] * grad_out_i); + atomicAdd(grad_k + k_idx*C + h_idx*d + j, table_k_vec[j] * grad_out_i); + } + } + + __syncthreads(); + + for(int i = n_idx; i < d*2; i += blockDim.x){ + if(i < d){ + atomicAdd(grad_table_q + rel_idx_vec[0] * C * 3 + h_idx * d * 3 + i * 3, gradient_q[i]); + atomicAdd(grad_table_q + rel_idx_vec[1] * C * 3 + h_idx * d * 3 + i * 3 + 1, gradient_q[i]); + atomicAdd(grad_table_q + rel_idx_vec[2] * C * 3 + h_idx * d * 3 + i * 3 + 2, gradient_q[i]); + }else{ + atomicAdd(grad_table_k + rel_idx_vec[0] * C * 3 + h_idx * d * 3 + (i-d) * 3, gradient_k[i-d]); + atomicAdd(grad_table_k + rel_idx_vec[1] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 1, gradient_k[i-d]); + atomicAdd(grad_table_k + rel_idx_vec[2] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 2, gradient_k[i-d]); + } + } + + // int c_idx = blockIdx.z; + // int h_idx = blockIdx.y; + // int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + // if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return; + + // int dim = thread_idx % 3; + // int m_idx = thread_idx / 3; + + // int q_idx = index[m_idx]; + // int rel_idx_dim = rel_idx[thread_idx]; + // int grad_out_idx = m_idx*h+h_idx; + // float grad_out_value = grad_out[grad_out_idx]; + + // float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim]; + // atomicAdd(grad_q+q_idx*h*hdim+h_idx*hdim+c_idx, grad_out_value * rel_table_val); + + // float q_value = q[q_idx*h*hdim+h_idx*hdim+c_idx]; + // atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * q_value); +} + +void dot_prod_with_idx_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *q, + const int *index_q, const float *k, const int *index_k, const float *table_q, const float 
*table_k, + const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *output) +{ + // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + dim3 blocks(T, h); + // dim3 threads(THREADS_PER_BLOCK); + + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + n_threads = n_threads > 1024 ? 512 : n_threads; + + // printf("e1: T: %d, h: %d, n_threads: %d\n", T, h, n_threads); + + switch (hdim) { + case 16: + dot_prod_with_idx_forward_cuda_kernel_v2<16><<>>(N, M, h, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, output); + break; + case 32: + dot_prod_with_idx_forward_cuda_kernel_v2<32><<>>(N, M, h, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, output); + break; + default: + throw "d != 16 and d != 32"; + } +} + +void dot_prod_with_idx_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, + const float *grad_out, const float *q, const int *index_q, const float *k, const int *index_k, + const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, + float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k) +{ + // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + // dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + // dim3 threads(THREADS_PER_BLOCK); + + dim3 blocks(T, h); + // dim3 threads(THREADS_PER_BLOCK); + + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + n_threads = n_threads > 1024 ? 
512 : n_threads; + + switch (hdim) { + case 16: + dot_prod_with_idx_backward_cuda_kernel_v2<16><<>>(N, M, h, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k); + break; + case 32: + dot_prod_with_idx_backward_cuda_kernel_v2<32><<>>(N, M, h, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k); + break; + default: + throw "d != 16 and d != 32"; + } +} + + + +template +__global__ void dot_prod_with_idx_forward_cuda_kernel_v3( // M, h, hdim + int N, int M, int h, const float *q, const int *index_q_offsets, const float *k, const int *index_k, + const float *table_q, const float *table_k, const int *rel_idx, float *output) { + // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h) + int q_idx = blockIdx.x; + int h_idx = blockIdx.y; + int n_idx = threadIdx.x; + int C = h*d; + + __shared__ float query_vec[d]; + __shared__ int start, end; + if (n_idx == 0){ + start = index_q_offsets[q_idx]; + end = index_q_offsets[q_idx+1]; + } + for(int i = n_idx; i < d; i += blockDim.x) + query_vec[i] = q[q_idx*C + h_idx*d + i]; + + __syncthreads(); + + int m_idx = start + n_idx; + if(m_idx >= end) + return; + + int k_idx = index_k[m_idx]; + int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2]; + float sum = 0; + for(int i = 0; i < d; i++){ + float table_q_scalar_i = table_q[r_idx1*C*3+h_idx*d*3+i*3] + table_q[r_idx2*C*3+h_idx*d*3+i*3+1] + table_q[r_idx3*C*3+h_idx*d*3+i*3+2]; + sum += query_vec[i] * table_q_scalar_i; + float table_k_scalar_i = table_k[r_idx1*C*3+h_idx*d*3+i*3] + table_k[r_idx2*C*3+h_idx*d*3+i*3+1] + table_k[r_idx3*C*3+h_idx*d*3+i*3+2]; + sum += k[k_idx*C+h_idx*d+i] * table_k_scalar_i; + } + output[m_idx*h + h_idx] = sum; + +} + +// N, M, h, hdim, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k + +template +__global__ void dot_prod_with_idx_backward_cuda_kernel_v3( // M, h, hdim + int N, int M, int h, const float *grad_out, const float *q, const int *index_q_offsets, + const float *k, const int *index_k, const float *table_q, const float *table_k, + const int *rel_idx, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k) { + + int q_idx = blockIdx.x; + int h_idx = blockIdx.y; + int n_idx = threadIdx.x; + int C = h*d; + + __shared__ float query_vec[d]; + __shared__ int start, end; + if (n_idx == 0){ + start = index_q_offsets[q_idx]; + end = index_q_offsets[q_idx+1]; + } + for(int i = n_idx; i < d; i += blockDim.x) + query_vec[i] = q[q_idx*C + h_idx*d + i]; + + __shared__ float gradients_q[d]; + for(int i = n_idx; i < d; i += blockDim.x){ + gradients_q[i] = 0; + } + + __syncthreads(); + + int m_idx = start + n_idx; + + if(m_idx < end){ + int k_idx = index_k[m_idx]; + int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2]; + float gradient = grad_out[m_idx*h + h_idx]; + for(int i = 0; i < d; i++){ + float table_q_scalar_i = table_q[r_idx1*C*3+h_idx*d*3+i*3] + table_q[r_idx2*C*3+h_idx*d*3+i*3+1] + table_q[r_idx3*C*3+h_idx*d*3+i*3+2]; + float table_k_scalar_i = table_k[r_idx1*C*3+h_idx*d*3+i*3] + table_k[r_idx2*C*3+h_idx*d*3+i*3+1] + table_k[r_idx3*C*3+h_idx*d*3+i*3+2]; + float q_scalar_i = query_vec[i]; + float k_scalar_i = k[k_idx*C+h_idx*d+i]; + atomicAdd(&gradients_q[i], table_q_scalar_i * gradient); + 
atomicAdd(grad_k+k_idx*C+h_idx*d+i, table_k_scalar_i * gradient); + atomicAdd(grad_table_q+r_idx1*C*3+h_idx*d*3+i*3, q_scalar_i * gradient); + atomicAdd(grad_table_q+r_idx2*C*3+h_idx*d*3+i*3+1, q_scalar_i * gradient); + atomicAdd(grad_table_q+r_idx3*C*3+h_idx*d*3+i*3+2, q_scalar_i * gradient); + atomicAdd(grad_table_k+r_idx1*C*3+h_idx*d*3+i*3, k_scalar_i * gradient); + atomicAdd(grad_table_k+r_idx2*C*3+h_idx*d*3+i*3+1, k_scalar_i * gradient); + atomicAdd(grad_table_k+r_idx3*C*3+h_idx*d*3+i*3+2, k_scalar_i * gradient); + } + } + __syncthreads(); + + for(int i = n_idx; i < d; i += blockDim.x){ + grad_q[q_idx*C+h_idx*d+i] = gradients_q[i]; + } +} + +void dot_prod_with_idx_forward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *q, + const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, + const int *rel_idx, float *output) +{ + // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + dim3 blocks(N, h); + // dim3 threads(THREADS_PER_BLOCK); + + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + + // printf("e1: h: %d, n_max: %d, n_threads: %d\n", h, n_max, n_threads); + + switch (hdim) { + case 16: + dot_prod_with_idx_forward_cuda_kernel_v3<16><<>>(N, M, h, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, output); + break; + case 32: + dot_prod_with_idx_forward_cuda_kernel_v3<32><<>>(N, M, h, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, output); + break; + default: + throw "d != 16 and d != 32"; + } +} + +void dot_prod_with_idx_backward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, + const float *grad_out, const float *q, const int *index_q_offsets, const float *k, const int *index_k, + const float *table_q, const float *table_k, const int *rel_idx, + float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k) +{ + // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + // dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + // dim3 threads(THREADS_PER_BLOCK); + + dim3 blocks(N, h); + // dim3 threads(THREADS_PER_BLOCK); + + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? 
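// note: per-query gradients w.r.t. q are first accumulated in the shared
 + // gradients_q buffer and written to grad_q once after the loop; grad_k,
 + // grad_table_q and grad_table_k are scattered directly with atomics, one
 + // spatial dim of each table per atomicAdd below.
 +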
n_threads : n_threads * 2; + + switch (hdim) { + case 16: + dot_prod_with_idx_backward_cuda_kernel_v3<16><<>>(N, M, h, grad_out, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, grad_q, grad_k, grad_table_q, grad_table_k); + break; + case 32: + dot_prod_with_idx_backward_cuda_kernel_v3<32><<>>(N, M, h, grad_out, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, grad_q, grad_k, grad_table_q, grad_table_k); + break; + default: + throw "d != 16 and d != 32"; + } +} + + +template +__global__ void attention_step2_with_rel_pos_value_forward_cuda_kernel_v2( // M, h, hdim + int N, int M, int h, const float *attn, const float *v, + const int *index0_offsets, const int *index1, const float *table, const int *rel_idx, float *output) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + + int q_idx = blockIdx.x; + int h_idx = blockIdx.y; + int n_idx = threadIdx.x; + + int C = h*d; + + __shared__ int start, end; + __shared__ float result[d]; + + if (n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + } + for (int i = n_idx; i < d; i += blockDim.x){ + result[i] = 0; + } + + __syncthreads(); + + int m_idx = start + n_idx; + if (m_idx < end){ + float attn_scalar = attn[m_idx*h + h_idx]; + int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2]; + for(int i = 0; i < d; i ++){ + int v_idx = index1[m_idx]; + float table_scaler_i = table[r_idx1*C*3+h_idx*d*3+i*3] + table[r_idx2*C*3+h_idx*d*3+i*3+1] + table[r_idx3*C*3+h_idx*d*3+i*3+2]; + float value_scaler_i = v[v_idx*C + h_idx*d + i]; + atomicAdd(&result[i], (table_scaler_i + value_scaler_i) * attn_scalar); + } + } + + __syncthreads(); + + for (int i = n_idx; i < d; i += blockDim.x) + output[q_idx*C + h_idx*d + i] = result[i]; +} + + +template +__global__ void attention_step2_with_rel_pos_value_backward_cuda_kernel_v2( // M, h, hdim + int N, int M, int h, const float *grad_out, const int *index0_offsets, const int *index1, const float *attn, const float *v, const float *table, + const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + + int q_idx = blockIdx.x; + int h_idx = blockIdx.y; + int n_idx = threadIdx.x; + + int C = h*d; + + __shared__ int start, end; + __shared__ float gradients[d]; + + if (n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + } + for (int i = n_idx; i < d; i += blockDim.x){ + gradients[i] = grad_out[q_idx*C + h_idx*d + i]; + } + + __syncthreads(); + + int m_idx = start + n_idx; + if (m_idx < end){ + int v_idx = index1[m_idx]; + int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2]; + float attn_scalar = attn[m_idx*h + h_idx]; + float grad_attn_sum = 0; + for (int i = 0; i < d; i++){ + float grad_out_scaler_i = gradients[i]; + float table_scaler_i = table[r_idx1*C*3+h_idx*d*3+i*3] + table[r_idx2*C*3+h_idx*d*3+i*3+1] + table[r_idx3*C*3+h_idx*d*3+i*3+2]; + float value_scaler_i = v[v_idx*C + h_idx*d + i]; + grad_attn_sum += (table_scaler_i + value_scaler_i) * grad_out_scaler_i; + atomicAdd(grad_v + v_idx*C + h_idx*d + i, attn_scalar * grad_out_scaler_i); + atomicAdd(grad_table + r_idx1*C*3 + h_idx*d*3 + i*3, attn_scalar * grad_out_scaler_i); + atomicAdd(grad_table + r_idx2*C*3 + h_idx*d*3 + i*3 + 1, attn_scalar * grad_out_scaler_i); + atomicAdd(grad_table + r_idx3*C*3 + h_idx*d*3 + i*3 + 2, attn_scalar * 
grad_out_scaler_i); + } + grad_attn[m_idx*h + h_idx] = grad_attn_sum; + } +} + +void attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *attn, const float *v, const int *index0_offsets, + const int *index1, const float *table, const int *rel_idx, float *output) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + // dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + // dim3 threads(THREADS_PER_BLOCK); + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + + switch (hdim) { + case 16: + attention_step2_with_rel_pos_value_forward_cuda_kernel_v2<16><<>>(N, M, h, attn, v, index0_offsets, index1, table, rel_idx, output); + break; + case 32: + attention_step2_with_rel_pos_value_forward_cuda_kernel_v2<32><<>>(N, M, h, attn, v, index0_offsets, index1, table, rel_idx, output); + break; + default: + throw "d != 16 and d != 32"; + } +} + +void attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *grad_out, const int *index0_offsets, + const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + + switch (hdim) { + case 16: + attention_step2_with_rel_pos_value_backward_cuda_kernel_v2<16><<>>(N, M, h, grad_out, index0_offsets, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); + break; + case 32: + attention_step2_with_rel_pos_value_backward_cuda_kernel_v2<32><<>>(N, M, h, grad_out, index0_offsets, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); + break; + default: + throw "d != 16 and d != 32"; + } +} diff --git a/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h b/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h new file mode 100644 index 0000000..648b152 --- /dev/null +++ b/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h @@ -0,0 +1,32 @@ +#ifndef _RPE_V2_CUDA_KERNEL +#define _RPE_V2_CUDA_KERNEL +#include +#include +#include + +void dot_prod_with_idx_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor output_tensor); +void dot_prod_with_idx_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor); + +void dot_prod_with_idx_forward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, 
at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); +void dot_prod_with_idx_backward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor); + +void attention_step2_with_rel_pos_value_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); +void attention_step2_with_rel_pos_value_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void dot_prod_with_idx_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *output); +void dot_prod_with_idx_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *grad_out, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); + +void dot_prod_with_idx_forward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *output); +void dot_prod_with_idx_backward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *grad_out, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); + +void attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *attn, const float *v, const int *index0_offsets, const int *index1, const float *table, const int *rel_idx, float *output); +void attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_v2.cpp b/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_v2.cpp new file mode 100644 index 0000000..0a4c96a --- /dev/null +++ b/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_v2.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include "relative_pos_encoding_cuda_kernel_v2.h" + +void dot_prod_with_idx_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, 
at::Tensor q_tensor, + at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, + at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor output_tensor) +{ + const float *q = q_tensor.data_ptr(); + const int *index_q = index_q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + const int *rel_idx_offsets = rel_idx_offsets_tensor.data_ptr(); + const int *sort_indices = sort_indices_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + dot_prod_with_idx_forward_cuda_launcher_v2(N, M, h, hdim, n_max, T, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, output); +} + +void dot_prod_with_idx_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor grad_out_tensor, + at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, + at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, + at::Tensor sort_indices_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const int *index_q = index_q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + const int *rel_idx_offsets = rel_idx_offsets_tensor.data_ptr(); + const int *sort_indices = sort_indices_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_k = grad_k_tensor.data_ptr(); + float *grad_table_q = grad_table_q_tensor.data_ptr(); + float *grad_table_k = grad_table_k_tensor.data_ptr(); + dot_prod_with_idx_backward_cuda_launcher_v2(N, M, h, hdim, n_max, T, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k); +} + + +void dot_prod_with_idx_forward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor q_tensor, + at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, + at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) +{ + const float *q = q_tensor.data_ptr(); + const int *index_q_offsets = index_q_offsets_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + dot_prod_with_idx_forward_cuda_launcher_v3(N, M, h, hdim, n_max, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, output); +} + +void dot_prod_with_idx_backward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, + at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, + at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, + at::Tensor grad_k_tensor, 
at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const int *index_q_offsets = index_q_offsets_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_k = grad_k_tensor.data_ptr(); + float *grad_table_q = grad_table_q_tensor.data_ptr(); + float *grad_table_k = grad_table_k_tensor.data_ptr(); + dot_prod_with_idx_backward_cuda_launcher_v3(N, M, h, hdim, n_max, grad_out, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, grad_q, grad_k, grad_table_q, grad_table_k); +} + + +void attention_step2_with_rel_pos_value_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) +{ + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const int *index0_offsets = index0_offsets_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(N, M, h, hdim, n_max, attn, v, index0_offsets, index1, table, rel_idx, output); +} + +void attention_step2_with_rel_pos_value_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, + at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, + at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0_offsets = index0_offsets_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *grad_attn = grad_attn_tensor.data_ptr(); + float *grad_v = grad_v_tensor.data_ptr(); + float *grad_table = grad_table_tensor.data_ptr(); + attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(N, M, h, hdim, n_max, grad_out, index0_offsets, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); +} diff --git a/utils/pointops2/src/sampling/sampling_cuda.cpp b/utils/pointops2/src/sampling/sampling_cuda.cpp new file mode 100644 index 0000000..7b2622a --- /dev/null +++ b/utils/pointops2/src/sampling/sampling_cuda.cpp @@ -0,0 +1,16 @@ +#include +#include +#include +#include +#include "sampling_cuda_kernel.h" + + +void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor) +{ + const float *xyz = xyz_tensor.data_ptr(); + const int *offset = offset_tensor.data_ptr(); + const int *new_offset = new_offset_tensor.data_ptr(); + float *tmp = tmp_tensor.data_ptr(); + int *idx = idx_tensor.data_ptr(); + furthestsampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx); +} diff --git a/utils/pointops2/src/sampling/sampling_cuda_kernel.cu 
b/utils/pointops2/src/sampling/sampling_cuda_kernel.cu new file mode 100644 index 0000000..d2c70b5 --- /dev/null +++ b/utils/pointops2/src/sampling/sampling_cuda_kernel.cu @@ -0,0 +1,171 @@ +#include "../cuda_utils.h" +#include "sampling_cuda_kernel.h" + + +__device__ void __update(float *dists, int *dists_i, int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? i2 : i1; +} + +// input xyz: (n, 3), tmp: (b, n_max) +// ouput idx (m) +template +__global__ void furthestsampling_cuda_kernel(const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) +{ + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int bid = blockIdx.x; + int start_n, end_n, start_m, end_m, old; + if (bid == 0) { + start_n = 0; + end_n = offset[0]; + start_m = 0; + end_m = new_offset[0]; + old = 0; + } + else { + start_n = offset[bid - 1]; + end_n = offset[bid]; + start_m = new_offset[bid - 1]; + end_m = new_offset[bid]; + old = offset[bid - 1]; + } + + const int stride = block_size; + int tid = threadIdx.x; + if (tid == 0) idx[start_m] = start_n; + + __syncthreads(); + for (int j = start_m + 1; j < end_m; j++) + { + int besti = start_n; + float best = -1; + float x1 = xyz[old * 3 + 0]; + float y1 = xyz[old * 3 + 1]; + float z1 = xyz[old * 3 + 2]; + for (int k = start_n + tid; k < end_n; k += stride) + { + float x2 = xyz[k * 3 + 0]; + float y2 = xyz[k * 3 + 1]; + float z2 = xyz[k * 3 + 2]; + float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + float d2 = min(d, tmp[k]); + tmp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + + if (block_size >= 1024) { + if (tid < 512) { + __update(dists, dists_i, tid, tid + 512); + } + __syncthreads(); + } + if (block_size >= 512) { + if (tid < 256) { + __update(dists, dists_i, tid, tid + 256); + } + __syncthreads(); + } + if (block_size >= 256) { + if (tid < 128) { + __update(dists, dists_i, tid, tid + 128); + } + __syncthreads(); + } + if (block_size >= 128) { + if (tid < 64) { + __update(dists, dists_i, tid, tid + 64); + } + __syncthreads(); + } + if (block_size >= 64) { + if (tid < 32) { + __update(dists, dists_i, tid, tid + 32); + } + __syncthreads(); + } + if (block_size >= 32) { + if (tid < 16) { + __update(dists, dists_i, tid, tid + 16); + } + __syncthreads(); + } + if (block_size >= 16) { + if (tid < 8) { + __update(dists, dists_i, tid, tid + 8); + } + __syncthreads(); + } + if (block_size >= 8) { + if (tid < 4) { + __update(dists, dists_i, tid, tid + 4); + } + __syncthreads(); + } + if (block_size >= 4) { + if (tid < 2) { + __update(dists, dists_i, tid, tid + 2); + } + __syncthreads(); + } + if (block_size >= 2) { + if (tid < 1) { + __update(dists, dists_i, tid, tid + 1); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) + idx[j] = old; + } +} + +void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) +{ + unsigned int n_threads = opt_n_threads(n); + switch (n_threads) { + case 1024: + furthestsampling_cuda_kernel<1024><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 512: + furthestsampling_cuda_kernel<512><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 256: + furthestsampling_cuda_kernel<256><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 128: + 
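        // Each case binds the runtime thread count returned by opt_n_threads(n) to a matching
        // compile-time block_size, so the shared-memory reduction in the kernel above can assume
        // a fixed block width; the default falls back to 512 threads.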
furthestsampling_cuda_kernel<128><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 64: + furthestsampling_cuda_kernel<64><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 32: + furthestsampling_cuda_kernel<32><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 16: + furthestsampling_cuda_kernel<16><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 8: + furthestsampling_cuda_kernel<8><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 4: + furthestsampling_cuda_kernel<4><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 2: + furthestsampling_cuda_kernel<2><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 1: + furthestsampling_cuda_kernel<1><<>>(xyz, offset, new_offset, tmp, idx); + break; + default: + furthestsampling_cuda_kernel<512><<>>(xyz, offset, new_offset, tmp, idx); + } +} diff --git a/utils/pointops2/src/sampling/sampling_cuda_kernel.h b/utils/pointops2/src/sampling/sampling_cuda_kernel.h new file mode 100644 index 0000000..c903f63 --- /dev/null +++ b/utils/pointops2/src/sampling/sampling_cuda_kernel.h @@ -0,0 +1,18 @@ +#ifndef _SAMPLING_CUDA_KERNEL +#define _SAMPLING_CUDA_KERNEL +#include +#include +#include + +void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/pointops2/src/subtraction/subtraction_cuda.cpp b/utils/pointops2/src/subtraction/subtraction_cuda.cpp new file mode 100644 index 0000000..fa38dc5 --- /dev/null +++ b/utils/pointops2/src/subtraction/subtraction_cuda.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include +#include "subtraction_cuda_kernel.h" + + +void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor) +{ + const float *input1 = input1_tensor.data_ptr(); + const float *input2 = input2_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + subtraction_forward_cuda_launcher(n, nsample, c, input1, input2, idx, output); +} + +void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor) +{ + const int *idx = idx_tensor.data_ptr(); + const float *grad_output = grad_output_tensor.data_ptr(); + float *grad_input1 = grad_input1_tensor.data_ptr(); + float *grad_input2 = grad_input2_tensor.data_ptr(); + subtraction_backward_cuda_launcher(n, nsample, c, idx, grad_output, grad_input1, grad_input2); +} diff --git a/utils/pointops2/src/subtraction/subtraction_cuda_kernel.cu b/utils/pointops2/src/subtraction/subtraction_cuda_kernel.cu new file mode 100644 index 0000000..9b8d4f7 --- /dev/null +++ b/utils/pointops2/src/subtraction/subtraction_cuda_kernel.cu @@ -0,0 +1,44 @@ +#include "../cuda_utils.h" +#include "subtraction_cuda_kernel.h" + + +__global__ void subtraction_forward_cuda_kernel(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { + // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * nsample * c) return; + const int c_idx = index % c; + const int 
nsample_idx = (index / c) % nsample; + const int n_idx = index / nsample / c; + const int idx_idx = n_idx * nsample + nsample_idx; + const int input1_idx = n_idx * c + c_idx; + const int input2_idx = idx[idx_idx] * c + c_idx; + output[index] = input1[input1_idx] - input2[input2_idx]; +} + +__global__ void subtraction_backward_cuda_kernel(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * nsample * c) return; + const int c_idx = index % c; + const int nsample_idx = (index / c) % nsample; + const int n_idx = index / nsample / c; + const int idx_idx = n_idx * nsample + nsample_idx; + const int input1_idx = n_idx * c + c_idx; + const int input2_idx = idx[idx_idx] * c + c_idx; + atomicAdd(grad_input1 + input1_idx, grad_output[index]); + atomicAdd(grad_input2 + input2_idx, -grad_output[index]); +} + +void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { + // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) + dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + subtraction_forward_cuda_kernel<<>>(n, nsample, c, input1, input2, idx, output); +} + +void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + subtraction_backward_cuda_kernel<<>>(n, nsample, c, idx, grad_output, grad_input1, grad_input2); +} diff --git a/utils/pointops2/src/subtraction/subtraction_cuda_kernel.h b/utils/pointops2/src/subtraction/subtraction_cuda_kernel.h new file mode 100644 index 0000000..856133d --- /dev/null +++ b/utils/pointops2/src/subtraction/subtraction_cuda_kernel.h @@ -0,0 +1,20 @@ +#ifndef _SUBTRACTION_CUDA_KERNEL +#define _SUBTRACTION_CUDA_KERNEL +#include +#include +#include + +void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor); +void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output); +void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..cd13b77 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,101 @@ +import numpy as np +import torch +from scipy.optimize import linear_sum_assignment +import sys +import pytorch_lightning as pl +from pathlib import Path +import os +if sys.version_info[:2] >= (3, 8): + from collections.abc import MutableMapping +else: + from collections import MutableMapping + +def flatten_dict(d, parent_key="", sep="_"): + """ + https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys + """ + 
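    # Depth-first walk: nested mapping values are flattened recursively and their
    # keys are joined with `sep` into a single flat key.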
items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, MutableMapping): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + +class RegularCheckpointing(pl.Callback): + def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + general = pl_module.config.general + trainer.save_checkpoint(f"{general.save_dir}/last-epoch.ckpt") + print("Checkpoint created") + + +def associate_instances(previous_ins_label, current_ins_label): + previous_instance_ids, c_p = np.unique(previous_ins_label[previous_ins_label != 0], return_counts=True) + current_instance_ids, c_c = np.unique(current_ins_label[current_ins_label!=0], return_counts=True) + + associations = { + 0: 0 + } + + large_previous_instance_ids = [] + large_current_instance_ids = [] + for id, count in zip(previous_instance_ids, c_p): + if count > 25: + large_previous_instance_ids.append(id) + for id, count in zip(current_instance_ids, c_c): + if count > 50: + large_current_instance_ids.append(id) + + p_n = len(large_previous_instance_ids) + c_n = len(large_current_instance_ids) + + association_costs = torch.zeros(p_n, c_n) + for i, p_id in enumerate(large_previous_instance_ids): + for j, c_id in enumerate(large_current_instance_ids): + intersection = np.sum( (previous_ins_label==p_id) & (current_ins_label == c_id) ) + union = np.sum(previous_ins_label==p_id) + np.sum(current_ins_label == c_id) - intersection + iou = intersection/union + cost = 1 - iou + association_costs[i, j] = cost + + idxes_1, idxes_2 = linear_sum_assignment(association_costs) + + for i1, i2 in zip(idxes_1, idxes_2): + if association_costs[i1][i2] < 1.0: + associations[large_current_instance_ids[i2]] = large_previous_instance_ids[i1] + return associations + + +def save_predictions(sem_preds, ins_preds, seq_name, sweep_name): + filename = Path("/globalwork/yilmaz/submission/sequences") / seq_name / "predictions" + # assert not filename.exists(), "Path exists" + filename.mkdir(parents=True, exist_ok=True) + learning_map_inv = { + 1: 10, # "car" + 2: 11, # "bicycle" + 3: 15, # "motorcycle" + 4: 18, # "truck" + 5: 20, # "other-vehicle" + 6: 30, # "person" + 7: 31, # "bicyclist" + 8: 32, # "motorcyclist" + 9: 40, # "road" + 10: 44, # "parking" + 11: 48, # "sidewalk" + 12: 49, # "other-ground" + 13: 50, # "building" + 14: 51, # "fence" + 15: 70, # "vegetation" + 16: 71, # "trunk" + 17: 72, # "terrain" + 18: 80, # "pole" + 19: 81, # "traffic-sign" + } + sem_preds = np.vectorize(learning_map_inv.__getitem__)(sem_preds) + panoptic_preds = (ins_preds << 16) + sem_preds + file_path = str(filename / sweep_name) + ".label" + if not os.path.exists(file_path): + with open(file_path, "wb") as f: + f.write(panoptic_preds.astype(np.uint32).tobytes())
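For reference, the panoptic label files written by save_predictions follow the SemanticKITTI submission format: each point is stored as a uint32 whose lower 16 bits hold the inverse-mapped semantic label and whose upper 16 bits hold the instance id, matching the (ins_preds << 16) + sem_preds packing above. A minimal sketch for unpacking such a file, assuming only numpy and a hypothetical path:

import numpy as np

def load_panoptic_label(path):
    # Each entry is a uint32: lower 16 bits = semantic label id,
    # upper 16 bits = instance id (mirrors the packing in save_predictions).
    panoptic = np.fromfile(path, dtype=np.uint32)
    sem = panoptic & 0xFFFF
    ins = panoptic >> 16
    return sem, ins

# Hypothetical usage on a single written sweep:
# sem, ins = load_panoptic_label("sequences/08/predictions/000000.label")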