nerfstudio-project · liruilong940607 · Sep 26, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/examples/benchmarks/fisheye/mcmc_zipnerf.sh b/examples/benchmarks/fisheye/mcmc_zipnerf.sh
@@ -0,0 +1,22 @@
+SCENE_DIR="data/zipnerf"
+SCENE_LIST="berlin london nyc alameda"
+DATA_FACTOR=2
+
+RESULT_DIR="results/benchmark_mcmc_2M_zipnerf"
+CAP_MAX=2000000
+
+# RESULT_DIR="results/benchmark_mcmc_4M_zipnerf"
+# CAP_MAX=4000000
+
+for SCENE in $SCENE_LIST;
+do
+    echo "Running $SCENE"
+
+    # train and eval
+    CUDA_VISIBLE_DEVICES=0 python simple_trainer.py mcmc --disable_viewer --data_factor $DATA_FACTOR \
+        --strategy.cap-max $CAP_MAX \
+        --opacity_reg 0.001 \
+        --camera_model fisheye \
+        --data_dir $SCENE_DIR/$SCENE/ \
+        --result_dir $RESULT_DIR/$SCENE/
+done
diff --git a/examples/benchmarks/fisheye/mcmc_zipnerf_undistort.sh b/examples/benchmarks/fisheye/mcmc_zipnerf_undistort.sh
@@ -0,0 +1,22 @@
+SCENE_DIR="data/zipnerf_undistort"
+SCENE_LIST="berlin london nyc alameda"
+DATA_FACTOR=2
+
+RESULT_DIR="results/benchmark_mcmc_2M_zipnerf_undistort"
+CAP_MAX=2000000
+
+# RESULT_DIR="results/benchmark_mcmc_4M_zipnerf_undistort"
+# CAP_MAX=4000000
+
+for SCENE in $SCENE_LIST;
+do
+    echo "Running $SCENE"
+
+    # train and eval
+    CUDA_VISIBLE_DEVICES=0 python simple_trainer.py mcmc --disable_viewer --data_factor $DATA_FACTOR \
+        --strategy.cap-max $CAP_MAX \
+        --opacity_reg 0.001 \
+        --camera_model pinhole \
+        --data_dir $SCENE_DIR/$SCENE/ \
+        --result_dir $RESULT_DIR/$SCENE/
+done
diff --git a/examples/benchmarks/mcmc.sh b/examples/benchmarks/mcmc.sh
@@ -19,7 +19,7 @@ do
     CUDA_VISIBLE_DEVICES=0 python simple_trainer.py mcmc --eval_steps -1 --disable_viewer --data_factor $DATA_FACTOR \
         --strategy.cap-max $CAP_MAX \
         --render_traj_path $RENDER_TRAJ_PATH \
-        --data_dir data/360_v2/$SCENE/ \
+        --data_dir $SCENE_DIR/$SCENE/ \
         --result_dir $RESULT_DIR/$SCENE/
 
     # run eval and render

diff --git a/examples/datasets/colmap.py b/examples/datasets/colmap.py
@@ -1,6 +1,7 @@
 import os
 import json
 from typing import Any, Dict, List, Optional
+from typing_extensions import assert_never
 
 import cv2
 import imageio.v2 as imageio
@@ -59,6 +60,7 @@ def __init__(
         Ks_dict = dict()
         params_dict = dict()
         imsize_dict = dict()  # width, height
+        mask_dict = dict()
         bottom = np.array([0, 0, 0, 1]).reshape(1, 4)
         for k in imdata:
             im = imdata[k]
@@ -99,14 +101,12 @@ def __init__(
                 params = np.array([cam.k1, cam.k2, cam.k3, cam.k4], dtype=np.float32)
                 camtype = "fisheye"
             assert (
-                camtype == "perspective"
-            ), f"Only support perspective camera model, got {type_}"
+                camtype == "perspective" or camtype == "fisheye"
+            ), f"Only perspective and fisheye cameras are supported, got {type_}"
 
             params_dict[camera_id] = params
-
-            # image size
             imsize_dict[camera_id] = (cam.width // factor, cam.height // factor)
-
+            mask_dict[camera_id] = None
         print(
             f"[Parser] {len(imdata)} images, taken by {len(set(camera_ids))} cameras."
         )
@@ -203,6 +203,7 @@ def __init__(
         self.Ks_dict = Ks_dict  # Dict of camera_id -> K
         self.params_dict = params_dict  # Dict of camera_id -> params
         self.imsize_dict = imsize_dict  # Dict of camera_id -> (width, height)
+        self.mask_dict = mask_dict  # Dict of camera_id -> mask
         self.points = points  # np.ndarray, (num_points, 3)
         self.points_err = points_err  # np.ndarray, (num_points,)
         self.points_rgb = points_rgb  # np.ndarray, (num_points, 3)
@@ -236,16 +237,62 @@ def __init__(
             ), f"Missing params for camera {camera_id}"
             K = self.Ks_dict[camera_id]
             width, height = self.imsize_dict[camera_id]
-            K_undist, roi_undist = cv2.getOptimalNewCameraMatrix(
-                K, params, (width, height), 0
-            )
-            mapx, mapy = cv2.initUndistortRectifyMap(
-                K, params, None, K_undist, (width, height), cv2.CV_32FC1
-            )
-            self.Ks_dict[camera_id] = K_undist
+
+            if camtype == "perspective":
+                K_undist, roi_undist = cv2.getOptimalNewCameraMatrix(
+                    K, params, (width, height), 0
+                )
+                mapx, mapy = cv2.initUndistortRectifyMap(
+                    K, params, None, K_undist, (width, height), cv2.CV_32FC1
+                )
+                mask = None
+            elif camtype == "fisheye":
+                fx = K[0, 0]
+                fy = K[1, 1]
+                cx = K[0, 2]
+                cy = K[1, 2]
+                mapx = np.zeros((height, width), dtype=np.float32)
+                mapy = np.zeros((height, width), dtype=np.float32)
+                for i in range(0, width):
+                    for j in range(0, height):
+                        x = float(i)
+                        y = float(j)
+                        x1 = (x - cx) / fx
+                        y1 = (y - cy) / fy
+                        theta = np.sqrt(x1**2 + y1**2)
+                        r = (
+                            1.0
+                            + params[0] * theta**2
+                            + params[1] * theta**4
+                            + params[2] * theta**6
+                            + params[3] * theta**8
+                        )
+                        x2 = fx * x1 * r + width // 2
+                        y2 = fy * y1 * r + height // 2
+                        mapx[j, i] = x2
+                        mapy[j, i] = y2
+
+                mask = np.logical_and(
+                    np.logical_and(mapx > 0, mapy > 0),
+                    np.logical_and(mapx < width - 1, mapy < height - 1),
+                )
+                y_indices, x_indices = np.nonzero(mask)
+                y_min, y_max = y_indices.min(), y_indices.max() + 1
+                x_min, x_max = x_indices.min(), x_indices.max() + 1
+                mask = mask[y_min:y_max, x_min:x_max]
+                K_undist = K.copy()
+                K_undist[0, 2] -= x_min
+                K_undist[1, 2] -= y_min
+                roi_undist = [x_min, y_min, x_max - x_min, y_max - y_min]
+            else:
+                assert_never(camtype)
+
             self.mapx_dict[camera_id] = mapx
             self.mapy_dict[camera_id] = mapy
+            self.Ks_dict[camera_id] = K_undist
             self.roi_undist_dict[camera_id] = roi_undist
+            self.imsize_dict[camera_id] = (roi_undist[2], roi_undist[3])
+            self.mask_dict[camera_id] = mask
 
         # size of the scene measured by cameras
         camera_locations = camtoworlds[:, :3, 3]
@@ -284,6 +331,7 @@ def __getitem__(self, item: int) -> Dict[str, Any]:
         K = self.parser.Ks_dict[camera_id].copy()  # undistorted K
         params = self.parser.params_dict[camera_id]
         camtoworlds = self.parser.camtoworlds[index]
+        mask = self.parser.mask_dict[camera_id]
 
         if len(params) > 0:
             # Images are distorted. Undistort them.
@@ -310,6 +358,8 @@ def __getitem__(self, item: int) -> Dict[str, Any]:
             "image": torch.from_numpy(image).float(),
             "image_id": item,  # the index of the image in the dataset
         }
+        if mask is not None:
+            data["mask"] = torch.from_numpy(mask).bool()
 
         if self.load_depths:
             # projected points to image plane to get depths

diff --git a/examples/simple_trainer.py b/examples/simple_trainer.py
@@ -67,6 +67,8 @@ class Config:
     global_scale: float = 1.0
     # Normalize the world space
     normalize_world_space: bool = True
+    # Camera model
+    camera_model: Literal["pinhole", "fisheye"] = "pinhole"
 
     # Port for the viewer server
     port: int = 8080
@@ -432,6 +434,7 @@ def rasterize_splats(
         Ks: Tensor,
         width: int,
         height: int,
+        masks: Optional[Tensor] = None,
         **kwargs,
     ) -> Tuple[Tensor, Tensor, Dict]:
         means = self.splats["means"]  # [N, 3]
@@ -474,8 +477,11 @@ def rasterize_splats(
             sparse_grad=self.cfg.sparse_grad,
             rasterize_mode=rasterize_mode,
             distributed=self.world_size > 1,
+            fisheye=self.cfg.camera_model == "fisheye",
             **kwargs,
         )
+        if masks is not None:
+            render_colors[~masks] = 0
         return render_colors, render_alphas, info
 
     def train(self):
@@ -555,6 +561,7 @@ def train(self):
                 pixels.shape[0] * pixels.shape[1] * pixels.shape[2]
             )
             image_ids = data["image_id"].to(device)
+            masks = data["mask"].to(device) if "mask" in data else None  # [1, H, W]
             if cfg.depth_loss:
                 points = data["points"].to(device)  # [1, M, 2]
                 depths_gt = data["depths"].to(device)  # [1, M]
@@ -581,6 +588,7 @@ def train(self):
                 far_plane=cfg.far_plane,
                 image_ids=image_ids,
                 render_mode="RGB+ED" if cfg.depth_loss else "RGB",
+                masks=masks,
             )
             if renders.shape[-1] == 4:
                 colors, depths = renders[..., 0:3], renders[..., 3:4]
@@ -806,6 +814,7 @@ def eval(self, step: int, stage: str = "val"):
             camtoworlds = data["camtoworld"].to(device)
             Ks = data["K"].to(device)
             pixels = data["image"].to(device) / 255.0
+            masks = data["mask"].to(device) if "mask" in data else None
             height, width = pixels.shape[1:3]
 
             torch.cuda.synchronize()
@@ -818,6 +827,7 @@ def eval(self, step: int, stage: str = "val"):
                 sh_degree=cfg.sh_degree,
                 near_plane=cfg.near_plane,
                 far_plane=cfg.far_plane,
+                masks=masks,
             )  # [1, H, W, 3]
             torch.cuda.synchronize()
             ellipse_time += time.time() - tic
@@ -909,7 +919,10 @@ def render_traj(self, step: int):
         K = torch.from_numpy(list(self.parser.Ks_dict.values())[0]).float().to(device)
         width, height = list(self.parser.imsize_dict.values())[0]
 
-        canvas_all = []
+        # save to video
+        video_dir = f"{cfg.result_dir}/videos"
+        os.makedirs(video_dir, exist_ok=True)
+        writer = imageio.get_writer(f"{video_dir}/traj_{step}.mp4", fps=30)
         for i in tqdm.trange(len(camtoworlds_all), desc="Rendering trajectory"):
             camtoworlds = camtoworlds_all[i : i + 1]
             Ks = K[None]
@@ -932,13 +945,6 @@ def render_traj(self, step: int):
             # write images
             canvas = torch.cat(canvas_list, dim=2).squeeze(0).cpu().numpy()
             canvas = (canvas * 255).astype(np.uint8)
-            canvas_all.append(canvas)
-
-        # save to video
-        video_dir = f"{cfg.result_dir}/videos"
-        os.makedirs(video_dir, exist_ok=True)
-        writer = imageio.get_writer(f"{video_dir}/traj_{step}.mp4", fps=30)
-        for canvas in canvas_all:
             writer.append_data(canvas)
         writer.close()
         print(f"Video saved to {video_dir}/traj_{step}.mp4")

diff --git a/gsplat/cuda/_torch_impl.py b/gsplat/cuda/_torch_impl.py
@@ -119,6 +119,71 @@ def _persp_proj(
     return means2d, cov2d  # [C, N, 2], [C, N, 2, 2]
 
 
+def _fisheye_proj(
+    means: Tensor,  # [C, N, 3]
+    covars: Tensor,  # [C, N, 3, 3]
+    Ks: Tensor,  # [C, 3, 3]
+    width: int,
+    height: int,
+) -> Tuple[Tensor, Tensor]:
+    """PyTorch implementation of fisheye projection for 3D Gaussians.
+
+    Args:
+        means: Gaussian means in camera coordinate system. [C, N, 3].
+        covars: Gaussian covariances in camera coordinate system. [C, N, 3, 3].
+        Ks: Camera intrinsics. [C, 3, 3].
+        width: Image width.
+        height: Image height.
+
+    Returns:
+        A tuple:
+
+        - **means2d**: Projected means. [C, N, 2].
+        - **cov2d**: Projected covariances. [C, N, 2, 2].
+    """
+    C, N, _ = means.shape
+
+    x, y, z = torch.unbind(means, dim=-1)  # [C, N]
+
+    fx = Ks[..., 0, 0, None]  # [C, 1]
+    fy = Ks[..., 1, 1, None]  # [C, 1]
+    cx = Ks[..., 0, 2, None]  # [C, 1]
+    cy = Ks[..., 1, 2, None]  # [C, 1]
+
+    eps = 0.0000001
+    xy_len = (x**2 + y**2) ** 0.5 + eps
+    theta = torch.atan2(xy_len, z + eps)
+    means2d = torch.stack(
+        [
+            x * fx * theta / xy_len + cx,
+            y * fy * theta / xy_len + cy,
+        ],
+        dim=-1,
+    )
+
+    x2 = x * x + eps
+    y2 = y * y
+    xy = x * y
+    x2y2 = x2 + y2
+    x2y2z2_inv = 1.0 / (x2y2 + z * z)
+    b = torch.atan2(xy_len, z) / xy_len / x2y2
+    a = z * x2y2z2_inv / (x2y2)
+    J = torch.stack(
+        [
+            fx * (x2 * a + y2 * b),
+            fx * xy * (a - b),
+            -fx * x * x2y2z2_inv,
+            fy * xy * (a - b),
+            fy * (y2 * a + x2 * b),
+            -fy * y * x2y2z2_inv,
+        ],
+        dim=-1,
+    ).reshape(C, N, 2, 3)
+
+    cov2d = torch.einsum("...ij,...jk,...kl->...il", J, covars, J.transpose(-1, -2))
+    return means2d, cov2d  # [C, N, 2], [C, N, 2, 2]
+
+
 def _ortho_proj(
     means: Tensor,  # [C, N, 3]
     covars: Tensor,  # [C, N, 3, 3]
@@ -193,6 +258,7 @@ def _fully_fused_projection(
     far_plane: float = 1e10,
     calc_compensations: bool = False,
     ortho: bool = False,
+    fisheye: bool = False,
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Optional[Tensor]]:
     """PyTorch implementation of `gsplat.cuda._wrapper.fully_fused_projection()`
 
@@ -205,6 +271,8 @@ def _fully_fused_projection(
 
     if ortho:
         means2d, covars2d = _ortho_proj(means_c, covars_c, Ks, width, height)
+    elif fisheye:
+        means2d, covars2d = _fisheye_proj(means_c, covars_c, Ks, width, height)
     else:
         means2d, covars2d = _persp_proj(means_c, covars_c, Ks, width, height)