Add Replicate demo and API #39

Open · wants to merge 1 commit into main
README.md (2 additions, 0 deletions)
@@ -4,6 +4,8 @@ This software project accompanies the research paper:
**[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**,
*Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, and Vladlen Koltun*.

[![Replicate](https://replicate.com/chenxwh/ml-depth-pro/badge)](https://replicate.com/chenxwh/ml-depth-pro)

![](data/depth-pro-teaser.jpg)

We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image.
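As a quick orientation before the Cog wrapper below, here is a minimal sketch of running the model locally with the same calls predict.py makes (create_model_and_transforms, load_rgb, model.infer). The import path, the default configuration, and the checkpoint location at ./checkpoints/depth_pro.pt are assumptions inferred from that file, not part of this PR.

import torch
from src.depth_pro import create_model_and_transforms, load_rgb

# Build the model and preprocessing transform (assumes default config and weights
# at ./checkpoints/depth_pro.pt; predict.py below passes an explicit DepthProConfig).
model, transform = create_model_and_transforms(
    device=torch.device("cuda:0"),
    precision=torch.half,
)
model.eval()

# f_px is the focal length in pixels, read from EXIF metadata when available.
image, _, f_px = load_rgb("example.jpg")
prediction = model.infer(transform(image), f_px=f_px)

depth = prediction["depth"]                    # metric depth in meters
focallength_px = prediction["focallength_px"]  # estimated when f_px is None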
cog.yaml (31 additions, 0 deletions)
@@ -0,0 +1,31 @@
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml

build:
  # set to true if your model requires a GPU
  gpu: true

  # a list of ubuntu apt packages to install
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"

  # python version in the form '3.11' or '3.11.4'
  python_version: "3.9"

  # a list of packages in the format <package-name>==<version>
  python_packages:
    - torch
    - torchvision
    - timm
    - numpy<2
    - pillow_heif
    - matplotlib

  # commands run after the environment is set up
  run:
    - pip install ipython
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
predict.py (110 additions, 0 deletions)
@@ -0,0 +1,110 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python

import os
import subprocess
import time
import numpy as np
import PIL.Image
import torch
from matplotlib import pyplot as plt
from cog import BasePredictor, Input, Path, BaseModel

from src.depth_pro import create_model_and_transforms, load_rgb
from src.depth_pro.depth_pro import DepthProConfig


MODEL_CACHE = "checkpoints"
MODEL_URL = (
    f"https://weights.replicate.delivery/default/apple/ml-depth-pro/{MODEL_CACHE}.tar"
)

# Run fully offline and point all model caches at the local checkpoint directory.
os.environ.update(
    {
        "HF_DATASETS_OFFLINE": "1",
        "TRANSFORMERS_OFFLINE": "1",
        "HF_HOME": MODEL_CACHE,
        "TORCH_HOME": MODEL_CACHE,
        "HF_DATASETS_CACHE": MODEL_CACHE,
        "TRANSFORMERS_CACHE": MODEL_CACHE,
        "HUGGINGFACE_HUB_CACHE": MODEL_CACHE,
    }
)


class ModelOutput(BaseModel):
    npz: Path
    color_map: Path


def download_weights(url, dest):
    # pget -x downloads the tar archive and extracts it into `dest`.
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)

        self.model, self.transform = create_model_and_transforms(
            config=DepthProConfig(
                patch_encoder_preset="dinov2l16_384",
                image_encoder_preset="dinov2l16_384",
                checkpoint_uri=f"./{MODEL_CACHE}/depth_pro.pt",
                decoder_features=256,
                use_fov_head=True,
                fov_encoder_preset="dinov2l16_384",
            ),
            device=torch.device("cuda:0"),
            precision=torch.half,
        )
        self.model.eval()

    def predict(
        self,
        image_path: Path = Input(description="Input image"),
    ) -> ModelOutput:
        """Run a single prediction on the model"""

        image, _, f_px = load_rgb(image_path)

        # Run prediction. If `f_px` is provided, it is used to compute the final metric depth;
        # otherwise the model estimates `f_px` itself and uses it to recover metric scale.
        prediction = self.model.infer(self.transform(image), f_px=f_px)

        # Extract the depth and focal length.
        depth = prediction["depth"].detach().cpu().numpy().squeeze()
        if f_px is not None:
            print(f"Focal length (from exif): {f_px:0.2f}")
        elif prediction["focallength_px"] is not None:
            focallength_px = prediction["focallength_px"].detach().cpu().item()
            print(f"Estimated focal length: {focallength_px}")

        inverse_depth = 1 / depth
        # Visualize inverse depth instead of depth, clipped to the [0.1 m, 250 m] range
        # for better visualization.
        max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1)
        min_invdepth_vizu = max(1 / 250, inverse_depth.min())
        inverse_depth_normalized = (inverse_depth - min_invdepth_vizu) / (
            max_invdepth_vizu - min_invdepth_vizu
        )
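        # Worked example with hypothetical clip bounds (a scene spanning the full
        # 0.1 m - 250 m range): a pixel 2 m away has inverse depth 0.5; with
        # min_invdepth_vizu = 1/250 = 0.004 and max_invdepth_vizu = 1/0.1 = 10,
        # its normalized value is (0.5 - 0.004) / (10 - 0.004) ≈ 0.05.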

        # Save the metric depth map as a compressed npz file.
        out_npz = "/tmp/out.npz"
        np.savez_compressed(out_npz, depth=depth)

        # Save the normalized inverse depth as a color-mapped "turbo" jpg image.
        cmap = plt.get_cmap("turbo")
        color_depth = (cmap(inverse_depth_normalized)[..., :3] * 255).astype(np.uint8)
        out_color_map = "/tmp/out.jpg"
        PIL.Image.fromarray(color_depth).save(out_color_map, format="JPEG", quality=90)

        return ModelOutput(npz=Path(out_npz), color_map=Path(out_color_map))
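Once deployed, the demo can be called through the Replicate API. The following is a rough sketch using the official Python client: the model slug chenxwh/ml-depth-pro comes from the README badge above, and the image_path input and npz/color_map outputs from predict.py; the exact return type (dict of URLs vs. file objects) depends on the client version and deployment, so treat it as an assumption.

# Rough sketch: calling the hosted model with the `replicate` Python client
# (pip install replicate; REPLICATE_API_TOKEN must be set in the environment).
import replicate

output = replicate.run(
    "chenxwh/ml-depth-pro",
    input={"image_path": open("example.jpg", "rb")},
)
# Expected to mirror ModelOutput: links to the compressed depth .npz and the color-mapped JPEG.
print(output)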