diff --git a/README.md b/README.md
index 6c4ea61..8d58c28 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@ This software project accompanies the research paper:
 **[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**,
 *Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, and Vladlen Koltun*.
 
+[![Replicate](https://replicate.com/chenxwh/ml-depth-pro/badge)](https://replicate.com/chenxwh/ml-depth-pro)
+
 ![](data/depth-pro-teaser.jpg)
 
 We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image.
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 0000000..93be17a
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,31 @@
+# Configuration for Cog ⚙️
+# Reference: https://cog.run/yaml
+
+build:
+  # set to true if your model requires a GPU
+  gpu: true
+
+  # a list of ubuntu apt packages to install
+  system_packages:
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+
+  # python version in the form '3.11' or '3.11.4'
+  python_version: "3.9"
+
+  # a list of packages in the format <package-name>==<version>
+  python_packages:
+    - torch
+    - torchvision
+    - timm
+    - numpy<2
+    - pillow_heif
+    - matplotlib
+
+  # commands run after the environment is setup
+  run:
+    - pip install ipython
+    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 0000000..dbadb8e
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,107 @@
+# Prediction interface for Cog ⚙️
+# https://cog.run/python
+
+import os
+import subprocess
+import time
+import numpy as np
+import PIL.Image
+import torch
+from matplotlib import pyplot as plt
+from cog import BasePredictor, Input, Path, BaseModel
+
+from src.depth_pro import create_model_and_transforms, load_rgb
+from src.depth_pro.depth_pro import DepthProConfig
+
+
+MODEL_CACHE = "checkpoints"
+MODEL_URL = (
+    f"https://weights.replicate.delivery/default/apple/ml-depth-pro/{MODEL_CACHE}.tar"
+)
+
+os.environ.update(
+    {
+        "HF_DATASETS_OFFLINE": "1",
+        "TRANSFORMERS_OFFLINE": "1",
+        "HF_HOME": MODEL_CACHE,
+        "TORCH_HOME": MODEL_CACHE,
+        "HF_DATASETS_CACHE": MODEL_CACHE,
+        "TRANSFORMERS_CACHE": MODEL_CACHE,
+        "HUGGINGFACE_HUB_CACHE": MODEL_CACHE,
+    }
+)
+
+
+class ModelOutput(BaseModel):
+    npz: Path
+    color_map: Path
+
+
+def download_weights(url, dest):
+    start = time.time()
+    print("downloading url: ", url)
+    print("downloading to: ", dest)
+    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
+    print("downloading took: ", time.time() - start)
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+
+        if not os.path.exists(MODEL_CACHE):
+            download_weights(MODEL_URL, MODEL_CACHE)
+
+        self.model, self.transform = create_model_and_transforms(
+            config=DepthProConfig(
+                patch_encoder_preset="dinov2l16_384",
+                image_encoder_preset="dinov2l16_384",
+                checkpoint_uri=f"./{MODEL_CACHE}/depth_pro.pt",
+                decoder_features=256,
+                use_fov_head=True,
+                fov_encoder_preset="dinov2l16_384",
+            ),
+            device=torch.device("cuda:0"),
+            precision=torch.half,
+        )
+        self.model.eval()
+
+    def predict(
+        self,
+        image_path: Path = Input(description="Input image"),
+    ) -> ModelOutput:
+        """Run a single prediction on the model"""
+
+        image, _, f_px = load_rgb(image_path)
+
+        # Run prediction. If `f_px` is provided, it is used to estimate the final metric depth,
+        # otherwise the model estimates `f_px` to compute the depth metricness.
+        prediction = self.model.infer(self.transform(image), f_px=f_px)
+
+        # Extract the depth and focal length.
+        depth = prediction["depth"].detach().cpu().numpy().squeeze()
+        if f_px is not None:
+            print(f"Focal length (from exif): {f_px:0.2f}")
+        elif prediction["focallength_px"] is not None:
+            focallength_px = prediction["focallength_px"].detach().cpu().item()
+            print(f"Estimated focal length: {focallength_px}")
+
+        inverse_depth = 1 / depth
+        # Visualize inverse depth instead of depth, clipped to [0.1m;250m] range for better visualization.
+        max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1)
+        min_invdepth_vizu = max(1 / 250, inverse_depth.min())
+        inverse_depth_normalized = (inverse_depth - min_invdepth_vizu) / (
+            max_invdepth_vizu - min_invdepth_vizu
+        )
+
+        # Save the depth map as an npz file.
+        out_npz = "/tmp/out.npz"
+        np.savez_compressed(out_npz, depth=depth)
+
+        # Save a color-mapped "turbo" jpg image for visualization.
+        cmap = plt.get_cmap("turbo")
+        color_depth = (cmap(inverse_depth_normalized)[..., :3] * 255).astype(np.uint8)
+        out_color_map = "/tmp/out.jpg"
+        PIL.Image.fromarray(color_depth).save(out_color_map, format="JPEG", quality=90)
+
+        return ModelOutput(npz=Path(out_npz), color_map=Path(out_color_map))
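
For a quick local smoke test of the new predictor, outside the Cog HTTP server, a sketch like the one below should work. It is not part of the PR: it assumes a CUDA GPU, the repository root as the working directory, and a hypothetical `example.jpg` input; the weights tarball is fetched by `setup()` on the first run.

```python
# Minimal local sketch (not part of the PR): drive the Cog predictor directly.
from cog import Path

from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads checkpoints/ via pget if missing, then loads Depth Pro
output = predictor.predict(image_path=Path("example.jpg"))  # "example.jpg" is a placeholder
print(output.npz)        # /tmp/out.npz containing the metric depth map
print(output.color_map)  # /tmp/out.jpg, the turbo-colormapped inverse depth
```

Equivalently, with the Cog CLI installed, `cog predict -i image_path=@example.jpg` should exercise the same path end to end inside the container described by `cog.yaml`.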