From 499c3e85a312f2c53444a4430b30a4bebe2f91cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Harri=20Sma=CC=8Att?= Date: Tue, 15 Oct 2024 18:24:59 +0300 Subject: [PATCH] Convert from Depth Pro default 1536x1536 implementation to 1024x1024 tensor CoreML packages --- .gitignore | 4 + README.md | 2 + convert_to_coreml.py | 191 +++++++++++++++++++++++++++++++ src/depth_pro/network/encoder.py | 12 +- 4 files changed, 203 insertions(+), 6 deletions(-) create mode 100644 .gitignore create mode 100644 convert_to_coreml.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9177316 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ + +checkpoints +out + diff --git a/README.md b/README.md index 6c4ea61..cc6d313 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +# This fork adds `convert_to_coreml.py` script to convert original Depth Pro model to CoreML programs + ## Depth Pro: Sharp Monocular Metric Depth in Less Than a Second This software project accompanies the research paper: diff --git a/convert_to_coreml.py b/convert_to_coreml.py new file mode 100644 index 0000000..9a681fe --- /dev/null +++ b/convert_to_coreml.py @@ -0,0 +1,191 @@ +import coremltools as ct +import logging +import math +import numpy as np + +import torch +from torch import nn +from torch.nn import functional as F + +from matplotlib import pyplot as plt +from typing import Tuple + +from src.depth_pro.depth_pro import ( + create_model_and_transforms, + create_backbone_model, + DEFAULT_MONODEPTH_CONFIG_DICT +) +from src.depth_pro.network.fov import FOVNetwork +from src.depth_pro.network.vit import resize_vit, resize_patch_embed +from src.depth_pro.utils import load_rgb + +from torchvision.transforms import ( + Compose, + ConvertImageDtype, + Lambda, + Normalize, + ToTensor +) + +""" +example.jpg fov_deg = +default 1536x1536: 48.4297 +scaled 1024x1024: 49.8382 +""" + +class DepthProRun(nn.Module): + def __init__(self, transform: nn.Module, encoder: nn.Module, decoder: nn.Module, depth: nn.Module): + super().__init__() + self.transform = transform + self.encoder = encoder + self.decoder = decoder + self.depth = depth + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[0] == 3: + x = x.unsqueeze(0) + image = self.transform(x) + encodings = self.encoder(image) + features, features_0 = self.decoder(encodings) + depth = self.depth([image, features, features_0]) + return depth + +class Depth(nn.Module): + def __init__(self, head: nn.Module, fov: nn.Module): + super(Depth, self).__init__() + self.head = head + self.fov = fov + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + x = inputs[0] + features = inputs[1] + features_0 = inputs[2] + + _, _, H, W = features_0.shape + if H != 48 or W != 48: + features_0 = F.interpolate( + features_0, + size=(48, 48), + mode="bilinear", + align_corners=False, + ) + + # execute fov.forward locally with a different scale_factor + # fov_deg = self.fov.forward(x, features_0.detach()) + if hasattr(self.fov, "encoder"): + x = F.interpolate( + x, + size=None, + # result size needs to be 374 + scale_factor=0.375, + mode="bilinear", + align_corners=False, + ) + x = self.fov.encoder(x)[:, 1:].permute(0, 2, 1) + lowres_feature = self.fov.downsample(features_0.detach()) + x = x.reshape_as(lowres_feature) + lowres_feature + else: + x = features_0.detach() + fov_deg = self.fov.head(x) + f_px = 0.5 * torch.tan(math.pi * fov_deg.to(torch.float) / 360.0) + + canonical_inverse_depth = self.head(features) + inverse_depth = canonical_inverse_depth * f_px + depth = 1.0 / inverse_depth.clamp(min=1e-4, max=1e4) + return depth + +class Interpolate(nn.Module): + def __init__(self, size, mode): + super(Interpolate, self).__init__() + self.size = size + self.mode = mode + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.interpolate(x, size=self.size, mode=self.mode, align_corners=False) + return x + +def save_mlpackage(G, shapes, name): + G.eval() + G_inputs = [] + convert_inputs = [] + for shape in shapes: + G_inputs.append(torch.randn(shape)) + convert_inputs.append(ct.TensorType(shape=shape, dtype=np.float16)) + G_trace = torch.jit.trace(G, G_inputs if len(G_inputs) == 1 else [G_inputs]) + G_model = ct.convert( + G_trace, + inputs=convert_inputs if len(convert_inputs) <= 1 else [convert_inputs], + minimum_deployment_target=ct.target.macOS15, + compute_precision=ct.precision.FLOAT16, + compute_units=ct.ComputeUnit.CPU_AND_NE + ) + G_model.save("out/" + name + ".mlpackage") + +def create_scaled_model() -> Tuple[nn.Module, nn.Module, nn.Module]: + # from run.py + model, _ = create_model_and_transforms( + device=torch.device("cpu"), + precision=torch.float32, + ) + + # resize to 256x4 = 1024x1024 input image + new_img_size = (256, 256) + + model.encoder.patch_encoder = resize_patch_embed(model.encoder.patch_encoder) + model.encoder.patch_encoder = resize_vit(model.encoder.patch_encoder, img_size=new_img_size) + model.encoder.image_encoder = resize_patch_embed(model.encoder.image_encoder) + model.encoder.image_encoder = resize_vit(model.encoder.image_encoder, img_size=new_img_size) + model.encoder.out_size = int( + model.encoder.patch_encoder.patch_embed.img_size[0] // model.encoder.patch_encoder.patch_embed.patch_size[0] + ) + + # this is still under works to resize fov_encoder to 256x256 size too + # fov_encoder, _ = create_backbone_model(preset = DEFAULT_MONODEPTH_CONFIG_DICT.fov_encoder_preset) + # fov_encoder = resize_patch_embed(fov_encoder) + # fov_encoder = resize_vit(fov_encoder, img_size=new_img_size) + # model.fov = FOVNetwork(num_features=model.decoder.dim_decoder, fov_encoder=fov_encoder) + + # from depth_pro.py + transform = nn.Sequential( + #[ + #ToTensor(), + #Lambda(lambda x: x.to(device)), + Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), + Interpolate( + size=(model.img_size, model.img_size), + mode="bilinear" + ), + ConvertImageDtype(torch.float32), + #] + ) + + depth = Depth(model.head, model.fov) + return transform, model, depth + +def load_and_show_example(transform: nn.Module, model: nn.Module, depth: nn.Module): + image, _, _ = load_rgb("data/example.jpg") + depth_pro_run = DepthProRun(transform, model.encoder, model.decoder, depth) + + depth_pro = Compose([ToTensor(), Lambda(lambda x: x.to(torch.device("cpu"))), depth_pro_run]) + depth_map = depth_pro(image).detach().cpu().numpy().squeeze() + + plt.ion() + fig = plt.figure() + ax_rgb = fig.add_subplot(121) + ax_disp = fig.add_subplot(122) + ax_rgb.imshow(image) + ax_disp.imshow(depth_map, cmap="turbo") + fig.canvas.draw() + fig.canvas.flush_events() + plt.show(block=True) + +def save_coreml_packages(transform: nn.Module, model: nn.Module, depth: nn.Module): + save_mlpackage(transform, [[1, 3, 1024, 1024]], "DepthPro_transform") + save_mlpackage(model.encoder, [[1, 3, 1024, 1024]], "DepthPro_encoder") + save_mlpackage(model.decoder, [[1, 256, 512, 512], [1, 256, 256, 256], [1, 512, 128, 128], [1, 1024, 64, 64], [1, 1024, 32, 32]], "DepthPro_decoder") + save_mlpackage(depth, [[1, 3, 1024, 1024], [1, 256, 512, 512], [1, 256, 32, 32]], "DepthPro_depth") + +if __name__ == "__main__": + transform, model, depth = create_scaled_model() + load_and_show_example(transform, model, depth) + save_coreml_packages(transform, model, depth) diff --git a/src/depth_pro/network/encoder.py b/src/depth_pro/network/encoder.py index a3a3da1..eea1bbd 100644 --- a/src/depth_pro/network/encoder.py +++ b/src/depth_pro/network/encoder.py @@ -169,7 +169,7 @@ def _create_pyramid( def split(self, x: torch.Tensor, overlap_ratio: float = 0.25) -> torch.Tensor: """Split the input into small patches with sliding window.""" - patch_size = 384 + patch_size = 256 patch_stride = int(patch_size * (1 - overlap_ratio)) image_size = x.shape[-1] @@ -276,7 +276,7 @@ def forward(self, x: torch.Tensor) -> list[torch.Tensor]: self.out_size, ) x_latent0_features = self.merge( - x_latent0_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + x_latent0_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=2 ) x_latent1_encodings = self.reshape_feature( @@ -285,21 +285,21 @@ def forward(self, x: torch.Tensor) -> list[torch.Tensor]: self.out_size, ) x_latent1_features = self.merge( - x_latent1_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + x_latent1_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=2 ) # Split the 35 batch size from pyramid encoding back into 5x5+3x3+1x1. x0_encodings, x1_encodings, x2_encodings = torch.split( x_pyramid_encodings, - [len(x0_patches), len(x1_patches), len(x2_patches)], + [x0_patches.shape[0], x1_patches.shape[0], x2_patches.shape[0]], dim=0, ) # 96x96 feature maps by merging 5x5 @ 24x24 patches with overlaps. - x0_features = self.merge(x0_encodings, batch_size=batch_size, padding=3) + x0_features = self.merge(x0_encodings, batch_size=batch_size, padding=2) # 48x84 feature maps by merging 3x3 @ 24x24 patches with overlaps. - x1_features = self.merge(x1_encodings, batch_size=batch_size, padding=6) + x1_features = self.merge(x1_encodings, batch_size=batch_size, padding=4) # 24x24 feature maps. x2_features = x2_encodings