# depth_to_pointcloud_final.py
import argparse
import cv2
import glob
import numpy as np
import open3d as o3d
import os
from PIL import Image
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt


def colorize(value, vmin=None, vmax=None, cmap=None):
    """
    Apply a colormap to a grayscale image (numpy array).
    """
    value = value.copy()  # Work on a copy of the input array
    if vmin is None:
        vmin = np.min(value)  # Min value
    if vmax is None:
        vmax = np.max(value)  # Max value
    if cmap is None:
        cmap = "magma"  # Colormap used to visualize the depth map
    # Normalize value to 0-1
    value = (value - vmin) / (vmax - vmin + 1e-8)
    value = np.clip(value, 0, 1)
    # Apply colormap
    cmapper = plt.get_cmap(cmap)
    value = cmapper(value, bytes=True)  # (H, W, 4), uint8
    img = value[:, :, :3]  # Drop the alpha channel to get RGB
    return img  # RGB image as a uint8 NumPy array
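# Example usage (hypothetical array): given a float32 depth map `depth`,
# Image.fromarray(colorize(depth, cmap="magma")).save("depth_vis.png")
# saves a colorized visualization.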


def process_images(model):
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    image_paths = glob.glob(os.path.join(INPUT_DIR, "*.png")) + glob.glob(
        os.path.join(INPUT_DIR, "*.jpg")
    )
    for image_path in tqdm(image_paths, desc="Processing Images"):
        try:
            # Read image using OpenCV (BGR channel order)
            image_bgr = cv2.imread(image_path)
            if image_bgr is None:
                print(f"Error reading {image_path}")
                continue
            # Keep an RGB copy for visualization and point-cloud colours
            image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
            original_height, original_width = image.shape[:2]
            # Output at the original resolution
            FINAL_HEIGHT = original_height
            FINAL_WIDTH = original_width
            # infer_image expects the raw BGR array (it converts to RGB internally)
            pred = model.infer_image(image_bgr)
            predm = pred.squeeze()
print("Saving images ...")
# Resize color image and depth to final size
color_image = Image.fromarray(image)
resized_color_image = color_image.resize(
(FINAL_WIDTH, FINAL_HEIGHT), Image.LANCZOS
)
resized_pred = Image.fromarray(predm).resize(
(FINAL_WIDTH, FINAL_HEIGHT), Image.NEAREST
)
focal_length_x, focal_length_y = (
(FX, FY) if not NYU_DATA else (FL, FL)
) # If NYU_DATA is False use FX, FY otherwise FL
x, y = np.meshgrid(np.arange(FINAL_WIDTH), np.arange(FINAL_HEIGHT))
x = (x - FINAL_WIDTH / 2) / focal_length_x
y = (y - FINAL_HEIGHT / 2) / focal_length_y
z = np.array(resized_pred)
points = np.stack(
(np.multiply(x, z), np.multiply(y, z), z), axis=-1
).reshape(-1, 3)
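            # Back-projection follows the pinhole camera model with the
            # principal point assumed at the image centre: for a pixel (u, v)
            # with depth z, X = (u - W/2) / fx * z, Y = (v - H/2) / fy * z, Z = z.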
            # Optional: save the raw prediction and colorized variants
            # Image.fromarray(predm).convert("L").resize(
            #     (FINAL_WIDTH, FINAL_HEIGHT), Image.NEAREST
            # ).save(
            #     os.path.join(
            #         OUTPUT_DIR,
            #         os.path.splitext(os.path.basename(image_path))[0] + "_pred01.png",
            #     )
            # )
            # p = colorize(predm, cmap="magma_r")
            # Image.fromarray(p).save(
            #     os.path.join(
            #         OUTPUT_DIR,
            #         os.path.splitext(os.path.basename(image_path))[0] + "_pred02.png",
            #     )
            # )
            # pm = colorize(z, cmap="magma_r")
            # Image.fromarray(pm).save(
            #     os.path.join(
            #         OUTPUT_DIR,
            #         os.path.splitext(os.path.basename(image_path))[0] + "_pred03.png",
            #     )
            # )
            z_norm = (z - z.min()) / (z.max() - z.min() + 1e-8)
            imgdepth = o3d.geometry.Image((z_norm * 255).astype(np.uint8))
            o3d.io.write_image(
                os.path.join(
                    OUTPUT_DIR,
                    os.path.splitext(os.path.basename(image_path))[0] + "_pred04.png",
                ),
                imgdepth,
            )
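            # Note: z_norm rescales each image's depth to [0, 1], so the saved
            # PNG encodes relative depth only; the metric range is printed below.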
print(f"Depth range: {z.min()} to {z.max()}")
# Uncomment the following lines if you want to save point clouds
# colors = np.array(resized_color_image).reshape(-1, 3) / 255.0
# pcd = o3d.geometry.PointCloud()
# pcd.points = o3d.utility.Vector3dVector(points)
# pcd.colors = o3d.utility.Vector3dVector(colors)
# o3d.io.write_point_cloud(
# os.path.join(
# OUTPUT_DIR,
# os.path.splitext(os.path.basename(image_path))[0] + ".ply",
# ),
# pcd,
# )
except Exception as e:
print(f"Error processing {image_path}: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="Process images to generate depth maps and point clouds."
    )
    parser.add_argument(
        "--input-dir",
        type=str,
        required=True,
        help="Directory containing input images.",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        help="Directory to save outputs.",
    )
    parser.add_argument(
        "--model-path",
        type=str,
        required=True,
        help="Path to the pre-trained model weights.",
    )
    parser.add_argument(
        "--encoder",
        default="vitl",
        type=str,
        choices=["vits", "vitb", "vitl", "vitg"],
        help="Model encoder to use.",
    )
    parser.add_argument(
        "--max-depth",
        default=20,
        type=float,
        choices=[20, 80],  # 20 for the indoor model, 80 for the outdoor model
        help="Maximum depth value for the depth map.",
    )
    parser.add_argument(
        "--fx", default=1109, type=float, help="Focal length along the x-axis."
    )
    parser.add_argument(
        "--fy", default=1109, type=float, help="Focal length along the y-axis."
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="hypersim",
        choices=["hypersim", "vkitti"],  # Matches the checkpoint: indoor or outdoor
        help="Training dataset of the metric checkpoint (hypersim = indoor, vkitti = outdoor).",
    )
    parser.add_argument(
        "--nyu-data", action="store_true", help="Flag to indicate if using NYU dataset."
    )
    args = parser.parse_args()
    # Set the default output directory based on dataset and encoder
    if args.output_dir is None:
        args.output_dir = os.path.join("./output", args.dataset, args.encoder)
    # Set global variables used in process_images
    global OUTPUT_DIR, INPUT_DIR, FX, FY, FL, NYU_DATA, DATASET
    OUTPUT_DIR = args.output_dir
    INPUT_DIR = args.input_dir
    FX = args.fx
    FY = args.fy
    FL = 715.0873  # NYU focal length; used instead of FX/FY when --nyu-data is set
    NYU_DATA = args.nyu_data
    DATASET = args.dataset
    # Determine the device to use (CUDA, MPS, or CPU)
    DEVICE = (
        "cuda"
        if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available() else "cpu"
    )
    # Model configuration based on the chosen encoder
    model_configs = {
        "vits": {
            "encoder": "vits",
            "features": 64,
            "out_channels": [48, 96, 192, 384],
        },
        "vitb": {
            "encoder": "vitb",
            "features": 128,
            "out_channels": [96, 192, 384, 768],
        },
        "vitl": {
            "encoder": "vitl",
            "features": 256,
            "out_channels": [256, 512, 1024, 1024],
        },
        "vitg": {
            "encoder": "vitg",
            "features": 384,
            "out_channels": [1536, 1536, 1536, 1536],
        },
    }
    # Initialize the model
    from depth_anything_v2.dpt import DepthAnythingV2

    depth_anything = DepthAnythingV2(
        **{**model_configs[args.encoder], "max_depth": args.max_depth}
    )
    depth_anything.load_state_dict(torch.load(args.model_path, map_location="cpu"))
    depth_anything = depth_anything.to(DEVICE).eval()
    # Call the process_images function
    process_images(depth_anything)


if __name__ == "__main__":
    main()
# Example usage:
# A default output dir is created if one is not provided.
# Indoor:
# python3 depth_to_pointcloud_final.py --input-dir ../data/test/test-input/ --model-path checkpoints/indoor/depth_anything_v2_metric_hypersim_vitl.pth
# Outdoor:
# python3 depth_to_pointcloud_final.py --input-dir ../data/actual-data/ --model-path checkpoints/outdoor/depth_anything_v2_metric_vkitti_vitl.pth --encoder vitl --max-depth 80 --dataset vkitti
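# To inspect a saved point cloud (after uncommenting the point-cloud block in
# process_images), something like the following works with Open3D; the path is
# illustrative:
#   pcd = o3d.io.read_point_cloud("./output/hypersim/vitl/example.ply")
#   o3d.visualization.draw_geometries([pcd])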