from marigold.marigold import MarigoldPipeline
# pix2pix/merge net imports
from pix2pix.options.test_options import TestOptions
+# depth_anything_v2
+try:
+    from depth_anything_v2 import DepthAnythingV2
+except ImportError:
+    print('depth_anything_v2 import failed')

# Our code
from src.misc import *
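Note: if the guarded import above fails, DepthAnythingV2 is simply left undefined, so selecting one of the V2 model types (12-14) below would surface as a NameError when load_models constructs the model.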
@@ -80,6 +85,8 @@ def load_models(self, model_type, device: torch.device, boost: bool, tiling_mode
        model_dir = "./models/leres"
        if model_type == 11:
            model_dir = "./models/depth_anything"
+        if model_type in [12, 13, 14]:
+            model_dir = "./models/depth_anything_v2"

        # create paths to model if not present
        os.makedirs(model_dir, exist_ok=True)
@@ -227,6 +234,19 @@ def load_models(self, model_type, device: torch.device, boost: bool, tiling_mode
                                   "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth")

            model.load_state_dict(torch.load(model_path))
+        elif model_type in [12, 13, 14]:  # depth_anything_v2 small, base, large
+            letter = {12: 's', 13: 'b', 14: 'l'}[model_type]
+            word = {12: 'Small', 13: 'Base', 14: 'Large'}[model_type]
+            model_path = f"{model_dir}/depth_anything_v2_vit{letter}.pth"
+            ensure_file_downloaded(model_path,
+                                   f"https://huggingface.co/depth-anything/Depth-Anything-V2-{word}/resolve/main/depth_anything_v2_vit{letter}.pth")
+            model_configs = {'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+                             'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+                             'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+                             'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}}
+            model = DepthAnythingV2(**model_configs[f'vit{letter}'])
+            model.load_state_dict(torch.load(model_path, map_location='cpu'))
+            # 15 is reserved for Depth Anything V2 Giant

        if tiling_mode:
            def flatten(el):
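For reference, a minimal standalone sketch of what this branch does for the Small variant (model_type 12). The import, config, and map_location='cpu' mirror the code above; the checkpoint path assumes the download step already ran:

import torch
from depth_anything_v2 import DepthAnythingV2

# Small (vits) variant, built exactly as in the branch above
cfg = {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}
model = DepthAnythingV2(**cfg)
state = torch.load('./models/depth_anything_v2/depth_anything_v2_vits.pth', map_location='cpu')
model.load_state_dict(state)
model.eval()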
@@ -250,6 +270,9 @@ def flatten(el):
        # TODO: Fix for zoedepth_n - it completely trips and generates black images
        if model_type in [1, 2, 3, 4, 5, 6, 8, 9, 11] and not boost:
            model = model.half()
+        if model_type in [12, 13, 14]:
+            model.depth_head.half()
+            model.pretrained.half()
        model.to(device)  # to correct device

        self.depth_model = model
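Note that, unlike the V1 path above, which halves the whole model, only the depth_head and pretrained submodules are cast to fp16 here. A quick sanity check of the resulting dtypes (a sketch, assuming load_models ran with model_type in [12, 13, 14]):

# Both cast submodules should now hold fp16 parameters:
assert next(model.depth_head.parameters()).dtype == torch.float16
assert next(model.pretrained.parameters()).dtype == torch.float16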
@@ -291,7 +314,10 @@ def get_default_net_size(model_type):
            8: [384, 768],
            9: [384, 512],
            10: [768, 768],
-            11: [518, 518]
+            11: [518, 518],
+            12: [518, 518],
+            13: [518, 518],
+            14: [518, 518]
        }
        if model_type in sizes:
            return sizes[model_type]
@@ -350,6 +376,8 @@ def get_raw_prediction(self, input, net_width, net_height):
                                             self.marigold_ensembles, self.marigold_steps)
        elif self.depth_model_type == 11:
            raw_prediction = estimatedepthanything(img, self.depth_model, net_width, net_height)
+        elif self.depth_model_type in [12, 13, 14]:
+            raw_prediction = estimatedepthanything_v2(img, self.depth_model, net_width, net_height)
        else:
            raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model,
                                           self.boost_rmax)
@@ -499,6 +527,20 @@ def estimatedepthanything(image, model, w, h):
    return depth.cpu().numpy()


+def estimatedepthanything_v2(image, model, w, h):
+    # This is an awkward re-conversion, but it should not impact quality
+    img = cv2.cvtColor((image * 255.1).astype('uint8'), cv2.COLOR_BGR2RGB)
+    with torch.no_grad():
+        # Compare to: model.infer_image(img, w)
+        image, (h, w) = model.image2tensor(img, w)
+        # Cast the input to the model's dtype, read off one of its parameters (which one is arbitrary)
+        image_casted = image.type_as(model.pretrained.blocks[0].norm1.weight.data)
+        depth = model.forward(image_casted).type_as(image)
+        import torch.nn.functional as F
+        depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
+        return depth.cpu().numpy()
+
+
class ImageandPatchs:
    def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1):
        self.root_dir = root_dir
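A minimal sketch of calling the new helper directly, assuming a DepthAnythingV2 model loaded as in load_models above. The float BGR [0, 1] input convention and the 518 net size come from the surrounding code; 'input.png' is a placeholder:

import cv2
import numpy as np

bgr = cv2.imread('input.png').astype(np.float32) / 255.0   # float BGR in [0, 1]
depth = estimatedepthanything_v2(bgr, model, 518, 518)     # float map at the input resolution
# Normalize to [0, 1] for visualization
depth_vis = (depth - depth.min()) / (depth.max() - depth.min() + 1e-6)
cv2.imwrite('depth.png', (depth_vis * 255).astype(np.uint8))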
@@ -720,6 +762,8 @@ def estimateboost(img, model, model_type, pix2pixmodel, whole_size_threshold):
        net_receptive_field_size = 512
    elif model_type == 11:  # depth_anything
        net_receptive_field_size = 518
+    elif model_type in [12, 13, 14]:  # depth_anything_v2
+        net_receptive_field_size = 518
    else:  # other midas  # TODO Marigold support
        net_receptive_field_size = 384
    patch_netsize = 2 * net_receptive_field_size
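With the V2 entries this matches V1: patch_netsize = 2 * 518 = 1036 pixels per boost patch.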
@@ -995,6 +1039,8 @@ def singleestimate(img, msize, model, net_type):
        return estimatemarigold(img, model, msize, msize)
    elif net_type == 11:
        return estimatedepthanything(img, model, msize, msize)
+    elif net_type in [12, 13, 14]:
+        return estimatedepthanything_v2(img, model, msize, msize)
    elif net_type >= 7:
        # np to PIL
        return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize)