From 2ea304626d615f22b4871b35633984608227f74c Mon Sep 17 00:00:00 2001
From: dimtsiakm
Date: Mon, 19 Jan 2026 15:28:30 +0100
Subject: [PATCH] Depth processing for videos stored as frame-by-frame images
 in a folder, with customizable options

---
 run_streaming_frames.py | 128 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 run_streaming_frames.py

diff --git a/run_streaming_frames.py b/run_streaming_frames.py
new file mode 100644
index 0000000..db7fb0a
--- /dev/null
+++ b/run_streaming_frames.py
@@ -0,0 +1,128 @@
+# Copyright (2025) Bytedance Ltd. and/or its affiliates
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import numpy as np
+import os
+import torch
+import time
+import cv2
+import glob
+import matplotlib.cm as cm
+
+from video_depth_anything.video_depth_stream import VideoDepthAnything
+from utils.dc_utils import save_video
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Video Depth Anything')
+    parser.add_argument('--input_video', type=str, required=True, help='path to a folder containing input frames (*.jpg)')
+    parser.add_argument('--output_dir', type=str, default='./outputs')
+    parser.add_argument('--gpu_id', type=int, default=0)
+    parser.add_argument('--input_size', type=int, default=518)
+    parser.add_argument('--max_res', type=int, default=1280)
+    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl'])
+    parser.add_argument('--max_len', type=int, default=-1, help='maximum number of frames to process, -1 means no limit')
+    parser.add_argument('--target_fps', type=int, default=-1, help='target fps for frame sampling, -1 means the assumed original fps')
+    parser.add_argument('--metric', action='store_true', help='use metric model')
+    parser.add_argument('--fp32', action='store_true', help='run inference in torch.float32 (default is torch.float16)')
+    parser.add_argument('--grayscale', action='store_true', help='save grayscale depth maps instead of applying a color palette')
+
+    args = parser.parse_args()
+
+    DEVICE = f'cuda:{args.gpu_id}' if torch.cuda.is_available() else 'cpu'
+
+    model_configs = {
+        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
+    }
+    checkpoint_name = 'metric_video_depth_anything' if args.metric else 'video_depth_anything'
+
+    video_depth_anything = VideoDepthAnything(**model_configs[args.encoder])
+    video_depth_anything.load_state_dict(torch.load(f'./checkpoints/{checkpoint_name}_{args.encoder}.pth', map_location='cpu'), strict=True)
+    video_depth_anything = video_depth_anything.to(DEVICE).eval()
+
+    # Get list of image files from folder
+    frame_files = sorted(glob.glob(os.path.join(args.input_video, '*.jpg')))
+    total_frames = len(frame_files)
+
+    if total_frames == 0:
+        raise ValueError(f"No .jpg files found in {args.input_video}")
+
+    # Read first frame to get dimensions
+    first_frame = cv2.imread(frame_files[0])
+    original_height, original_width = first_frame.shape[:2]
+
+    if args.max_res > 0 and max(original_height, original_width) > args.max_res:
+        scale = args.max_res / max(original_height, original_width)
+        height = round(original_height * scale)
+        width = round(original_width * scale)
+
+    # Image folders carry no fps metadata, so assume 30 FPS
+    original_fps = 30
+    fps = original_fps if args.target_fps < 0 else args.target_fps
+
+    stride = max(round(original_fps / fps), 1)
+
+    depths = []
+    start = time.time()
+
+    # Create output directory if it doesn't exist
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    # Get colormap for depth visualization
+    colormap = np.array(cm.get_cmap("inferno").colors)
+
+    for frame_count, frame_path in enumerate(frame_files):
+        if args.max_len > 0 and frame_count >= args.max_len:
+            break
+
+        if frame_count % stride == 0:
+            frame = cv2.imread(frame_path)
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
+
+            if args.max_res > 0 and max(original_height, original_width) > args.max_res:
+                frame = cv2.resize(frame, (width, height))  # Resize frame
+
+            # Infer depth
+            depth = video_depth_anything.infer_video_depth_one(frame, input_size=args.input_size, device=DEVICE, fp32=args.fp32)
+            depths.append(depth)
+
+            # Save individual depth frame as JPG with colormap
+            frame_name = os.path.basename(frame_path)
+            frame_name_no_ext = os.path.splitext(frame_name)[0]
+            depth_frame_path = os.path.join(args.output_dir, frame_name_no_ext + '_depth.jpg')
+
+            # Apply same colormap as video output, guarding against constant-depth frames
+            depth_normalized = ((depth - depth.min()) / max(depth.max() - depth.min(), 1e-6) * 255).astype(np.uint8)
+            if args.grayscale:
+                depth_colored = depth_normalized
+            else:
+                depth_colored = (colormap[depth_normalized] * 255).astype(np.uint8)
+
+            # Convert RGB to BGR for cv2.imwrite
+            if not args.grayscale:
+                depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_RGB2BGR)
+            cv2.imwrite(depth_frame_path, depth_colored)
+
+        if (frame_count + 1) % 50 == 0:
+            print(f"frame: {frame_count + 1}/{total_frames}")
+
+    end = time.time()
+    print(f"time: {end - start:.2f}s")
+
+    folder_name = os.path.basename(args.input_video.rstrip('/'))
+    depth_vis_path = os.path.join(args.output_dir, folder_name + '_vis.mp4')
+    depths = np.stack(depths, axis=0)
+    save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=args.grayscale)
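
Example invocation (a sketch; ./path/to/frames is a placeholder for a folder of .jpg frames, and it assumes the matching checkpoint, e.g. checkpoints/video_depth_anything_vitl.pth, has already been downloaded):

    python run_streaming_frames.py --input_video ./path/to/frames --output_dir ./outputs --encoder vitl

Per-frame depth visualizations are written to --output_dir as <frame>_depth.jpg, and a stitched <folder>_vis.mp4 is saved at the end.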