From 78460b34dd0e0bab6b9d5797443c80b1c5558b1f Mon Sep 17 00:00:00 2001
From: Zhao-Xu Luo
Date: Thu, 2 Apr 2026 04:51:18 +0800
Subject: [PATCH] Qualcomm AI Engine Direct - Add depth_anything_v2_small to oss_scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary
- Enable the **depth_anything_v2_small** model
  (https://github.com/DepthAnything/Depth-Anything-V2) on HTP and integrate
  the script into `oss_scripts/`.
- Provide a `--dump_example_output` flag to dump the example image and export
  depth-estimation images from both the source model and QNN outputs.
- Add unit tests to cover the added model.

Test plan
- Test with random images from ImageNet:
  `python examples/qualcomm/oss_scripts/depthanything_v2_small.py -a $ARTIFACT -d $IMAGENET_FOLDER_PATH -b build-android/ -H $HOST_NAME -s $DEVICE_ID -m $SOC_ID --seed 1126`
- Test with the example image and export the post-processed source model
  output and QNN output into depth-estimation images:
  `python examples/qualcomm/oss_scripts/depthanything_v2_small.py -a $ARTIFACT -d $IMAGENET_FOLDER_PATH -b build-android/ -H $HOST_NAME -s $DEVICE_ID -m $SOC_ID --dump_example_output`
---
 backends/qualcomm/tests/test_qnn_delegate.py  |  26 ++
 .../oss_scripts/depthanything_v2_small.py     | 265 ++++++++++++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 examples/qualcomm/oss_scripts/depthanything_v2_small.py

diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 4e23f43c2ea..72e91e48fc1 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -7147,6 +7147,32 @@ def test_deit(self):
                 self.assertGreaterEqual(msg["top_1"], 76)
                 self.assertGreaterEqual(msg["top_5"], 92)
 
+    def test_depthanything_v2_small(self):
+        if not self.required_envs([self.image_dataset]):
+            self.skipTest("missing required envs")
+
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/depthanything_v2_small.py",
+            "--dataset",
+            self.image_dataset,
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+        ]
+        self.add_default_cmds(cmds)
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                self.assertGreaterEqual(msg["sqnr"], 15)
+
     def test_dino_v2(self):
         if not self.required_envs([self.image_dataset]):
             self.skipTest("missing required envs")
diff --git a/examples/qualcomm/oss_scripts/depthanything_v2_small.py b/examples/qualcomm/oss_scripts/depthanything_v2_small.py
new file mode 100644
index 00000000000..f38f4d818b7
--- /dev/null
+++ b/examples/qualcomm/oss_scripts/depthanything_v2_small.py
@@ -0,0 +1,265 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import getpass
+import json
+import logging
+import os
+from multiprocessing.connection import Client
+
+import numpy as np
+import requests
+import torch
+from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.backends.qualcomm.serialization.qc_schema import (
+    QnnExecuTorchBackendType,
+)
+
+from executorch.examples.qualcomm.utils import (
+    build_executorch_binary,
+    get_backend_type,
+    get_imagenet_dataset,
+    make_output_dir,
+    parse_skip_delegation_node,
+    setup_common_args_and_variables,
+    SimpleADB,
+)
+from PIL import Image
+from torchao.quantization.utils import compute_error
+from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+from transformers.modeling_outputs import DepthEstimatorOutput
+
+HUGGING_FACE_DEPTHANYTHING_V2 = "depth-anything/Depth-Anything-V2-Small-hf"
+
+
+def postprocess_output_and_save(output, image_height, image_width, output_image_path):
+    """Convert a flattened depth prediction into an 8-bit image and save it.
+
+    Args:
+        output: Tensor holding image_height * image_width depth values.
+        image_height: Height used to un-flatten and post-process the output.
+        image_width: Width used to un-flatten and post-process the output.
+        output_image_path: Destination path of the saved depth image.
+    """
+    image_processor = AutoImageProcessor.from_pretrained(HUGGING_FACE_DEPTHANYTHING_V2)
+
+    post_processed_output = image_processor.post_process_depth_estimation(
+        # Restore the flattened output to (1, height, width); depth-estimation
+        # outputs are single-channel.
+        DepthEstimatorOutput(
+            predicted_depth=output.reshape(1, image_height, image_width)
+        ),
+        target_sizes=[(image_height, image_width)],
+    )
+
+    predicted_depth = post_processed_output[0]["predicted_depth"]
+    depth_min = predicted_depth.min()
+    depth_range = predicted_depth.max() - depth_min
+    if depth_range > 0:
+        depth = (predicted_depth - depth_min) / depth_range
+    else:
+        # A constant depth map would divide by zero; emit an all-black image.
+        depth = torch.zeros_like(predicted_depth)
+    depth = depth.detach().cpu().numpy() * 255
+    depth = Image.fromarray(depth.astype("uint8"))
+    depth.save(output_image_path)
+
+
+def main(args):
+    """Lower the model to QNN, run it through SimpleADB and report the mean SQNR.
+
+    Args:
+        args: Parsed command-line arguments of this script.
+
+    Raises:
+        RuntimeError: If both compile_only and pre_gen_pte are set.
+    """
+    if args.compile_only and args.pre_gen_pte:
+        raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true")
+
+    skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args)
+    os.makedirs(args.artifact, exist_ok=True)
+
+    model = AutoModelForDepthEstimation.from_pretrained(
+        HUGGING_FACE_DEPTHANYTHING_V2
+    ).eval()
+
+    data_num = 100
+    if args.ci:
+        data_num = 1
+        inputs = [(torch.rand(1, 3, 256, 256),)]
+        logging.warning(
+            "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy."
+        )
+    elif args.dump_example_output:
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        # Fail fast on network problems instead of hanging or decoding an error page.
+        response = requests.get(url, stream=True, timeout=60)
+        response.raise_for_status()
+        image = Image.open(response.raw)
+        image.save(os.path.join(args.artifact, "source.png"))
+        image_processor = AutoImageProcessor.from_pretrained(
+            HUGGING_FACE_DEPTHANYTHING_V2
+        )
+
+        pixel_values = image_processor(images=image, return_tensors="pt")[
+            "pixel_values"
+        ]
+        inputs = [(pixel_values,)]
+        data_num = 1
+    else:
+        inputs, _ = get_imagenet_dataset(
+            dataset_path=f"{args.dataset}",
+            data_size=data_num,
+            image_shape=(256, 256),
+        )
+
+    goldens = []
+    with torch.no_grad():
+        for per_input in inputs:
+            predicted_depth = model(*per_input).predicted_depth
+            goldens.append(predicted_depth.flatten())
+
+    pte_filename = "depthanything_v2_small_qnn"
+    # Resolved outside the compile branch: the on-device run below needs it
+    # even when a pre-generated PTE is used.
+    backend = get_backend_type(args.backend)
+    # Skip lowering/compilation if using pre-generated PTE
+    if not args.pre_gen_pte:
+        # Lower to QNN
+        quant_dtype = {
+            QnnExecuTorchBackendType.kGpuBackend: None,
+            QnnExecuTorchBackendType.kHtpBackend: QuantDtype.use_8a8w,
+        }[backend]
+        build_executorch_binary(
+            model,
+            inputs[0],
+            args.model,
+            os.path.join(args.artifact, pte_filename),
+            inputs,
+            skip_node_id_set=skip_node_id_set,
+            skip_node_op_set=skip_node_op_set,
+            quant_dtype=quant_dtype,
+            backend=backend,
+            shared_buffer=args.shared_buffer,
+            online_prepare=args.online_prepare,
+        )
+
+    if args.compile_only:
+        return
+
+    workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}"
+    pte_path = (
+        f"{args.pre_gen_pte}/{pte_filename}.pte"
+        if args.pre_gen_pte
+        else f"{args.artifact}/{pte_filename}.pte"
+    )
+
+    adb = SimpleADB(
+        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+        build_path=f"{args.build_folder}",
+        pte_path=pte_path,
+        workspace=workspace,
+        device_id=args.device,
+        host_id=args.host,
+        soc_model=args.model,
+        shared_buffer=args.shared_buffer,
+        target=args.target,
+    )
+    adb.push(inputs=inputs, backends={backend})
+    adb.execute()
+
+    # collect output data
+    output_data_folder = f"{args.artifact}/outputs"
+    make_output_dir(output_data_folder)
+
+    adb.pull(host_output_path=args.artifact)
+
+    evaluations = {
+        "sqnr": [],
+    }
+    for i, golden in enumerate(goldens):
+        prediction = torch.from_numpy(
+            np.fromfile(
+                os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32
+            )
+        )
+        evaluations["sqnr"].append(compute_error(golden, prediction))
+
+    if args.dump_example_output:
+        example_input_shape = list(inputs[0][0].shape)
+        image_height, image_width = example_input_shape[-2], example_input_shape[-1]
+
+        # Post-process source model output and export the depth estimation image
+        postprocess_output_and_save(
+            goldens[0],
+            image_height,
+            image_width,
+            os.path.join(args.artifact, "golden_depth.png"),
+        )
+        prediction = np.fromfile(
+            os.path.join(output_data_folder, "output_0_0.raw"), dtype=np.float32
+        )
+        # Post-process QNN output and export the depth estimation image
+        postprocess_output_and_save(
+            torch.from_numpy(prediction),
+            image_height,
+            image_width,
+            os.path.join(args.artifact, "prediction_depth.png"),
+        )
+
+    # compute_error returns a tensor; convert the mean to a plain float so it
+    # can be serialized by json.dumps.
+    evaluations["sqnr"] = float(sum(evaluations["sqnr"]) / len(goldens))
+    if args.ip and args.port != -1:
+        with Client((args.ip, args.port)) as conn:
+            conn.send(json.dumps({"sqnr": evaluations["sqnr"]}))
+    else:
+        print("SQNR(dB)={sqnr}".format(**evaluations))
+
+
+if __name__ == "__main__":
+    parser = setup_common_args_and_variables()
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts and output by this example. Default ./depthanything_v2_small",
+        default="./depthanything_v2_small",
+        type=str,
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset",
+        help=(
+            "path to the validation folder of ImageNet dataset. "
+            "e.g. --dataset imagenet-mini/val "
+            "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)"
+        ),
+        type=str,
+        required=False,
+    )
+    parser.add_argument(
+        "--dump_example_output",
+        help=(
+            "If specified, export the example image and post-process both the source model output "
+            "and the QNN output into depth-estimation images."
+        ),
+        action="store_true",
+        default=False,
+    )
+
+    args = parser.parse_args()
+    args.validate(args)
+
+    try:
+        main(args)
+    except Exception as e:
+        if args.ip and args.port != -1:
+            with Client((args.ip, args.port)) as conn:
+                conn.send(json.dumps({"Error": str(e)}))
+        else:
+            # Re-raise as-is to keep the original exception type and traceback.
+            raise