From 7e01f6052a37eecf3ed5c7ea6ff32ef129dbf1fc Mon Sep 17 00:00:00 2001 From: guotingxuan <117552024+guotingxuan@users.noreply.github.com> Date: Mon, 2 Mar 2026 19:16:14 +0800 Subject: [PATCH 1/4] add video analysis operator suite Integrated safety check, video cut, annotation, summary and classify for medical video data engineering. --- .../video_analysis_operator/__init__.py | 8 + .../video_analysis_operator/metadata.yml | 43 ++++++ .../mapper/video_analysis_operator/process.py | 141 ++++++++++++++++++ .../video_analysis_operator/requirements.txt | 17 +++ 4 files changed, 209 insertions(+) create mode 100644 runtime/ops/mapper/video_analysis_operator/__init__.py create mode 100644 runtime/ops/mapper/video_analysis_operator/metadata.yml create mode 100644 runtime/ops/mapper/video_analysis_operator/process.py create mode 100644 runtime/ops/mapper/video_analysis_operator/requirements.txt diff --git a/runtime/ops/mapper/video_analysis_operator/__init__.py b/runtime/ops/mapper/video_analysis_operator/__init__.py new file mode 100644 index 00000000..6227d434 --- /dev/null +++ b/runtime/ops/mapper/video_analysis_operator/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +from datamate.core.base_op import OPERATORS + +# 注册该算子,确保 path 指向当前文件夹名 video_analysis_operator +OPERATORS.register_module( + module_name='VideoAnalysisOperator', + module_path="ops.user.video_analysis_operator.process" +) \ No newline at end of file diff --git a/runtime/ops/mapper/video_analysis_operator/metadata.yml b/runtime/ops/mapper/video_analysis_operator/metadata.yml new file mode 100644 index 00000000..789e8bb0 --- /dev/null +++ b/runtime/ops/mapper/video_analysis_operator/metadata.yml @@ -0,0 +1,43 @@ +name: '视频处理套件' +name_en: 'video_analysis_operator' +description: '集成安全检查、视频切除、视频标注、摘要生成及视频分类的视频工程处理套件' +description_en: 'An integrated suite for safety check, video cutting, annotation, summary, and classification for medical video engineering.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoAnalysisOperator' +version: '1.0.0' +modal: 'video' +types: + - 'cleaning' + - 'annotation' +inputs: 'video' +outputs: 'video' + +runtime: + cpu: '4.0' + memory: '32212254720' + npu: '1.0' + storage: '30GB' + +settings: + - name: '任务选择' + raw_id: 'taskType' + type: 'select' + defaultVal: 'safety_check' + options: + - label: '安全检查' + value: 'safety_check' + - label: '视频切除' + value: 'video_cut' + - label: '视频标注' + value: 'video_annot' + - label: '摘要生成' + value: 'summary_gen' + - label: '视频分类' + value: 'video_classify' + + - name: 'Qwen权重绝对路径' + description: '昇腾910B服务器上模型存放的物理路径' + raw_id: 'modelPath' + type: 'input' + defaultVal: '/mnt/nvme0n1/home/gtx/Video_Analysis_System/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct' \ No newline at end of file diff --git a/runtime/ops/mapper/video_analysis_operator/process.py b/runtime/ops/mapper/video_analysis_operator/process.py new file mode 100644 index 00000000..78adbcad --- /dev/null +++ b/runtime/ops/mapper/video_analysis_operator/process.py @@ -0,0 +1,141 @@ +import os +import sys +import subprocess +import re +import json +import cv2 +import numpy as np +import mindspore +from typing import Dict, Any +from datamate.core.base_op import Mapper + +# 全局变量实现单例模式,防止显存 OOM +GLOBAL_MODEL = None +GLOBAL_PROCESSOR = None + +class VideoAnalysisOperator(Mapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # 1. 环境路径修复 (来自 Cell 1 & 2) + self._setup_env() + + # 2. 从 UI 配置获取参数 + self.task_type = kwargs.get("taskType", "safety_check") + self.model_path = kwargs.get("modelPath", "/mnt/nvme0n1/home/gtx/Video_Analysis_System/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct") + + # 3. 初始化加载模型 + self._init_resources() + + # 4. 
任务 Prompt 库 (来自 Cell 3 & 4) + self.prompts = { + "audit": "分析视频中是否包含违规内容(色情/政治/国家领导人/战争/血腥)。仅返回区间数组如[[s,e]],若无则返[]。禁言。", + "summary": "请详细描述这个视频里发生的主要内容,以‘这个视频’开头,30字以内", + "classify": "请分析视频内容,将其归类为以下类别中的【唯一一个】:\n日常生活, 影视剧集, 音乐舞蹈, 游戏电竞, 动漫, 新闻, 教育, 科技, 财经, 体育, 美食, 时尚, 汽车, 萌宠, 健康, 风光, 三农, 监控, 广告, 其他\n只输出类别名称。", + "extreme": "你现在是时序审计员。任务:将视频划分为5个以上的连续时间段,描述动作变化。格式必须为:[开始, 结束] 关键词。" + } + + def _setup_env(self): + """路径穿透与修复""" + STD_PATH = "/mnt/nvme0n1/home/gtx/miniconda3/envs/video_ai/lib/python3.9/site-packages" + CANN_PATH = "/mnt/nvme0n1/home/gtx/my_env/cann/ascend-toolkit/8.3.RC2/python/site-packages" + for p in [STD_PATH, CANN_PATH]: + if os.path.exists(p) and p not in sys.path: + sys.path.insert(0, p) + + def _init_resources(self): + """单例加载 Qwen2.5-VL""" + global GLOBAL_MODEL, GLOBAL_PROCESSOR + if GLOBAL_MODEL is None: + from mindone.transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor + from mindspore import context + context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=0) + + GLOBAL_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained( + self.model_path, mindspore_dtype=mindspore.bfloat16, + trust_remote_code=False, local_files_only=True + ) + GLOBAL_PROCESSOR = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=False, local_files_only=True) + self.model = GLOBAL_MODEL + self.processor = GLOBAL_PROCESSOR + + # --- 通用工具集 --- + def _get_duration(self, path): + cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path] + return float(subprocess.run(cmd, stdout=subprocess.PIPE).stdout) + + def _read_frames(self, path, num_frames=8, start=None, end=None, res_limit=None): + cap = cv2.VideoCapture(path) + fps = cap.get(cv2.CAP_PROP_FPS) + total_v_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + s_idx = int(start * fps) if start is not None else 0 + e_idx = int(end * fps) if end is not None else total_v_frames + indices = 
np.linspace(s_idx, e_idx - 1, num_frames, dtype=int) + frames = [] + for idx in indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ret, frame = cap.read() + if ret: + if res_limit: + w, h = cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT) + scale = np.sqrt(res_limit / (w * h)) + frame = cv2.resize(frame, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_AREA) + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + cap.release() + return frames + + def _infer(self, frames, prompt, max_tokens=100): + messages = [{"role": "user", "content": [{"type": "video", "video": frames}, {"type": "text", "text": prompt}]}] + text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = self.processor(text=[text], videos=[frames], padding=True, return_tensors="ms", max_pixels=448*448) + for k, v in inputs.items(): + if isinstance(v, mindspore.Tensor) and v.dtype == mindspore.float32: + inputs[k] = v.astype(mindspore.bfloat16) + gen_ids = self.model.generate(**inputs, max_new_tokens=max_tokens) + return self.processor.batch_decode(gen_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0].strip() + + # --- 核心业务逻辑分流 --- + def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]: + video_path = sample.get('filePath') + if not video_path or not os.path.exists(video_path): return sample + + if self.task_type == "safety_check": + sample['audit_result'] = self._run_safety_audit(video_path) + elif self.task_type == "video_cut": + violations = self._run_safety_audit(video_path, return_raw=True) + if violations: + sample['filePath'] = self._run_physical_cut(video_path, violations) + elif self.task_type == "video_annot": + sample['annotation'] = self._run_extreme_annot(video_path) + elif self.task_type == "summary_gen": + frames = self._read_frames(video_path, num_frames=16) + sample['summary'] = self._infer(frames, self.prompts["summary"]) + elif self.task_type == "video_classify": + frames = 
self._read_frames(video_path, num_frames=8) + sample['category'] = self._infer(frames, self.prompts["classify"]) + + return sample + + def _run_safety_audit(self, path, return_raw=False): + """高精审计 (Cell 3)""" + duration = self._get_duration(path) + violations, curr = [], 0.0 + while curr < duration: + end = min(curr + 10, duration) + frames = self._read_frames(path, num_frames=12, start=curr, end=end) + res = self._infer(frames, self.prompts["audit"], max_tokens=40) + found = re.findall(r'\[\s*(\d+\.?\d*)\s*,\s*(\d+\.?\d*)\s*\]', res) + for s, e in found: violations.append([curr + float(s), curr + float(e)]) + curr += 8 + if return_raw: return violations + return f"发现{len(violations)}处违规" if violations else "安全" + + def _run_physical_cut(self, path, violations): + """FFMPEG 物理切除 (Cell 3)""" + out_path = path.replace(".mp4", "_cleaned.mp4") + # 构建 FFMPEG 保留区间逻辑... (此处填入你 Cell 3 的拼接代码) + return out_path + + def _run_extreme_annot(self, path): + """极速标注 (Cell 4)""" + frames = self._read_frames(path, num_frames=12, res_limit=128*128) + return self._infer(frames, self.prompts["extreme"], max_tokens=256) \ No newline at end of file diff --git a/runtime/ops/mapper/video_analysis_operator/requirements.txt b/runtime/ops/mapper/video_analysis_operator/requirements.txt new file mode 100644 index 00000000..5b985a81 --- /dev/null +++ b/runtime/ops/mapper/video_analysis_operator/requirements.txt @@ -0,0 +1,17 @@ +mindspore==2.4.1 +mindone==0.5.0 +transformers>=4.46.1 +tokenizers>=0.20.0 +jinja2 +safetensors +sentencepiece +opencv-python-headless>=4.9.0.80 +numpy<2.0,>=1.24.0 +Pillow +decord +attrs +decorator +psutil +cloudpickle +ml-dtypes +absl-py \ No newline at end of file From 586ef20769461a2c5f74d48d8104fd2c0e203ddd Mon Sep 17 00:00:00 2001 From: guotingxuan <117552024+guotingxuan@users.noreply.github.com> Date: Thu, 5 Mar 2026 22:07:54 +0800 Subject: [PATCH 2/4] Delete runtime/ops/mapper/video_analysis_operator directory remove old code --- 
.../video_analysis_operator/__init__.py | 8 - .../video_analysis_operator/metadata.yml | 43 ------ .../mapper/video_analysis_operator/process.py | 141 ------------------ .../video_analysis_operator/requirements.txt | 17 --- 4 files changed, 209 deletions(-) delete mode 100644 runtime/ops/mapper/video_analysis_operator/__init__.py delete mode 100644 runtime/ops/mapper/video_analysis_operator/metadata.yml delete mode 100644 runtime/ops/mapper/video_analysis_operator/process.py delete mode 100644 runtime/ops/mapper/video_analysis_operator/requirements.txt diff --git a/runtime/ops/mapper/video_analysis_operator/__init__.py b/runtime/ops/mapper/video_analysis_operator/__init__.py deleted file mode 100644 index 6227d434..00000000 --- a/runtime/ops/mapper/video_analysis_operator/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# -*- coding: utf-8 -*- -from datamate.core.base_op import OPERATORS - -# 注册该算子,确保 path 指向当前文件夹名 video_analysis_operator -OPERATORS.register_module( - module_name='VideoAnalysisOperator', - module_path="ops.user.video_analysis_operator.process" -) \ No newline at end of file diff --git a/runtime/ops/mapper/video_analysis_operator/metadata.yml b/runtime/ops/mapper/video_analysis_operator/metadata.yml deleted file mode 100644 index 789e8bb0..00000000 --- a/runtime/ops/mapper/video_analysis_operator/metadata.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: '视频处理套件' -name_en: 'video_analysis_operator' -description: '集成安全检查、视频切除、视频标注、摘要生成及视频分类的视频工程处理套件' -description_en: 'An integrated suite for safety check, video cutting, annotation, summary, and classification for medical video engineering.' 
-language: 'python' -vendor: 'huawei' -raw_id: 'VideoAnalysisOperator' -version: '1.0.0' -modal: 'video' -types: - - 'cleaning' - - 'annotation' -inputs: 'video' -outputs: 'video' - -runtime: - cpu: '4.0' - memory: '32212254720' - npu: '1.0' - storage: '30GB' - -settings: - - name: '任务选择' - raw_id: 'taskType' - type: 'select' - defaultVal: 'safety_check' - options: - - label: '安全检查' - value: 'safety_check' - - label: '视频切除' - value: 'video_cut' - - label: '视频标注' - value: 'video_annot' - - label: '摘要生成' - value: 'summary_gen' - - label: '视频分类' - value: 'video_classify' - - - name: 'Qwen权重绝对路径' - description: '昇腾910B服务器上模型存放的物理路径' - raw_id: 'modelPath' - type: 'input' - defaultVal: '/mnt/nvme0n1/home/gtx/Video_Analysis_System/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct' \ No newline at end of file diff --git a/runtime/ops/mapper/video_analysis_operator/process.py b/runtime/ops/mapper/video_analysis_operator/process.py deleted file mode 100644 index 78adbcad..00000000 --- a/runtime/ops/mapper/video_analysis_operator/process.py +++ /dev/null @@ -1,141 +0,0 @@ -import os -import sys -import subprocess -import re -import json -import cv2 -import numpy as np -import mindspore -from typing import Dict, Any -from datamate.core.base_op import Mapper - -# 全局变量实现单例模式,防止显存 OOM -GLOBAL_MODEL = None -GLOBAL_PROCESSOR = None - -class VideoAnalysisOperator(Mapper): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # 1. 环境路径修复 (来自 Cell 1 & 2) - self._setup_env() - - # 2. 从 UI 配置获取参数 - self.task_type = kwargs.get("taskType", "safety_check") - self.model_path = kwargs.get("modelPath", "/mnt/nvme0n1/home/gtx/Video_Analysis_System/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct") - - # 3. 初始化加载模型 - self._init_resources() - - # 4. 
任务 Prompt 库 (来自 Cell 3 & 4) - self.prompts = { - "audit": "分析视频中是否包含违规内容(色情/政治/国家领导人/战争/血腥)。仅返回区间数组如[[s,e]],若无则返[]。禁言。", - "summary": "请详细描述这个视频里发生的主要内容,以‘这个视频’开头,30字以内", - "classify": "请分析视频内容,将其归类为以下类别中的【唯一一个】:\n日常生活, 影视剧集, 音乐舞蹈, 游戏电竞, 动漫, 新闻, 教育, 科技, 财经, 体育, 美食, 时尚, 汽车, 萌宠, 健康, 风光, 三农, 监控, 广告, 其他\n只输出类别名称。", - "extreme": "你现在是时序审计员。任务:将视频划分为5个以上的连续时间段,描述动作变化。格式必须为:[开始, 结束] 关键词。" - } - - def _setup_env(self): - """路径穿透与修复""" - STD_PATH = "/mnt/nvme0n1/home/gtx/miniconda3/envs/video_ai/lib/python3.9/site-packages" - CANN_PATH = "/mnt/nvme0n1/home/gtx/my_env/cann/ascend-toolkit/8.3.RC2/python/site-packages" - for p in [STD_PATH, CANN_PATH]: - if os.path.exists(p) and p not in sys.path: - sys.path.insert(0, p) - - def _init_resources(self): - """单例加载 Qwen2.5-VL""" - global GLOBAL_MODEL, GLOBAL_PROCESSOR - if GLOBAL_MODEL is None: - from mindone.transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor - from mindspore import context - context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend", device_id=0) - - GLOBAL_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained( - self.model_path, mindspore_dtype=mindspore.bfloat16, - trust_remote_code=False, local_files_only=True - ) - GLOBAL_PROCESSOR = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=False, local_files_only=True) - self.model = GLOBAL_MODEL - self.processor = GLOBAL_PROCESSOR - - # --- 通用工具集 --- - def _get_duration(self, path): - cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path] - return float(subprocess.run(cmd, stdout=subprocess.PIPE).stdout) - - def _read_frames(self, path, num_frames=8, start=None, end=None, res_limit=None): - cap = cv2.VideoCapture(path) - fps = cap.get(cv2.CAP_PROP_FPS) - total_v_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - s_idx = int(start * fps) if start is not None else 0 - e_idx = int(end * fps) if end is not None else total_v_frames - indices = 
np.linspace(s_idx, e_idx - 1, num_frames, dtype=int) - frames = [] - for idx in indices: - cap.set(cv2.CAP_PROP_POS_FRAMES, idx) - ret, frame = cap.read() - if ret: - if res_limit: - w, h = cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT) - scale = np.sqrt(res_limit / (w * h)) - frame = cv2.resize(frame, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_AREA) - frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - cap.release() - return frames - - def _infer(self, frames, prompt, max_tokens=100): - messages = [{"role": "user", "content": [{"type": "video", "video": frames}, {"type": "text", "text": prompt}]}] - text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - inputs = self.processor(text=[text], videos=[frames], padding=True, return_tensors="ms", max_pixels=448*448) - for k, v in inputs.items(): - if isinstance(v, mindspore.Tensor) and v.dtype == mindspore.float32: - inputs[k] = v.astype(mindspore.bfloat16) - gen_ids = self.model.generate(**inputs, max_new_tokens=max_tokens) - return self.processor.batch_decode(gen_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0].strip() - - # --- 核心业务逻辑分流 --- - def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]: - video_path = sample.get('filePath') - if not video_path or not os.path.exists(video_path): return sample - - if self.task_type == "safety_check": - sample['audit_result'] = self._run_safety_audit(video_path) - elif self.task_type == "video_cut": - violations = self._run_safety_audit(video_path, return_raw=True) - if violations: - sample['filePath'] = self._run_physical_cut(video_path, violations) - elif self.task_type == "video_annot": - sample['annotation'] = self._run_extreme_annot(video_path) - elif self.task_type == "summary_gen": - frames = self._read_frames(video_path, num_frames=16) - sample['summary'] = self._infer(frames, self.prompts["summary"]) - elif self.task_type == "video_classify": - frames = 
self._read_frames(video_path, num_frames=8) - sample['category'] = self._infer(frames, self.prompts["classify"]) - - return sample - - def _run_safety_audit(self, path, return_raw=False): - """高精审计 (Cell 3)""" - duration = self._get_duration(path) - violations, curr = [], 0.0 - while curr < duration: - end = min(curr + 10, duration) - frames = self._read_frames(path, num_frames=12, start=curr, end=end) - res = self._infer(frames, self.prompts["audit"], max_tokens=40) - found = re.findall(r'\[\s*(\d+\.?\d*)\s*,\s*(\d+\.?\d*)\s*\]', res) - for s, e in found: violations.append([curr + float(s), curr + float(e)]) - curr += 8 - if return_raw: return violations - return f"发现{len(violations)}处违规" if violations else "安全" - - def _run_physical_cut(self, path, violations): - """FFMPEG 物理切除 (Cell 3)""" - out_path = path.replace(".mp4", "_cleaned.mp4") - # 构建 FFMPEG 保留区间逻辑... (此处填入你 Cell 3 的拼接代码) - return out_path - - def _run_extreme_annot(self, path): - """极速标注 (Cell 4)""" - frames = self._read_frames(path, num_frames=12, res_limit=128*128) - return self._infer(frames, self.prompts["extreme"], max_tokens=256) \ No newline at end of file diff --git a/runtime/ops/mapper/video_analysis_operator/requirements.txt b/runtime/ops/mapper/video_analysis_operator/requirements.txt deleted file mode 100644 index 5b985a81..00000000 --- a/runtime/ops/mapper/video_analysis_operator/requirements.txt +++ /dev/null @@ -1,17 +0,0 @@ -mindspore==2.4.1 -mindone==0.5.0 -transformers>=4.46.1 -tokenizers>=0.20.0 -jinja2 -safetensors -sentencepiece -opencv-python-headless>=4.9.0.80 -numpy<2.0,>=1.24.0 -Pillow -decord -attrs -decorator -psutil -cloudpickle -ml-dtypes -absl-py \ No newline at end of file From aab5b6925ef14e17bf0bc20406f27b7cd07e4701 Mon Sep 17 00:00:00 2001 From: guotingxuan5599 <1321352073@qq.com> Date: Fri, 6 Mar 2026 00:26:14 +0800 Subject: [PATCH 3/4] feat: add video operators (mapper import + registration + metadata) --- runtime/ops/mapper/__init__.py | 17 +- 
runtime/ops/mapper/_video_common/__init__.py | 1 + runtime/ops/mapper/_video_common/ffmpeg.py | 117 ++++++ runtime/ops/mapper/_video_common/io_video.py | 13 + runtime/ops/mapper/_video_common/log.py | 23 ++ runtime/ops/mapper/_video_common/paths.py | 18 + runtime/ops/mapper/_video_common/schema.py | 9 + .../mapper/video_audio_extract/__init__.py | 6 + .../mapper/video_audio_extract/metadata.yml | 16 + .../ops/mapper/video_audio_extract/process.py | 81 ++++ .../mapper/video_classify_qwenvl/__init__.py | 6 + .../mapper/video_classify_qwenvl/metadata.yml | 16 + .../mapper/video_classify_qwenvl/process.py | 118 ++++++ .../mapper/video_deborder_crop/__init__.py | 6 + .../mapper/video_deborder_crop/metadata.yml | 16 + .../ops/mapper/video_deborder_crop/process.py | 212 ++++++++++ .../mapper/video_event_tag_qwenvl/__init__.py | 6 + .../video_event_tag_qwenvl/metadata.yml | 16 + .../mapper/video_event_tag_qwenvl/process.py | 121 ++++++ .../mapper/video_format_convert/__init__.py | 6 + .../mapper/video_format_convert/metadata.yml | 16 + .../mapper/video_format_convert/process.py | 97 +++++ .../mapper/video_keyframe_extract/__init__.py | 6 + .../video_keyframe_extract/metadata.yml | 16 + .../mapper/video_keyframe_extract/process.py | 229 +++++++++++ .../ops/mapper/video_mot_track/__init__.py | 6 + .../video_mot_track/configs/bytetrack.yaml | 7 + .../ops/mapper/video_mot_track/metadata.yml | 16 + runtime/ops/mapper/video_mot_track/process.py | 119 ++++++ .../mapper/video_sensitive_crop/__init__.py | 6 + .../mapper/video_sensitive_crop/metadata.yml | 16 + .../mapper/video_sensitive_crop/process.py | 150 +++++++ .../mapper/video_sensitive_detect/__init__.py | 6 + .../video_sensitive_detect/metadata.yml | 16 + .../mapper/video_sensitive_detect/process.py | 138 +++++++ .../ops/mapper/video_speech_asr/__init__.py | 6 + .../ops/mapper/video_speech_asr/metadata.yml | 16 + .../ops/mapper/video_speech_asr/process.py | 213 ++++++++++ .../ops/mapper/video_subject_crop/__init__.py | 6 + 
.../mapper/video_subject_crop/metadata.yml | 16 + .../ops/mapper/video_subject_crop/process.py | 175 ++++++++ .../ops/mapper/video_subtitle_ocr/__init__.py | 6 + .../mapper/video_subtitle_ocr/metadata.yml | 16 + .../ops/mapper/video_subtitle_ocr/process.py | 373 ++++++++++++++++++ .../mapper/video_summary_qwenvl/__init__.py | 6 + .../mapper/video_summary_qwenvl/metadata.yml | 16 + .../mapper/video_summary_qwenvl/process.py | 174 ++++++++ runtime/ops/mapper/video_text_ocr/__init__.py | 6 + .../ops/mapper/video_text_ocr/metadata.yml | 16 + runtime/ops/mapper/video_text_ocr/process.py | 254 ++++++++++++ 50 files changed, 2966 insertions(+), 1 deletion(-) create mode 100644 runtime/ops/mapper/_video_common/__init__.py create mode 100644 runtime/ops/mapper/_video_common/ffmpeg.py create mode 100644 runtime/ops/mapper/_video_common/io_video.py create mode 100644 runtime/ops/mapper/_video_common/log.py create mode 100644 runtime/ops/mapper/_video_common/paths.py create mode 100644 runtime/ops/mapper/_video_common/schema.py create mode 100644 runtime/ops/mapper/video_audio_extract/__init__.py create mode 100644 runtime/ops/mapper/video_audio_extract/metadata.yml create mode 100644 runtime/ops/mapper/video_audio_extract/process.py create mode 100644 runtime/ops/mapper/video_classify_qwenvl/__init__.py create mode 100644 runtime/ops/mapper/video_classify_qwenvl/metadata.yml create mode 100644 runtime/ops/mapper/video_classify_qwenvl/process.py create mode 100644 runtime/ops/mapper/video_deborder_crop/__init__.py create mode 100644 runtime/ops/mapper/video_deborder_crop/metadata.yml create mode 100644 runtime/ops/mapper/video_deborder_crop/process.py create mode 100644 runtime/ops/mapper/video_event_tag_qwenvl/__init__.py create mode 100644 runtime/ops/mapper/video_event_tag_qwenvl/metadata.yml create mode 100644 runtime/ops/mapper/video_event_tag_qwenvl/process.py create mode 100644 runtime/ops/mapper/video_format_convert/__init__.py create mode 100644 
runtime/ops/mapper/video_format_convert/metadata.yml create mode 100644 runtime/ops/mapper/video_format_convert/process.py create mode 100644 runtime/ops/mapper/video_keyframe_extract/__init__.py create mode 100644 runtime/ops/mapper/video_keyframe_extract/metadata.yml create mode 100644 runtime/ops/mapper/video_keyframe_extract/process.py create mode 100644 runtime/ops/mapper/video_mot_track/__init__.py create mode 100644 runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml create mode 100644 runtime/ops/mapper/video_mot_track/metadata.yml create mode 100644 runtime/ops/mapper/video_mot_track/process.py create mode 100644 runtime/ops/mapper/video_sensitive_crop/__init__.py create mode 100644 runtime/ops/mapper/video_sensitive_crop/metadata.yml create mode 100644 runtime/ops/mapper/video_sensitive_crop/process.py create mode 100644 runtime/ops/mapper/video_sensitive_detect/__init__.py create mode 100644 runtime/ops/mapper/video_sensitive_detect/metadata.yml create mode 100644 runtime/ops/mapper/video_sensitive_detect/process.py create mode 100644 runtime/ops/mapper/video_speech_asr/__init__.py create mode 100644 runtime/ops/mapper/video_speech_asr/metadata.yml create mode 100644 runtime/ops/mapper/video_speech_asr/process.py create mode 100644 runtime/ops/mapper/video_subject_crop/__init__.py create mode 100644 runtime/ops/mapper/video_subject_crop/metadata.yml create mode 100644 runtime/ops/mapper/video_subject_crop/process.py create mode 100644 runtime/ops/mapper/video_subtitle_ocr/__init__.py create mode 100644 runtime/ops/mapper/video_subtitle_ocr/metadata.yml create mode 100644 runtime/ops/mapper/video_subtitle_ocr/process.py create mode 100644 runtime/ops/mapper/video_summary_qwenvl/__init__.py create mode 100644 runtime/ops/mapper/video_summary_qwenvl/metadata.yml create mode 100644 runtime/ops/mapper/video_summary_qwenvl/process.py create mode 100644 runtime/ops/mapper/video_text_ocr/__init__.py create mode 100644 
runtime/ops/mapper/video_text_ocr/metadata.yml create mode 100644 runtime/ops/mapper/video_text_ocr/process.py diff --git a/runtime/ops/mapper/__init__.py b/runtime/ops/mapper/__init__.py index 4b970199..ed0a0fcb 100644 --- a/runtime/ops/mapper/__init__.py +++ b/runtime/ops/mapper/__init__.py @@ -47,6 +47,21 @@ def _import_operators(): from . import remove_duplicate_sentences from . import knowledge_relation_slice from . import pii_ner_detection - + # ===== Video operators (PR1-PR5) ===== + from . import _video_common + from . import video_format_convert + from . import video_sensitive_detect + from . import video_sensitive_crop + from . import video_mot_track + from . import video_subject_crop + from . import video_classify_qwenvl + from . import video_summary_qwenvl + from . import video_event_tag_qwenvl + from . import video_keyframe_extract + from . import video_deborder_crop + from . import video_audio_extract + from . import video_speech_asr + from . import video_subtitle_ocr + from . 
import video_text_ocr _import_operators() diff --git a/runtime/ops/mapper/_video_common/__init__.py b/runtime/ops/mapper/_video_common/__init__.py new file mode 100644 index 00000000..7c68785e --- /dev/null +++ b/runtime/ops/mapper/_video_common/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/ffmpeg.py b/runtime/ops/mapper/_video_common/ffmpeg.py new file mode 100644 index 00000000..c0340c04 --- /dev/null +++ b/runtime/ops/mapper/_video_common/ffmpeg.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +import os +import subprocess + +def run_cmd(cmd, logger=None): + if logger: + logger.info("FFmpeg cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + msg = f"FFmpeg failed (code={p.returncode}).\nSTDOUT:\n{p.stdout}\nSTDERR:\n{p.stderr}" + raise RuntimeError(msg) + return p.stdout, p.stderr + +def convert_to_mp4_h264( + in_path: str, + out_path: str, + crf: int = 23, + preset: str = "veryfast", + audio: bool = True, + fps: int = None, + scale: str = None, # e.g. "1280:720" or None + logger=None, +): + """ + 最通用的“交付格式”:mp4(H.264) + yuv420p + - crf 越小质量越高,体积越大(18~28常用) + - preset 越慢压缩越好但越耗时(veryfast/fast/medium) + """ + os.makedirs(os.path.dirname(out_path), exist_ok=True) + + cmd = ["ffmpeg", "-y", "-i", in_path] + + # 视频参数 + cmd += ["-c:v", "libx264", "-pix_fmt", "yuv420p", "-preset", preset, "-crf", str(crf)] + + # 可选 fps / scale + if fps is not None: + cmd += ["-r", str(int(fps))] + if scale is not None: + cmd += ["-vf", f"scale={scale}"] + + # 音频 + if audio: + cmd += ["-c:a", "aac", "-b:a", "128k"] + else: + cmd += ["-an"] + + cmd += [out_path] + return run_cmd(cmd, logger=logger) + +def transcode_any( + in_path: str, + out_path: str, + vcodec: str = "libx264", + acodec: str = "aac", + pix_fmt: str = "yuv420p", + crf: int = 23, + preset: str = "veryfast", + vbitrate: str = None, # e.g. 
"2M" + abitrate: str = "128k", + fps: int = None, + scale: str = None, # e.g. "1280:720" + extra_args: list = None, + logger=None, +): + """ + 通用转码:支持任意容器/编码器组合 + - vcodec/acodec 支持 'copy'(封装重打包或直接流拷贝) + - out_path 后缀决定容器格式:.mp4/.mkv/.mov/.avi/.wmv... + """ + os.makedirs(os.path.dirname(out_path), exist_ok=True) + cmd = ["ffmpeg", "-y", "-i", in_path] + + # video + cmd += ["-c:v", vcodec] + if vcodec != "copy": + cmd += ["-pix_fmt", pix_fmt] + if crf is not None: + cmd += ["-crf", str(crf)] + if preset: + cmd += ["-preset", preset] + if vbitrate: + cmd += ["-b:v", str(vbitrate)] + + # fps/scale + if fps is not None: + cmd += ["-r", str(int(fps))] + if scale is not None: + cmd += ["-vf", f"scale={scale}"] + + # audio + cmd += ["-c:a", acodec] + if acodec != "copy": + if abitrate: + cmd += ["-b:a", str(abitrate)] + + if extra_args: + cmd += list(extra_args) + + cmd += [out_path] + return run_cmd(cmd, logger=logger) + + + +def cut_segment(in_path: str, out_path: str, start: float, end: float, logger=None): + os.makedirs(os.path.dirname(out_path), exist_ok=True) + cmd = ["ffmpeg", "-y", "-ss", str(start), "-to", str(end), "-i", in_path, "-c", "copy", out_path] + return run_cmd(cmd, logger=logger) + +def concat_segments(segment_paths, out_path: str, logger=None): + os.makedirs(os.path.dirname(out_path), exist_ok=True) + list_file = out_path + ".txt" + with open(list_file, "w", encoding="utf-8") as f: + for p in segment_paths: + f.write(f"file '{os.path.abspath(p)}'\n") + cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", out_path] + return run_cmd(cmd, logger=logger) \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/io_video.py b/runtime/ops/mapper/_video_common/io_video.py new file mode 100644 index 00000000..787a9b6c --- /dev/null +++ b/runtime/ops/mapper/_video_common/io_video.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +import cv2 + +def get_video_info(video_path: str): + cap = cv2.VideoCapture(video_path) + 
if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + cap.release() + return fps, width, height, frames \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/log.py b/runtime/ops/mapper/_video_common/log.py new file mode 100644 index 00000000..a47e9d32 --- /dev/null +++ b/runtime/ops/mapper/_video_common/log.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +import logging +import os + +def get_logger(name: str, log_dir: str = None): + logger = logging.getLogger(name) + if logger.handlers: + return logger + + logger.setLevel(logging.INFO) + fmt = logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") + + sh = logging.StreamHandler() + sh.setFormatter(fmt) + logger.addHandler(sh) + + if log_dir: + os.makedirs(log_dir, exist_ok=True) + fh = logging.FileHandler(os.path.join(log_dir, "run.log"), encoding="utf-8") + fh.setFormatter(fmt) + logger.addHandler(fh) + + return logger \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/paths.py b/runtime/ops/mapper/_video_common/paths.py new file mode 100644 index 00000000..89591302 --- /dev/null +++ b/runtime/ops/mapper/_video_common/paths.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +import os +import time +import uuid + +def ensure_dir(p: str): + os.makedirs(p, exist_ok=True) + return p + +def make_run_dir(export_path: str, op_name: str): + """ + 统一输出目录:{export_path}/{op_name}/{timestamp_uuid}/ + """ + ts = time.strftime("%Y%m%d_%H%M%S") + run_id = f"{ts}_{uuid.uuid4().hex[:8]}" + out_dir = os.path.join(export_path, op_name, run_id) + ensure_dir(out_dir) + return out_dir \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/schema.py b/runtime/ops/mapper/_video_common/schema.py new file mode 100644 index 00000000..d566359e --- /dev/null +++ 
b/runtime/ops/mapper/_video_common/schema.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +def init_tracks_schema(video_path, fps, width, height): + return { + "video": video_path, + "fps": float(fps), + "width": int(width), + "height": int(height), + "frames": [] # {"frame_id": i, "objects":[{"track_id":..,"bbox":[..],...}]} + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_audio_extract/__init__.py b/runtime/ops/mapper/video_audio_extract/__init__.py new file mode 100644 index 00000000..674e260e --- /dev/null +++ b/runtime/ops/mapper/video_audio_extract/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoAudioExtract", + module_path="ops.mapper.video_audio_extract.process", +) diff --git a/runtime/ops/mapper/video_audio_extract/metadata.yml b/runtime/ops/mapper/video_audio_extract/metadata.yml new file mode 100644 index 00000000..6486b513 --- /dev/null +++ b/runtime/ops/mapper/video_audio_extract/metadata.yml @@ -0,0 +1,16 @@ +name: '视频抽取音频' +name_en: 'Video Audio Extract' +description: '从视频中抽取音频,默认输出 wav(16k/mono);也可输出 aac,并生成音频信息 audio_info.json。' +description_en: 'Extract audio from video, default wav (16k/mono); can output aac; also generates audio_info.json.' 
# -*- coding: utf-8 -*-
import os
import json
import shutil
import subprocess

from .._video_common.paths import make_run_dir, ensure_dir
from .._video_common.log import get_logger


class VideoAudioExtract:
    """Extract the audio track of a video (default wav, 16 kHz mono).

    params:
        - ffmpeg_path: str, optional
        - sample_rate: int, default 16000
        - channels: int, default 1
        - out_format: wav|aac, default wav

    outputs:
        - artifacts/audio.wav (or audio.aac)
        - artifacts/audio_info.json
    """

    @staticmethod
    def execute(sample, params):
        video_path = sample["filePath"]
        export_path = sample.get("export_path", "./outputs")

        op_name = "video_audio_extract"
        run_dir = make_run_dir(export_path, op_name)
        log_dir = ensure_dir(os.path.join(run_dir, "logs"))
        art_dir = ensure_dir(os.path.join(run_dir, "artifacts"))

        logger = get_logger(op_name, log_dir)
        logger.info(f"video={video_path}")
        logger.info(f"out_dir={run_dir}")

        ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg")
        if not ffmpeg_path:
            raise RuntimeError("ffmpeg not found. Please install ffmpeg or pass params.ffmpeg_path")

        sr = int(params.get("sample_rate", 16000))
        ch = int(params.get("channels", 1))
        out_format = (params.get("out_format", "wav") or "wav").lower()

        # Only the container extension and the audio codec differ between the
        # two output formats; the rest of the command line is shared.
        ext, codec = ("aac", "aac") if out_format == "aac" else ("wav", "pcm_s16le")
        audio_path = os.path.join(art_dir, f"audio.{ext}")
        cmd = [
            ffmpeg_path, "-hide_banner", "-y",
            "-i", video_path,
            "-vn",              # drop the video stream
            "-ac", str(ch),
            "-ar", str(sr),
            "-c:a", codec,
            audio_path,
        ]

        logger.info("FFmpeg cmd: " + " ".join(cmd))
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if proc.returncode != 0:
            raise RuntimeError(f"FFmpeg failed (code={proc.returncode}).\nSTDERR:\n{proc.stderr}")

        info = {"audio_path": audio_path, "sample_rate": sr, "channels": ch, "format": out_format}
        info_path = os.path.join(art_dir, "audio_info.json")
        with open(info_path, "w", encoding="utf-8") as f:
            json.dump(info, f, ensure_ascii=False, indent=2)

        logger.info(f"Done. audio={audio_path}")
        return {"out_dir": run_dir, "audio_path": audio_path, "audio_info": info_path}
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoClassifyQwenVL' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_classify_qwenvl/process.py b/runtime/ops/mapper/video_classify_qwenvl/process.py new file mode 100644 index 00000000..9638043b --- /dev/null +++ b/runtime/ops/mapper/video_classify_qwenvl/process.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +import os +import json +import collections +import cv2 +import importlib + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info + +_qwen = importlib.import_module("tools.qwen_sensitive") +qwenvl_infer = _qwen.qwenvl_infer + + +CLASS25 = [ + "日常生活", "影视剧集", "音乐舞蹈", "幽默搞笑", "游戏电竞", + "动漫二次元", "新闻时事", "教育教学", "科技数码", "财经商业", + "纪录片", "体育竞技", "美食烹饪", "时尚美妆", "汽车交通", + "萌宠动物", "健康健身", "自然风光", "三农", "监控安防", + "广告营销", "才艺展示", "军事国防", "情感心理", "其他" +] + + +def _sample_frame_indices(total_frames: int, fps: float, sample_fps: float, max_frames: int): + if total_frames <= 0: + return [] + sample_fps = float(sample_fps) + fps = float(fps) if fps else 25.0 + step = max(1, int(round(fps / max(sample_fps, 0.0001)))) + idxs = list(range(0, total_frames, step)) + if max_frames and len(idxs) > int(max_frames): + n = int(max_frames) + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + return idxs + + +class VideoClassifyQwenVL: + """视频分类(25类) + + 思路:抽帧 -> 调 QwenVL 服务 task=classify25 -> 多帧投票输出 top1 + + params: + - sample_fps: float, default 1.0 + - max_frames: int, default 12 + - return_topk: int, default 3 + """ + + @staticmethod + def execute(sample, params): + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + op_name = "video_classify_qwenvl" + out_dir = make_run_dir(export_path, op_name) + log_dir = 
class VideoClassifyQwenVL:
    """Classify a video into the 25-class taxonomy.

    Pipeline: sample frames -> call the QwenVL service (task=classify25) per
    frame -> majority-vote the per-frame labels into a top-1 result.

    params:
        - sample_fps: float, default 1.0
        - max_frames: int, default 12
        - return_topk: int, default 3
    """

    @staticmethod
    def execute(sample, params):
        video_path = sample["filePath"]
        export_path = sample.get("export_path", "./outputs")

        op_name = "video_classify_qwenvl"
        out_dir = make_run_dir(export_path, op_name)
        log_dir = ensure_dir(os.path.join(out_dir, "logs"))
        art_dir = ensure_dir(os.path.join(out_dir, "artifacts"))
        frames_dir = ensure_dir(os.path.join(art_dir, "frames"))

        logger = get_logger(op_name, log_dir)
        logger.info(f"video={video_path}")
        logger.info(f"out_dir={out_dir}")

        fps, w, h, total = get_video_info(video_path)
        sample_fps = float(params.get("sample_fps", 1.0))
        max_frames = int(params.get("max_frames", 12))
        return_topk = int(params.get("return_topk", 3))

        idxs = _sample_frame_indices(total, fps, sample_fps, max_frames)
        logger.info(f"fps={fps:.3f}, frames={total}, sample_fps={sample_fps}, idxs={len(idxs)}")

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: {video_path}")

        votes = []
        evidences = []
        for k, fi in enumerate(idxs):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(fi))
            ok, frame = cap.read()
            if not ok or frame is None:
                # A seek may land past the last decodable frame; skip it.
                continue

            jpg_path = os.path.join(frames_dir, f"frame_{int(fi):06d}.jpg")
            cv2.imwrite(jpg_path, frame)

            resp = qwenvl_infer(frame, task="classify25", timeout=180)
            cid = int(resp.get("class_id", 25) or 25)
            cname = resp.get("class_name", "其他") or "其他"

            votes.append(cname)
            evidences.append({"frame_id": int(fi), "jpg": jpg_path, "class_id": cid, "class_name": cname})
            logger.info(f"[{k+1}/{len(idxs)}] frame={fi} -> {cid}:{cname}")

        cap.release()

        def _name_to_id(name):
            # Class ids are 1-based positions in CLASS25; unknown -> 25 ("其他").
            return CLASS25.index(name) + 1 if name in CLASS25 else 25

        if votes:
            tally = collections.Counter(votes).most_common(max(1, return_topk))
            best_name, best_cnt = tally[0]
            result = {
                "top1": {"class_id": int(_name_to_id(best_name)), "class_name": best_name,
                         "score": float(best_cnt / len(votes))},
                "topk": [{"class_id": _name_to_id(nm), "class_name": nm,
                          "score": float(cnt / len(votes))} for nm, cnt in tally],
                "evidence": evidences,
                "meta": {"fps": float(fps), "width": int(w), "height": int(h), "total_frames": int(total)},
            }
        else:
            # No readable frame -> fall back to "其他" with zero confidence.
            result = {"top1": {"class_id": 25, "class_name": "其他", "score": 0.0}, "topk": [], "evidence": []}

        json_path = os.path.join(art_dir, "classification.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        logger.info(f"Done. classification_json={json_path}")
        return {"out_dir": out_dir, "classification_json": json_path, "top1": result["top1"]}
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoDeborderCrop' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'video' \ No newline at end of file diff --git a/runtime/ops/mapper/video_deborder_crop/process.py b/runtime/ops/mapper/video_deborder_crop/process.py new file mode 100644 index 00000000..02e00924 --- /dev/null +++ b/runtime/ops/mapper/video_deborder_crop/process.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +import os +import re +import json +import shutil +import subprocess +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger + + +@dataclass +class CropBox: + w: int + h: int + x: int + y: int + + def to_str(self) -> str: + return f"{self.w}:{self.h}:{self.x}:{self.y}" + + +def _even(x: int) -> int: + return x - (x % 2) + + +def _parse_cropdetect(stderr: str) -> List[CropBox]: + # ffmpeg cropdetect logs like: "crop=iw:ih:x:y" or "crop=1920:800:0:140" + boxes = [] + for line in stderr.splitlines(): + m = re.search(r"crop=(\d+):(\d+):(\d+):(\d+)", line) + if m: + w, h, x, y = map(int, m.groups()) + boxes.append(CropBox(w=w, h=h, x=x, y=y)) + return boxes + + +def _pick_box(boxes: List[CropBox], mode: str = "safe_keep_more") -> Optional[CropBox]: + """ + mode: + - safe_keep_more: 尽量少裁(更保守,避免误裁内容)=> 取 w/h 最大 + x/y 最小 + - aggressive_remove_more: 尽量多裁黑边 => 取 w/h 最小 + x/y 最大 + - median: 取中位数 + """ + if not boxes: + return None + + ws = sorted(b.w for b in boxes) + hs = sorted(b.h for b in boxes) + xs = sorted(b.x for b in boxes) + ys = sorted(b.y for b in boxes) + + if mode == "aggressive_remove_more": + w, h, x, y = min(ws), min(hs), max(xs), max(ys) + elif mode == "median": + mid = len(ws) // 2 + w, h, x, y = ws[mid], hs[mid], xs[mid], ys[mid] + else: + # 默认:尽量少裁,避免裁掉内容 + w, h, x, y = max(ws), max(hs), min(xs), min(ys) + + # crop 参数通常要求偶数(编码器/像素格式更兼容) + return 
CropBox(w=_even(w), h=_even(h), x=_even(x), y=_even(y)) + + +def detect_crop_box( + ffmpeg_path: str, + video_path: str, + sample_points: List[Tuple[float, float]], + cropdetect: str, + logger, +) -> Optional[CropBox]: + """在多个时间点探测 crop,汇总后给出一个 crop box。""" + all_boxes: List[CropBox] = [] + for (ss, dur) in sample_points: + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-ss", f"{ss}", + "-i", video_path, + "-t", f"{dur}", + "-vf", cropdetect, + "-f", "null", "-" + ] + logger.info("cropdetect cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + # cropdetect 输出在 stderr;即使 returncode!=0 也可能有输出,所以不直接失败 + boxes = _parse_cropdetect(p.stderr) + if boxes: + # 取该段最后一个(通常更稳定) + all_boxes.append(boxes[-1]) + + # 汇总选择一个 box(默认保守:少裁) + return _pick_box(all_boxes, mode="safe_keep_more") + + +def crop_video( + ffmpeg_path: str, + video_path: str, + out_path: str, + crop: CropBox, + logger, + crf: int = 23, + preset: str = "veryfast", + audio_copy: bool = True, +): + # 裁剪会改变尺寸,必须重新编码视频;音频可以 copy + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", video_path, + "-vf", f"crop={crop.to_str()}", + "-c:v", "libx264", + "-preset", preset, + "-crf", str(crf), + "-pix_fmt", "yuv420p", + ] + if audio_copy: + cmd += ["-c:a", "copy"] + else: + cmd += ["-c:a", "aac", "-b:a", "128k"] + + cmd += [out_path] + + logger.info("crop cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"ffmpeg crop failed (code={p.returncode}).\nSTDERR:\n{p.stderr}") + + +class VideoDeborderCrop: + """去黑边(自动 cropdetect + crop) + + params: + - ffmpeg_path: str, optional + - cropdetect: str, default "cropdetect=24:16:0" + - sample_points: list, optional + 默认会采样 [(0,2),(5,2)];如果视频很短也没关系 + - force_crop: str, optional # 直接指定 "w:h:x:y" + - crf: int, default 23 + - preset: str, default "veryfast" + - audio_copy: bool, default True + + outputs: + - 
class VideoDeborderCrop:
    """Remove black borders (auto cropdetect + crop).

    params:
        - ffmpeg_path: str, optional
        - cropdetect: str, default "cropdetect=24:16:0"
        - sample_points: list of (start, duration), default [(0,2),(5,2)]
        - force_crop: str, optional ("w:h:x:y" to skip detection)
        - crf: int, default 23
        - preset: str, default "veryfast"
        - audio_copy: bool, default True

    outputs:
        - artifacts/deborder.mp4
        - artifacts/crop_params.json
    """

    @staticmethod
    def execute(sample, params):
        video_path = sample["filePath"]
        export_path = sample.get("export_path", "./outputs")

        op_name = "video_deborder_crop"
        out_dir = make_run_dir(export_path, op_name)
        log_dir = ensure_dir(os.path.join(out_dir, "logs"))
        art_dir = ensure_dir(os.path.join(out_dir, "artifacts"))
        logger = get_logger(op_name, log_dir)

        logger.info(f"video={video_path}")
        logger.info(f"out_dir={out_dir}")

        ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg")
        if not ffmpeg_path:
            raise RuntimeError("ffmpeg not found. Please install ffmpeg or pass params.ffmpeg_path")

        cropdetect = params.get("cropdetect", "cropdetect=24:16:0")
        force_crop = params.get("force_crop", None)
        crf = int(params.get("crf", 23))
        preset = params.get("preset", "veryfast")
        audio_copy = bool(params.get("audio_copy", True))

        # Default probe points: 2 s at the very start plus 2 s from t=5.
        sample_points = params.get("sample_points", None) or [(0.0, 2.0), (5.0, 2.0)]

        if force_crop:
            m = re.match(r"(\d+):(\d+):(\d+):(\d+)", str(force_crop))
            if not m:
                raise ValueError('force_crop must be "w:h:x:y"')
            w, h, x, y = map(int, m.groups())
            crop_box = CropBox(w=_even(w), h=_even(h), x=_even(x), y=_even(y))
        else:
            crop_box = detect_crop_box(ffmpeg_path, video_path, sample_points, cropdetect, logger)

        if not crop_box:
            # Detection found nothing -> pass the input through unchanged.
            logger.warning("cropdetect found nothing, keep original video.")
            crop_box = CropBox(w=0, h=0, x=0, y=0)

        out_mp4 = os.path.join(art_dir, "deborder.mp4")
        crop_json = os.path.join(art_dir, "crop_params.json")

        if crop_box.w == 0 or crop_box.h == 0:
            # Degenerate box -> plain stream copy, no cropping.
            cmd = [ffmpeg_path, "-hide_banner", "-y", "-i", video_path, "-c", "copy", out_mp4]
            logger.info("copy cmd: " + " ".join(cmd))
            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            if proc.returncode != 0:
                raise RuntimeError(f"ffmpeg copy failed (code={proc.returncode}).\nSTDERR:\n{proc.stderr}")
            info = {"mode": "copy", "crop": None, "out_mp4": out_mp4}
        else:
            crop_video(ffmpeg_path, video_path, out_mp4, crop_box, logger,
                       crf=crf, preset=preset, audio_copy=audio_copy)
            info = {"mode": "crop", "crop": crop_box.__dict__, "out_mp4": out_mp4}

        with open(crop_json, "w", encoding="utf-8") as f:
            json.dump(info, f, ensure_ascii=False, indent=2)

        logger.info(f"Done. deborder_mp4={out_mp4}")
        return {"out_dir": out_dir, "deborder_mp4": out_mp4, "crop_params_json": crop_json}
# -*- coding: utf-8 -*-
import os
import json
import cv2
import importlib

from .._video_common.paths import make_run_dir, ensure_dir
from .._video_common.log import get_logger
from .._video_common.io_video import get_video_info

# The QwenVL client lives under tools/; resolved once at import time.
_qwen = importlib.import_module("tools.qwen_sensitive")
qwenvl_infer = _qwen.qwenvl_infer


def _clamp(x, lo, hi):
    """Limit ``x`` to the closed interval [lo, hi]."""
    return max(lo, min(hi, x))


class VideoEventTagQwenVL:
    """Event annotation with adaptive segmentation.

    With the defaults, segment length scales with the video:
        -   8 s video -> ~4 segments  (~2 s each)
        - 120 s video -> ~12 segments (~10 s each)
        - 600 s video -> ~12 segments (~50 s each)

    params:
        - adaptive_segment: bool, default True
        - target_segments: int, default 12
        - min_segment_seconds: float, default 2.0
        - max_segment_seconds: float, default 60.0
        - segment_seconds: float, manual override when adaptive_segment=False
        - max_segments: int, default 60
        - max_new_tokens: int, default 32

    outputs:
        - artifacts/events.json: [{start, end, event, evidence:{frame_id, jpg}}]
    """

    @staticmethod
    def execute(sample, params):
        video_path = sample["filePath"]
        export_path = sample.get("export_path", "./outputs")

        op_name = "video_event_tag_qwenvl"
        out_dir = make_run_dir(export_path, op_name)
        log_dir = ensure_dir(os.path.join(out_dir, "logs"))
        art_dir = ensure_dir(os.path.join(out_dir, "artifacts"))
        frames_dir = ensure_dir(os.path.join(art_dir, "frames"))

        logger = get_logger(op_name, log_dir)
        logger.info(f"video={video_path}")
        logger.info(f"out_dir={out_dir}")

        fps, w, h, total = get_video_info(video_path)
        duration = (total / fps) if fps else 0.0

        adaptive = bool(params.get("adaptive_segment", True))
        target_segments = int(params.get("target_segments", 12))
        min_seg_s = float(params.get("min_segment_seconds", 2.0))
        max_seg_s = float(params.get("max_segment_seconds", 60.0))

        if adaptive:
            # seg_s = duration / target_segments, clamped to [min, max].
            seg_s = _clamp(duration / max(1, target_segments), min_seg_s, max_seg_s)
        else:
            seg_s = float(params.get("segment_seconds", 5.0))

        max_segments = int(params.get("max_segments", 60))
        max_new_tokens = int(params.get("max_new_tokens", 32))

        logger.info(
            f"fps={fps:.3f}, frames={total}, duration={duration:.2f}s, "
            f"adaptive={adaptive}, segment_seconds={seg_s:.2f}, target_segments={target_segments}"
        )

        events = []
        if duration > 0:
            nseg = min(int(duration // seg_s) + 1, max_segments)

            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                raise RuntimeError(f"Cannot open video: {video_path}")

            for i in range(nseg):
                start = i * seg_s
                end = min(duration, (i + 1) * seg_s)
                mid = (start + end) / 2.0
                frame_id = int(mid * fps)

                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ok, frame = cap.read()
                if not ok or frame is None:
                    # The mid-point may fall past the last decodable frame.
                    continue

                jpg_path = os.path.join(frames_dir, f"seg_{i:03d}_frame_{frame_id:06d}.jpg")
                cv2.imwrite(jpg_path, frame)

                resp = qwenvl_infer(frame, task="event_tag", max_new_tokens=max_new_tokens, timeout=180)
                ev = (resp.get("event") or "").strip()

                events.append({
                    "start": float(start),
                    "end": float(end),
                    "event": ev,
                    "evidence": {"frame_id": int(frame_id), "jpg": jpg_path},
                })
                logger.info(f"[{i+1}/{nseg}] {start:.2f}-{end:.2f} mid={mid:.2f}s -> {ev}")

            cap.release()

        json_path = os.path.join(art_dir, "events.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(events, f, ensure_ascii=False, indent=2)

        logger.info(f"Done. events_json={json_path}, segments={len(events)}")
        return {"out_dir": out_dir, "events_json": json_path, "count": len(events), "segment_seconds": float(seg_s)}
class VideoFormatConvert:
    """Container-only format conversion (no re-encoding).

    Implemented as an ffmpeg stream copy (``-c:v copy -c:a copy``); the
    output file extension selects the target container (mp4/mkv/mov/avi/...).

    input:
        sample["filePath"], sample["export_path"]

    params:
        - container: target container suffix (default "mp4")
        - out_name: output file name (default "converted.{container}")
        - copy_video: stream-copy the video (default True; False -> libx264)
        - copy_audio: stream-copy the audio (default True; False -> aac)
        - extra_args: extra ffmpeg argument list (optional)

    outputs:
        out_dir/converted.xxx, out_dir/convert_result.json, out_dir/run.log
    """

    def execute(self, sample: dict, params: dict = None):
        params = params or {}
        in_path = sample["filePath"]
        export_path = sample["export_path"]

        out_dir = make_run_dir(export_path, "video_format_convert")
        logger = get_logger("VideoFormatConvert", log_dir=out_dir)

        # Target container, normalised (".MP4" -> "mp4").
        container = str(params.get("container", "mp4")).lstrip(".").lower()
        out_name = params.get("out_name", f"converted.{container}")
        if not out_name.lower().endswith(f".{container}"):
            # Keep the extension consistent with the requested container.
            out_name = f"{out_name}.{container}"
        out_video = os.path.join(out_dir, out_name)

        copy_video = bool(params.get("copy_video", True))
        copy_audio = bool(params.get("copy_audio", True))
        extra_args = params.get("extra_args", None)  # list[str] or None

        logger.info(f"Start container convert (stream copy). in={in_path}, out={out_video}, container={container}")

        cmd = [
            "ffmpeg", "-y", "-i", in_path,
            "-c:v", "copy" if copy_video else "libx264",
            "-c:a", "copy" if copy_audio else "aac",
        ]

        # Optional pass-through args, e.g. ['-map', '0'] or ['-movflags', '+faststart'].
        if extra_args:
            if not isinstance(extra_args, list):
                raise ValueError("params['extra_args'] must be a list, e.g. ['-movflags', '+faststart']")
            cmd += extra_args
        cmd += [out_video]

        try:
            run_cmd(cmd, logger=logger)
        except Exception:
            # Stream copy fails when a codec is not allowed in the target container.
            logger.error("Container convert failed. This is usually due to codec/container incompatibility when using stream copy.")
            logger.error("You can either choose a different container, or enable re-encode (copy_video/copy_audio=False).")
            raise

        result = {
            "out_dir": out_dir,
            "input": in_path,
            "output_video": out_video,
            "mode": "stream_copy",
            "params": {
                "container": container,
                "out_name": out_name,
                "copy_video": copy_video,
                "copy_audio": copy_audio,
                "extra_args": extra_args,
            },
        }

        json_path = os.path.join(out_dir, "convert_result.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        logger.info(f"Done. output={out_video}")
        return result
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoKeyframeExtract' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'image' \ No newline at end of file diff --git a/runtime/ops/mapper/video_keyframe_extract/process.py b/runtime/ops/mapper/video_keyframe_extract/process.py new file mode 100644 index 00000000..b2b79551 --- /dev/null +++ b/runtime/ops/mapper/video_keyframe_extract/process.py @@ -0,0 +1,229 @@ +import os +import json +import shutil +import subprocess +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + + +def _run(cmd: List[str]) -> Tuple[int, str]: + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return p.returncode, (p.stderr or "") + (p.stdout or "") + + +def _ensure_dir(p: str): + os.makedirs(p, exist_ok=True) + + +def _list_jpgs(d: str) -> List[str]: + if not os.path.isdir(d): + return [] + xs = [os.path.join(d, x) for x in os.listdir(d) if x.lower().endswith(".jpg")] + xs.sort() + return xs + + +def _probe_duration(ffprobe_path: str, video_path: str) -> float: + # 尽量不用任何第三方库,直接 ffprobe + cmd = [ + ffprobe_path, "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + video_path + ] + rc, out = _run(cmd) + if rc != 0: + return 0.0 + try: + return float(out.strip().splitlines()[-1]) + except Exception: + return 0.0 + + +@dataclass +class KeyframeParams: + ffmpeg_path: str = "" + ffprobe_path: str = "" + scene_threshold: float = 0.3 + threshold_candidates: Optional[List[float]] = None + max_keyframes: int = 30 + min_interval_sec: float = 1.0 + always_include_first: bool = True + quality: int = 2 # -q:v + out_json_name: str = "keyframes.json" + + +class VideoKeyframeExtractLocal: + """ + 本地运行版:不依赖 datamate。 + 输出: + /artifacts/keyframes/cover.jpg (可选) + /artifacts/keyframes/%06d.jpg (scene 帧) + /artifacts/keyframes.json + """ + + def run(self, video_path: str, 
class VideoKeyframeExtractLocal:
    """Standalone keyframe extractor (no datamate dependency).

    Outputs:
        {out_dir}/artifacts/keyframes/cover.jpg   (optional, t=0)
        {out_dir}/artifacts/keyframes/%06d.jpg    (scene-change frames)
        {out_dir}/artifacts/keyframes.json
    """

    def run(self, video_path: str, out_dir: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Extract keyframes from ``video_path`` into ``out_dir``.

        Pipeline: optional cover frame at t=0 -> ffmpeg scene-change
        detection (several thresholds, strictest first) -> middle-frame
        fallback when no scene change fires -> interval filter + cap ->
        keyframes.json.
        """
        p = KeyframeParams(**(params or {}))

        ffmpeg = p.ffmpeg_path or shutil.which("ffmpeg")
        ffprobe = p.ffprobe_path or shutil.which("ffprobe")
        if not ffmpeg:
            raise RuntimeError("ffmpeg not found. Install ffmpeg or set ffmpeg_path.")
        if not ffprobe:
            raise RuntimeError("ffprobe not found. Install ffprobe or set ffprobe_path.")

        artifacts = os.path.join(out_dir, "artifacts")
        key_dir = os.path.join(artifacts, "keyframes")
        _ensure_dir(key_dir)

        duration = _probe_duration(ffprobe, video_path)
        outputs: List[Dict[str, Any]] = []

        def _grab_single(ss: str, dest: str) -> Tuple[bool, str]:
            # Extract exactly one frame at (pre-formatted) timestamp ``ss``.
            cmd = [
                ffmpeg, "-hide_banner", "-y",
                "-ss", ss,
                "-i", video_path,
                "-frames:v", "1",
                "-q:v", str(p.quality),
                "-vf", "format=yuvj420p",
                dest,
            ]
            rc, log = _run(cmd)
            return (rc == 0 and os.path.exists(dest)), log

        # 1) cover frame at t=0 (failure is non-fatal)
        cover_path = os.path.join(key_dir, "cover.jpg")
        if p.always_include_first:
            ok, _ = _grab_single("0", cover_path)
            if ok:
                outputs.append({"kind": "cover", "time_sec": 0.0, "path": cover_path})

        # 2) scene-change keyframes; retry with looser thresholds if none found
        thr_candidates = p.threshold_candidates or [p.scene_threshold, 0.2, 0.15, 0.1, 0.06]
        scene_files: List[str] = []
        used_thr: Optional[float] = None

        for thr in thr_candidates:
            # Wipe the previous attempt's scene output, keeping the cover.
            for f in _list_jpgs(key_dir):
                if os.path.basename(f) != "cover.jpg":
                    try:
                        os.remove(f)
                    except Exception:
                        pass

            vf = f"select='gt(scene,{thr})',format=yuvj420p"
            out_tpl = os.path.join(key_dir, "%06d.jpg")
            base_cmd = [
                ffmpeg, "-hide_banner", "-y",
                "-i", video_path,
                "-vf", vf,
                "-q:v", str(p.quality),
                "-frames:v", str(p.max_keyframes * 3),
            ]

            # New ffmpeg uses -fps_mode; fall back to -vsync for older builds.
            rc, log = _run(base_cmd + ["-fps_mode", "vfr", out_tpl])
            if rc != 0 and "Unrecognized option 'fps_mode'" in log:
                rc, log = _run(base_cmd + ["-vsync", "vfr", out_tpl])

            files = [f for f in _list_jpgs(key_dir) if os.path.basename(f) != "cover.jpg"]
            if files:
                scene_files = files
                used_thr = thr
                break

        # 3) fallback: no scene change detected at all -> take the middle frame
        if not scene_files:
            t = duration / 2.0 if duration > 0 else 0.0
            mid_path = os.path.join(key_dir, "000001.jpg")
            ok, log = _grab_single(f"{t}", mid_path)
            if not ok:
                raise RuntimeError(f"KeyframeExtractLocal failed: scene=0 and fallback midframe failed. log={log[-800:]}")
            scene_files = [mid_path]
            used_thr = None

        # 4) min-interval filter + max_keyframes cap.
        # Timestamps are estimated by spreading the frames uniformly over the
        # duration (no showinfo parsing) — good enough to drop over-dense frames.
        if duration > 0 and len(scene_files) > 1:
            kept: List[Tuple[float, str]] = []
            last_t = -1e9
            for i, f in enumerate(scene_files):
                t = (i / max(1, (len(scene_files) - 1))) * duration
                if t - last_t >= p.min_interval_sec:
                    kept.append((t, f))
                    last_t = t
                if len(kept) >= p.max_keyframes:
                    break
            outputs.extend({"kind": "scene", "time_sec": float(t), "path": f} for t, f in kept)
        else:
            outputs.extend({"kind": "scene", "time_sec": None, "path": f}
                           for f in scene_files[:p.max_keyframes])

        out_json = os.path.join(artifacts, p.out_json_name)
        payload = {
            "input": video_path,
            "out_dir": out_dir,
            "scene_threshold": p.scene_threshold,
            "used_scene_threshold": used_thr,
            "max_keyframes": p.max_keyframes,
            "min_interval_sec": p.min_interval_sec,
            "always_include_first": p.always_include_first,
            "keyframes": outputs,
        }
        with open(out_json, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

        return {
            "out_dir": out_dir,
            "keyframes_json": out_json,
            "keyframes_dir": key_dir,
        }
ap.add_argument("--scene_threshold", type=float, default=0.15) + ap.add_argument("--max_keyframes", type=int, default=30) + ap.add_argument("--min_interval_sec", type=float, default=1.0) + ap.add_argument("--always_include_first", action="store_true") + args = ap.parse_args() + + runner = VideoKeyframeExtractLocal() + res = runner.run( + video_path=args.video, + out_dir=args.out_dir, + params={ + "scene_threshold": args.scene_threshold, + "max_keyframes": args.max_keyframes, + "min_interval_sec": args.min_interval_sec, + "always_include_first": bool(args.always_include_first), + }, + ) + print(json.dumps(res, ensure_ascii=False, indent=2)) \ No newline at end of file diff --git a/runtime/ops/mapper/video_mot_track/__init__.py b/runtime/ops/mapper/video_mot_track/__init__.py new file mode 100644 index 00000000..9d9954c8 --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoMotTrack", + module_path="ops.mapper.video_mot_track.process", +) diff --git a/runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml b/runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml new file mode 100644 index 00000000..c8cdead2 --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml @@ -0,0 +1,7 @@ +tracker_type: bytetrack +track_high_thresh: 0.25 +track_low_thresh: 0.1 +new_track_thresh: 0.25 +track_buffer: 30 +match_thresh: 0.8 +fuse_score: True \ No newline at end of file diff --git a/runtime/ops/mapper/video_mot_track/metadata.yml b/runtime/ops/mapper/video_mot_track/metadata.yml new file mode 100644 index 00000000..24c0ff52 --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/metadata.yml @@ -0,0 +1,16 @@ +name: '多目标跟踪' +name_en: 'Video MOT Track' +description: '基于检测+跟踪生成轨迹文件 tracks.json,并输出 debug.mp4 用于可视化验收。' +description_en: 'Run detection+tracking to generate tracks.json and debug.mp4 for visualization.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoMotTrack' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_mot_track/process.py b/runtime/ops/mapper/video_mot_track/process.py new file mode 100644 index 00000000..fb82e9cf --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/process.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 +from ultralytics import YOLO + +from .._video_common.io_video import get_video_info +from .._video_common.schema import init_tracks_schema +from .._video_common.paths import make_run_dir +from .._video_common.log import get_logger + +def _draw_tracks(frame, objects): + for obj in objects: + x1, y1, x2, y2 = map(int, obj["bbox"]) + tid = obj["track_id"] + score = obj.get("score", 0.0) + cls_id = obj.get("cls_id", -1) + cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2) + text = f"id={tid} cls={cls_id} {score:.2f}" + cv2.putText(frame, text, (x1, max(0, y1 - 5)), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2) + return frame + +class VideoMotTrack: + """ + 多目标追踪算子: + 输入: sample["filePath"], sample["export_path"] + 输出: tracks.json + debug.mp4 + """ + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample["export_path"] + + out_dir = make_run_dir(export_path, "video_mot_track") + logger = get_logger("VideoMotTrack", log_dir=out_dir) + + # 让 ultralytics 配置可写(避免 warning) + os.environ.setdefault("YOLO_CONFIG_DIR", os.path.join(out_dir, ".ultralytics")) + os.makedirs(os.environ["YOLO_CONFIG_DIR"], exist_ok=True) + + # 默认使用算子包内置权重(离线环境不触发下载) + default_weight = os.path.join(os.path.dirname(__file__), "weights", "yolov8n.pt") + yolo_model = params.get("yolo_model", default_weight) + + conf = float(params.get("conf", 0.3)) + iou = float(params.get("iou", 0.5)) + classes = 
params.get("classes", None) # "0,2,3" or None + tracker_cfg = params.get("tracker_cfg", os.path.join(os.path.dirname(__file__), "configs/bytetrack.yaml")) + save_debug = bool(params.get("save_debug", True)) + + cls_list = None + if classes: + cls_list = [int(x.strip()) for x in str(classes).split(",") if x.strip() != ""] + + fps, W, H, _ = get_video_info(video_path) + tracks = init_tracks_schema(video_path, fps, W, H) + + debug_path = os.path.join(out_dir, "debug.mp4") + debug_writer = None + if save_debug: + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + debug_writer = cv2.VideoWriter(debug_path, fourcc, fps, (W, H)) + + logger.info(f"Start tracking. video={video_path}, model={yolo_model}, conf={conf}, iou={iou}, classes={classes}") + + model = YOLO(yolo_model) + results_iter = model.track( + source=video_path, + conf=conf, + iou=iou, + classes=cls_list, + tracker=tracker_cfg, + persist=True, + verbose=False, + stream=True, + ) + + for frame_id, r in enumerate(results_iter): + frame = r.orig_img + objs = [] + + if r.boxes is not None and r.boxes.id is not None: + xyxy = r.boxes.xyxy.cpu().numpy() + confs = r.boxes.conf.cpu().numpy() + clss = r.boxes.cls.cpu().numpy().astype(int) + tids = r.boxes.id.cpu().numpy().astype(int) + + for box, s, c, tid in zip(xyxy, confs, clss, tids): + x1, y1, x2, y2 = box.tolist() + objs.append({ + "track_id": int(tid), + "bbox": [float(x1), float(y1), float(x2), float(y2)], + "score": float(s), + "cls_id": int(c), + }) + + tracks["frames"].append({"frame_id": frame_id, "objects": objs}) + + if debug_writer is not None: + vis = frame.copy() + vis = _draw_tracks(vis, objs) + debug_writer.write(vis) + + if debug_writer is not None: + debug_writer.release() + + json_path = os.path.join(out_dir, "tracks.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(tracks, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
tracks_json={json_path}, debug={debug_path if save_debug else None}") + + # 返回给 runner + return { + "out_dir": out_dir, + "tracks_json": json_path, + "debug_video": debug_path if save_debug else None, + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_crop/__init__.py b/runtime/ops/mapper/video_sensitive_crop/__init__.py new file mode 100644 index 00000000..ff8912df --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_crop/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSensitiveCrop", + module_path="ops.mapper.video_sensitive_crop.process", +) diff --git a/runtime/ops/mapper/video_sensitive_crop/metadata.yml b/runtime/ops/mapper/video_sensitive_crop/metadata.yml new file mode 100644 index 00000000..40398330 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_crop/metadata.yml @@ -0,0 +1,16 @@ +name: '视频敏感裁剪' +name_en: 'Video Sensitive Crop' +description: '根据敏感片段 JSON 裁剪/清洗输出 cleaned.mp4,并生成 crop_result.json。' +description_en: 'Crop/clean video based on sensitive segments JSON; outputs cleaned.mp4 and crop_result.json.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSensitiveCrop' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'video' \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_crop/process.py b/runtime/ops/mapper/video_sensitive_crop/process.py new file mode 100644 index 00000000..154c1f37 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_crop/process.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +import os +import json + +from .._video_common.paths import make_run_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from .._video_common.ffmpeg import cut_segment, concat_segments +from ..video_sensitive_detect.process import VideoSensitiveDetect + + +def complement_intervals(segments, duration): + if not segments: + return [[0.0, duration]] + + segs = sorted([(float(x["start"]), float(x["end"])) for x in segments], key=lambda x: x[0]) + + # merge + merged = [] + cs, ce = segs[0] + for s, e in segs[1:]: + if s <= ce: + ce = max(ce, e) + else: + merged.append([cs, ce]) + cs, ce = s, e + merged.append([cs, ce]) + + keep = [] + prev = 0.0 + for s, e in merged: + s = max(0.0, min(duration, s)) + e = max(0.0, min(duration, e)) + if s > prev: + keep.append([prev, s]) + prev = max(prev, e) + if prev < duration: + keep.append([prev, duration]) + + return [[s, e] for s, e in keep if e - s >= 0.05] + + +class VideoSensitiveCrop: + """ + 敏感裁剪:默认 remove(剔除敏感段) + params: + - segments_json: 必填(video_sensitive_detect 输出) + - keep_mode: "remove" 或 "keep"(默认 remove) + - out_name: 默认 cleaned.mp4 + """ + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample["export_path"] + + out_dir = make_run_dir(export_path, "video_sensitive_crop") + logger = get_logger("VideoSensitiveCrop", log_dir=out_dir) + + + segments_json = params.get("segments_json", "") + # 如果没传 
segments_json,就自动先跑 VideoSensitiveDetect 生成 + if (not segments_json) or (not os.path.exists(segments_json)): + # detect_params 优先从 params["detect_params"] 读取;否则从当前 params 里抽取 detect 所需字段 + detect_params = params.get("detect_params", None) + if detect_params is None: + detect_keys = ["qwen_module", "qwen_func", "sample_fps", "threshold", "merge_gap", "prompt"] + detect_params = {k: params[k] for k in detect_keys if k in params} + + # VideoSensitiveDetect 里 qwen_module 是必填的;没给就明确报错(避免你后面裁剪时不知道为什么没生成) + if "qwen_module" not in detect_params: + raise RuntimeError( + "VideoSensitiveCrop: segments_json not provided, and detect_params missing required 'qwen_module'. " + "Please pass params['qwen_module'] (and optional qwen_func/sample_fps/threshold/merge_gap)." + ) + + logger.info("segments_json not provided; run VideoSensitiveDetect first to generate sensitive_segments.json") + det_out = VideoSensitiveDetect().execute(sample, detect_params) + + # 兼容不同返回 key:尽量从 det_out 中找出 json 路径 + for key in [ + "segments_json", + "sensitive_segments_json", + "sensitive_segments_path", + "json_path", + "output_json", + ]: + if key in det_out and det_out[key] and os.path.exists(det_out[key]): + segments_json = det_out[key] + break + + # 如果 detect 没把路径通过 return 带出来,就回退到 out_dir 默认文件名(你的 detect 默认写 sensitive_segments.json) + if (not segments_json) or (not os.path.exists(segments_json)): + fallback = os.path.join(out_dir, "sensitive_segments.json") + if os.path.exists(fallback): + segments_json = fallback + + if (not segments_json) or (not os.path.exists(segments_json)): + raise RuntimeError("VideoSensitiveCrop: failed to obtain sensitive segments json from detect step.") + + + + + keep_mode = params.get("keep_mode", "remove") + out_name = params.get("out_name", "cleaned.mp4") + out_video = os.path.join(out_dir, out_name) + + det = json.load(open(segments_json, "r", encoding="utf-8")) + segments = det.get("segments", []) + + fps, W, H, nframes = get_video_info(video_path) + duration = 
nframes / float(fps) if fps > 0 else 0.0 + + if keep_mode == "remove": + keep_intervals = complement_intervals(segments, duration) + elif keep_mode == "keep": + keep_intervals = [[float(x["start"]), float(x["end"])] for x in segments] + else: + raise ValueError("keep_mode must be 'remove' or 'keep'") + + logger.info(f"Start crop. mode={keep_mode}, keep_intervals={len(keep_intervals)}, duration={duration:.2f}s") + + if not keep_intervals: + logger.info("No intervals to keep. Copy original as output.") + cut_segment(video_path, out_video, 0.0, duration, logger=logger) + else: + seg_dir = os.path.join(out_dir, "segments") + os.makedirs(seg_dir, exist_ok=True) + + seg_files = [] + for i, (s, e) in enumerate(keep_intervals): + seg_path = os.path.join(seg_dir, f"seg_{i:04d}.mp4") + cut_segment(video_path, seg_path, s, e, logger=logger) + seg_files.append(seg_path) + + concat_segments(seg_files, out_video, logger=logger) + + result = { + "out_dir": out_dir, + "input": video_path, + "segments_json": segments_json, + "keep_mode": keep_mode, + "output_video": out_video, + "kept_intervals": keep_intervals, + } + json_path = os.path.join(out_dir, "crop_result.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
output={out_video}") + return result \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_detect/__init__.py b/runtime/ops/mapper/video_sensitive_detect/__init__.py new file mode 100644 index 00000000..c03c3c42 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_detect/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSensitiveDetect", + module_path="ops.mapper.video_sensitive_detect.process", +) diff --git a/runtime/ops/mapper/video_sensitive_detect/metadata.yml b/runtime/ops/mapper/video_sensitive_detect/metadata.yml new file mode 100644 index 00000000..7f51a044 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_detect/metadata.yml @@ -0,0 +1,16 @@ +name: '视频敏感检测' +name_en: 'Video Sensitive Detect' +description: '抽帧+Qwen判定生成敏感片段 sensitive_segments.json(需要提供 qwen_module/qwen_func)。' +description_en: 'Sample frames and call Qwen inference to generate sensitive_segments.json (requires qwen_module/qwen_func).' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSensitiveDetect' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_detect/process.py b/runtime/ops/mapper/video_sensitive_detect/process.py new file mode 100644 index 00000000..fb665676 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_detect/process.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +import os +import json +import importlib +import cv2 + +from .._video_common.paths import make_run_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info + + +def merge_times_to_segments(times, gap=1.5): + if not times: + return [] + times = sorted(times) + segs = [] + s = times[0] + prev = times[0] + for t in times[1:]: + if t - prev <= gap: + prev = t + else: + segs.append([max(0.0, s - 0.5), prev + 0.5]) + s = t + prev = t + segs.append([max(0.0, s - 0.5), prev + 0.5]) + return segs + + +class VideoSensitiveDetect: + """ + 简单敏感检测(抽帧 + Qwen判定): + 必填 params: + - qwen_module: 例如 "tools.qwen_sensitive" + - qwen_func: 例如 "sensitive_infer" + + 可选 params: + - sample_fps: 默认 1 + - threshold: 默认 0.5 + - merge_gap: 默认 1.5 + - prompt: 要求模型输出 JSON 的提示词 + """ + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample["export_path"] + + out_dir = make_run_dir(export_path, "video_sensitive_detect") + logger = get_logger("VideoSensitiveDetect", log_dir=out_dir) + + qwen_module = params.get("qwen_module") + qwen_func = params.get("qwen_func", "sensitive_infer") + if not qwen_module: + raise RuntimeError("params['qwen_module'] is required, e.g. 
'tools.qwen_sensitive'") + + mod = importlib.import_module(qwen_module) + if not hasattr(mod, qwen_func): + raise RuntimeError(f"Function '{qwen_func}' not found in module '{qwen_module}'") + infer_fn = getattr(mod, qwen_func) + + sample_fps = float(params.get("sample_fps", 1.0)) + threshold = float(params.get("threshold", 0.5)) + merge_gap = float(params.get("merge_gap", 1.5)) + prompt = params.get( + "prompt", + "内容审核:若画面含色情/裸露、明显血腥流血、尸体、爆炸火光、持枪打斗、恐怖暴力、政治人物/政治标语,则判违规。" + "只输出JSON:{\"is_sensitive\":true/false,\"label\":\"porn|violence|blood|explosion|politics|none\",\"score\":0~1,\"reason\":\"一句话\"}" + ) + + fps, W, H, nframes = get_video_info(video_path) + step = max(1, int(round(fps / sample_fps))) + + logger.info(f"Start sensitive detect. video={video_path}, fps={fps}, sample_fps={sample_fps}, step={step}") + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + hits = [] + times = [] + + frame_id = 0 + while True: + ret, frame = cap.read() + if not ret: + break + + if frame_id % step != 0: + frame_id += 1 + continue + + t = frame_id / float(fps) + + try: + res = infer_fn(frame, prompt) + except Exception as e: + logger.error(f"infer failed at t={t:.2f}: {e}") + frame_id += 1 + continue + + is_sensitive = bool(res.get("is_sensitive", False)) + score = float(res.get("score", 0.0)) + label = str(res.get("label", "unknown")) + reason = str(res.get("reason", "")) + + hits.append({"time": t, "is_sensitive": is_sensitive, "score": score, "label": label, "reason": reason}) + + if is_sensitive and score >= threshold: + times.append(t) + + frame_id += 1 + + cap.release() + + segs = merge_times_to_segments(times, gap=merge_gap) + + result = { + "out_dir": out_dir, + "video": video_path, + "sample_fps": sample_fps, + "threshold": threshold, + "merge_gap": merge_gap, + "hits": hits, + "segments": [{"start": float(s), "end": float(e)} for s, e in segs], + } + + json_path = os.path.join(out_dir, 
"sensitive_segments.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. segments_json={json_path}, segments={len(segs)}, hits={len(hits)}") + + return { + "out_dir": out_dir, + "segments_json": json_path, + "segments_count": len(segs), + "hits_count": len(hits), + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_speech_asr/__init__.py b/runtime/ops/mapper/video_speech_asr/__init__.py new file mode 100644 index 00000000..cc00c40d --- /dev/null +++ b/runtime/ops/mapper/video_speech_asr/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSpeechASR", + module_path="ops.mapper.video_speech_asr.process", +) diff --git a/runtime/ops/mapper/video_speech_asr/metadata.yml b/runtime/ops/mapper/video_speech_asr/metadata.yml new file mode 100644 index 00000000..0847fa47 --- /dev/null +++ b/runtime/ops/mapper/video_speech_asr/metadata.yml @@ -0,0 +1,16 @@ +name: '语音识别ASR' +name_en: 'Video Speech ASR' +description: '从视频抽取音频并进行语音识别,输出 asr.json(可含时间戳);支持指定语言/模型规模等参数。' +description_en: 'Extract audio and run ASR, outputs asr.json (with timestamps); supports language/model options.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSpeechASR' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_speech_asr/process.py b/runtime/ops/mapper/video_speech_asr/process.py new file mode 100644 index 00000000..71169899 --- /dev/null +++ b/runtime/ops/mapper/video_speech_asr/process.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +import os +import json +import shutil +import subprocess +import re + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger + + +def _write_srt(segments, srt_path): + def _fmt(t): + h = int(t // 3600) + m = int((t % 3600) // 60) + s = int(t % 60) + ms = int(round((t - int(t)) * 1000)) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + with open(srt_path, "w", encoding="utf-8") as f: + for i, seg in enumerate(segments, 1): + f.write(str(i) + "\n") + f.write(f"{_fmt(seg['start'])} --> {_fmt(seg['end'])}\n") + f.write((seg.get("text") or "").strip() + "\n\n") + + +def _contains_cjk(s: str) -> bool: + return bool(re.search(r"[\u4e00-\u9fff]", s or "")) + + +def _to_simplified(text: str) -> str: + try: + from opencc import OpenCC + return OpenCC("t2s").convert(text) + except Exception: + return text + + +class VideoSpeechASR: + """语音转文字(优先 faster-whisper;失败自动回退 openai-whisper) + + params: + - ffmpeg_path: str, optional + - model: tiny|base|small|medium|large-v3, default small + - language: auto|zh|en, default zh + - beam_size: int, default 5 + - vad_filter: bool, default True + - compute_type: int8|int8_float16|float16|float32, default int8 + - sample_rate: int, default 16000 + - channels: int, default 1 + - max_audio_sec: float, optional + - zh_script: simplified|traditional|keep, default simplified + + # 离线/本地模型(faster-whisper) + - fw_model_path: str, optional # 本地模型路径(目录) + - fw_download_root: str, optional + - local_files_only: bool, default 
False + + outputs: + - artifacts/audio.wav + - artifacts/asr.json / asr.txt / asr.srt + - artifacts/asr_backend.json(记录用了哪个后端/异常信息) + """ + + @staticmethod + def execute(sample, params): + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + op_name = "video_speech_asr" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + + logger = get_logger(op_name, log_dir) + logger.info(f"video={video_path}") + logger.info(f"out_dir={out_dir}") + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found. Please install ffmpeg or pass params.ffmpeg_path") + + sr = int(params.get("sample_rate", 16000)) + ch = int(params.get("channels", 1)) + max_audio_sec = params.get("max_audio_sec", None) + max_audio_sec = float(max_audio_sec) if max_audio_sec is not None else None + + audio_path = os.path.join(art_dir, "audio.wav") + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", video_path, + "-vn", + "-ac", str(ch), + "-ar", str(sr), + "-c:a", "pcm_s16le", + ] + if max_audio_sec is not None and max_audio_sec > 0: + cmd += ["-t", f"{max_audio_sec}"] + cmd += [audio_path] + + logger.info("FFmpeg cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"FFmpeg failed (code={p.returncode}).\nSTDERR:\n{p.stderr}") + + model_name = (params.get("model", "small") or "small") + language = (params.get("language", "zh") or "zh").lower() + beam_size = int(params.get("beam_size", 5)) + vad_filter = bool(params.get("vad_filter", True)) + compute_type = (params.get("compute_type", "int8") or "int8") + zh_script = (params.get("zh_script", "simplified") or "simplified").lower() + + fw_model_path = params.get("fw_model_path", None) + fw_download_root = params.get("fw_download_root", None) + 
local_files_only = bool(params.get("local_files_only", False)) + + segments = [] + full_text = "" + backend_info = {"backend": None, "error": None} + + # ===== try faster-whisper ===== + try: + from faster_whisper import WhisperModel + backend_info["backend"] = "faster-whisper" + + # 离线策略:local_files_only 时,把 HF 的联网行为尽量关掉 + if local_files_only: + os.environ.setdefault("HF_HUB_OFFLINE", "1") + os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") + + model_id = fw_model_path or model_name + logger.info(f"[ASR] faster-whisper load model={model_id}, compute_type={compute_type}, offline={local_files_only}") + + fw = WhisperModel( + model_id, + device="cpu", + compute_type=compute_type, + download_root=fw_download_root, + ) + + logger.info("[ASR] faster-whisper transcribe start...") + seg_iter, info = fw.transcribe( + audio_path, + language=None if language == "auto" else language, + beam_size=beam_size, + vad_filter=vad_filter, + ) + for s in seg_iter: + segments.append({"start": float(s.start), "end": float(s.end), "text": (s.text or "").strip()}) + full_text = " ".join([s["text"] for s in segments]).strip() + logger.info("[ASR] faster-whisper transcribe done.") + + except Exception as e: + # ===== fallback openai-whisper ===== + backend_info["backend"] = "openai-whisper" + backend_info["error"] = f"faster-whisper failed: {repr(e)}" + logger.warning("[ASR] faster-whisper failed, fallback openai-whisper. reason=" + repr(e)) + + try: + import whisper + except Exception as e2: + raise RuntimeError("ASR backend failed. 
Please install: pip install faster-whisper openai-whisper") from e2 + + logger.info(f"[ASR] openai-whisper load model={model_name} (slow on CPU)") + wmodel = whisper.load_model(model_name) + + wargs = {"fp16": False, "verbose": False} + if language != "auto": + wargs["language"] = language + + logger.info("[ASR] openai-whisper transcribe start...") + result = wmodel.transcribe(audio_path, **wargs) + logger.info("[ASR] openai-whisper transcribe done.") + + for seg in result.get("segments", []): + segments.append({ + "start": float(seg.get("start", 0.0)), + "end": float(seg.get("end", 0.0)), + "text": (seg.get("text") or "").strip() + }) + full_text = (result.get("text") or "").strip() + + # 简体化 + if zh_script == "simplified": + if _contains_cjk(full_text): + full_text = _to_simplified(full_text) + for s in segments: + if _contains_cjk(s["text"]): + s["text"] = _to_simplified(s["text"]) + + json_path = os.path.join(art_dir, "asr.json") + txt_path = os.path.join(art_dir, "asr.txt") + srt_path = os.path.join(art_dir, "asr.srt") + backend_path = os.path.join(art_dir, "asr_backend.json") + + with open(json_path, "w", encoding="utf-8") as f: + json.dump({"text": full_text, "segments": segments}, f, ensure_ascii=False, indent=2) + with open(txt_path, "w", encoding="utf-8") as f: + f.write(full_text + "\n") + _write_srt(segments, srt_path) + + with open(backend_path, "w", encoding="utf-8") as f: + json.dump(backend_info, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
segments={len(segments)} asr_json={json_path}") + return { + "out_dir": out_dir, + "audio_wav": audio_path, + "asr_json": json_path, + "asr_txt": txt_path, + "asr_srt": srt_path, + "asr_backend": backend_path, + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_subject_crop/__init__.py b/runtime/ops/mapper/video_subject_crop/__init__.py new file mode 100644 index 00000000..b4bc44bd --- /dev/null +++ b/runtime/ops/mapper/video_subject_crop/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSubjectCrop", + module_path="ops.mapper.video_subject_crop.process", +) diff --git a/runtime/ops/mapper/video_subject_crop/metadata.yml b/runtime/ops/mapper/video_subject_crop/metadata.yml new file mode 100644 index 00000000..45107f9c --- /dev/null +++ b/runtime/ops/mapper/video_subject_crop/metadata.yml @@ -0,0 +1,16 @@ +name: '主体跟踪裁剪' +name_en: 'Video Subject Crop' +description: '根据 tracks.json 选择 Top1 主体轨迹并裁剪输出 subject.mp4,用于单主体验收链路。' +description_en: 'Select the top subject track from tracks.json and crop to output subject.mp4.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSubjectCrop' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'video' \ No newline at end of file diff --git a/runtime/ops/mapper/video_subject_crop/process.py b/runtime/ops/mapper/video_subject_crop/process.py new file mode 100644 index 00000000..fb870c15 --- /dev/null +++ b/runtime/ops/mapper/video_subject_crop/process.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 + +from .._video_common.paths import make_run_dir +from .._video_common.log import get_logger +from ..video_mot_track.process import VideoMotTrack + +def _bbox_area(b): + x1, y1, x2, y2 = b + return max(0.0, x2 - x1) * max(0.0, y2 - y1) + +def _select_top1_track(tracks: dict, min_frames: int = 10): + stats = {} # tid -> {"count":int, "area_sum":float} + for fr in tracks.get("frames", []): + for obj in fr.get("objects", []): + tid = int(obj["track_id"]) + area = _bbox_area(obj["bbox"]) + if tid not in stats: + stats[tid] = {"count": 0, "area_sum": 0.0} + stats[tid]["count"] += 1 + stats[tid]["area_sum"] += area + + items = [] + for tid, s in stats.items(): + if s["count"] < min_frames: + continue + avg_area = s["area_sum"] / max(1, s["count"]) + items.append((tid, s["count"], avg_area)) + + if not items: + return None + + items.sort(key=lambda x: (x[1], x[2]), reverse=True) + return int(items[0][0]) + +def _clamp(val, lo, hi): + return max(lo, min(hi, val)) + +def _ema(prev_bbox, bbox, alpha=0.8): + if prev_bbox is None: + return bbox + return [ + alpha*prev_bbox[0] + (1-alpha)*bbox[0], + alpha*prev_bbox[1] + (1-alpha)*bbox[1], + alpha*prev_bbox[2] + (1-alpha)*bbox[2], + alpha*prev_bbox[3] + (1-alpha)*bbox[3], + ] + +def _expand_bbox(bbox, margin, W, H): + x1, y1, x2, y2 = bbox + w = x2 - x1 + h = y2 - y1 + x1 = x1 - w * margin + y1 = y1 - h * margin + x2 = x2 + w * margin + y2 = y2 + h * margin + x1 = _clamp(int(x1), 0, W-1) + y1 = _clamp(int(y1), 
0, H-1) + x2 = _clamp(int(x2), 0, W-1) + y2 = _clamp(int(y2), 0, H-1) + if x2 <= x1: x2 = min(W-1, x1+1) + if y2 <= y1: y2 = min(H-1, y1+1) + return [x1, y1, x2, y2] + +class VideoSubjectCrop: + """ + 主体追踪裁剪(Top1): + 输入: + - sample["filePath"] + - sample["export_path"] + - params["tracks_json"] (可选:不提供就自动找同一次 run 的 tracks.json) + 输出: + - subjects/subject.mp4 + - subjects/subject_track_id.txt + """ + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample["export_path"] + + out_dir = make_run_dir(export_path, "video_subject_crop") + logger = get_logger("VideoSubjectCrop", log_dir=out_dir) + + tracks_json = params.get("tracks_json", None) + if (not tracks_json) or (not os.path.exists(tracks_json)): + # 自动跑 MOT 生成 tracks.json + mot_params = params.get("mot_params", {}) # 可选:把 mot 的参数也透传进来 + logger.info("tracks_json not provided; run VideoMotTrack first to generate tracks.json") + mot_out = VideoMotTrack().execute(sample, mot_params) + tracks_json = mot_out["tracks_json"] + + crop_size = int(params.get("crop_size", 512)) + margin = float(params.get("margin", 0.15)) + smooth_alpha = float(params.get("smooth_alpha", 0.8)) + min_frames = int(params.get("min_frames", 10)) + fill_missing = bool(params.get("fill_missing", False)) + + with open(tracks_json, "r", encoding="utf-8") as f: + tracks = json.load(f) + + fps = float(tracks["fps"]) + W = int(tracks["width"]) + H = int(tracks["height"]) + + subject_id = _select_top1_track(tracks, min_frames=min_frames) + if subject_id is None: + raise RuntimeError(f"No valid subject track found (min_frames={min_frames}).") + + subjects_dir = os.path.join(out_dir, "subjects") + os.makedirs(subjects_dir, exist_ok=True) + + with open(os.path.join(subjects_dir, "subject_track_id.txt"), "w", encoding="utf-8") as f: + f.write(str(subject_id)) + + out_video = os.path.join(subjects_dir, "subject.mp4") + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + 
raise RuntimeError(f"Cannot open video: {video_path}") + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(out_video, fourcc, fps, (crop_size, crop_size)) + + last_bbox = None + frame_id = 0 + + logger.info(f"Start subject crop. subject_id={subject_id}, tracks={tracks_json}") + + while True: + ret, frame = cap.read() + if not ret: + break + + bbox = None + if frame_id < len(tracks.get("frames", [])): + objs = tracks["frames"][frame_id].get("objects", []) + for obj in objs: + if int(obj["track_id"]) == int(subject_id): + bbox = obj["bbox"] + break + + if bbox is None: + if fill_missing and last_bbox is not None: + bbox_s = last_bbox + else: + frame_id += 1 + continue + else: + bbox_s = _ema(last_bbox, bbox, alpha=smooth_alpha) + last_bbox = bbox_s + + bbox_e = _expand_bbox(bbox_s, margin=margin, W=W, H=H) + x1, y1, x2, y2 = bbox_e + crop = frame[y1:y2, x1:x2] + if crop.size == 0: + frame_id += 1 + continue + + crop = cv2.resize(crop, (crop_size, crop_size), interpolation=cv2.INTER_LINEAR) + writer.write(crop) + + frame_id += 1 + + cap.release() + writer.release() + + logger.info(f"Done. 
subject_video={out_video}") + + return { + "out_dir": out_dir, + "subject_track_id": subject_id, + "subject_video": out_video, + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_subtitle_ocr/__init__.py b/runtime/ops/mapper/video_subtitle_ocr/__init__.py new file mode 100644 index 00000000..5460c2f8 --- /dev/null +++ b/runtime/ops/mapper/video_subtitle_ocr/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSubtitleOCR", + module_path="ops.mapper.video_subtitle_ocr.process", +) diff --git a/runtime/ops/mapper/video_subtitle_ocr/metadata.yml b/runtime/ops/mapper/video_subtitle_ocr/metadata.yml new file mode 100644 index 00000000..98948bdd --- /dev/null +++ b/runtime/ops/mapper/video_subtitle_ocr/metadata.yml @@ -0,0 +1,16 @@ +name: '视频字幕OCR提取' +name_en: 'Video Subtitle OCR' +description: '对视频底部字幕区域进行OCR识别,输出 subtitles.json 与 subtitles.srt;可选自动去黑边、抽帧、跳过相似帧、字幕去重合并、英文空格修复。' +description_en: 'OCR for bottom subtitles, outputs subtitles.json and subtitles.srt; optional deborder, sampling, frame skipping, merge, English spacing fix.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSubtitleOCR' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_subtitle_ocr/process.py b/runtime/ops/mapper/video_subtitle_ocr/process.py new file mode 100644 index 00000000..a97eb789 --- /dev/null +++ b/runtime/ops/mapper/video_subtitle_ocr/process.py @@ -0,0 +1,373 @@ +# -*- coding: utf-8 -*- +import os +import json +import re +import shutil +import subprocess +import cv2 +import numpy as np + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info + + +def _write_srt(segments, srt_path): + def _fmt(t): + h = int(t // 3600) + m = int((t % 3600) // 60) + s = int(t % 60) + ms = int(round((t - int(t)) * 1000)) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + with open(srt_path, "w", encoding="utf-8") as f: + for i, seg in enumerate(segments, 1): + f.write(str(i) + "\n") + f.write(f"{_fmt(seg['start'])} --> {_fmt(seg['end'])}\n") + f.write((seg.get("text") or "").strip() + "\n\n") + + +def _clean_text(t: str) -> str: + if not t: + return "" + t = t.strip() + t = re.sub(r"\s+", " ", t) + return t + + +def _english_ratio(text: str) -> float: + if not text: + return 0.0 + letters = sum(c.isalpha() for c in text) + return letters / max(1, len(text)) + + +def _fix_english_spacing(text: str) -> str: + """英文字幕空格修复(轻量规则,避免影响中文)""" + if not text: + return text + if _english_ratio(text) < 0.40: + return text + + t = text + + # 小写后接大写:ThisIs -> This Is + t = re.sub(r"([a-z])([A-Z])", r"\1 \2", t) + + # 字母数字边界:A1 / 1A + t = re.sub(r"([A-Za-z])(\d)", r"\1 \2", t) + t = re.sub(r"(\d)([A-Za-z])", r"\1 \2", t) + + # 标点前去空格,标点后若紧跟字母则补空格(保守) + t = re.sub(r"\s+([,.;:?!])", r"\1", t) + t = re.sub(r"([,.;:?!])([A-Za-z])", r"\1 \2", t) + + # 多空格压缩 + t = re.sub(r"\s+", " ", t).strip() + return t + 
+ +def _norm_sub_key(text: str) -> str: + """用于合并的规范化 key:空格归一、末尾标点归一、英文小写化""" + if not text: + return "" + t = text.strip() + t = re.sub(r"\s+", " ", t) + # 去掉末尾重复标点(中英文都考虑) + t = re.sub(r"[.。!?!?]+$", "", t).strip() + + # 英文占比高则统一小写,便于合并 + if _english_ratio(t) > 0.40: + t = t.lower() + + return t + + +def _roi_changed(cur_roi, last_roi, diff_thr=4.0): + """diff_thr 调低一点更敏感,避免跳过字幕变化""" + if last_roi is None: + return True + a = cv2.cvtColor(cur_roi, cv2.COLOR_BGR2GRAY) + b = cv2.cvtColor(last_roi, cv2.COLOR_BGR2GRAY) + if a.shape != b.shape: + b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_AREA) + diff = np.mean(np.abs(a.astype(np.float32) - b.astype(np.float32))) + return diff >= diff_thr + + +def _even(x: int) -> int: + return x - (x % 2) + + +def _parse_cropdetect(stderr: str): + m_last = None + for line in stderr.splitlines(): + m = re.search(r"crop=(\d+):(\d+):(\d+):(\d+)", line) + if m: + m_last = m + if not m_last: + return None + w, h, x, y = map(int, m_last.groups()) + return (_even(w), _even(h), _even(x), _even(y)) + + +def _deborder_ffmpeg(ffmpeg_path: str, in_video: str, out_video: str, logger): + cmd1 = [ + ffmpeg_path, "-hide_banner", "-y", + "-ss", "0", "-i", in_video, "-t", "2", + "-vf", "cropdetect=24:16:0", + "-f", "null", "-" + ] + logger.info("cropdetect cmd: " + " ".join(cmd1)) + p1 = subprocess.run(cmd1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + crop = _parse_cropdetect(p1.stderr) + if not crop: + logger.warning("cropdetect found nothing, keep original (copy).") + cmdc = [ffmpeg_path, "-hide_banner", "-y", "-i", in_video, "-c", "copy", out_video] + p = subprocess.run(cmdc, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"ffmpeg copy failed.\n{p.stderr}") + return None + + w, h, x, y = crop + cmd2 = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", in_video, + "-vf", f"crop={w}:{h}:{x}:{y}", + "-c:v", "libx264", "-preset", "veryfast", "-crf", "23", 
"-pix_fmt", "yuv420p", + "-c:a", "copy", + out_video + ] + logger.info("crop cmd: " + " ".join(cmd2)) + p2 = subprocess.run(cmd2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p2.returncode != 0: + raise RuntimeError(f"ffmpeg crop failed.\n{p2.stderr}") + return {"w": w, "h": h, "x": x, "y": y} + + +def _extract_texts_from_any(res): + """ + 兼容 PaddleOCR 多种返回: + - 传统:res = [ [ [box,(text,score)], ... ] ] + - 新 pipeline/dict:res 可能是 dict/对象,里头有 'rec_texts'/'rec_scores' 或 'texts'/'scores' + 返回: list[(text,score)] + """ + out = [] + + # dict 风格 + if isinstance(res, dict): + keys_text = ["rec_texts", "texts", "text"] + keys_score = ["rec_scores", "scores", "score"] + texts = None + scores = None + for kt in keys_text: + if kt in res: + texts = res[kt] + break + for ks in keys_score: + if ks in res: + scores = res[ks] + break + + if texts is not None: + if isinstance(texts, str): + out.append((texts, float(scores) if scores is not None else 0.0)) + return out + if isinstance(texts, (list, tuple)): + if scores is None: + for t in texts: + out.append((str(t), 0.0)) + else: + if isinstance(scores, (list, tuple)) and len(scores) == len(texts): + for t, s in zip(texts, scores): + out.append((str(t), float(s))) + else: + for t in texts: + out.append((str(t), float(scores) if scores is not None else 0.0)) + return out + + if "result" in res: + return _extract_texts_from_any(res["result"]) + + # list 风格(传统) + if isinstance(res, list): + if len(res) == 0: + return out + + if isinstance(res[0], dict): + for item in res: + out.extend(_extract_texts_from_any(item)) + return out + + lines = res[0] if isinstance(res[0], list) else res + for line in lines: + try: + if isinstance(line, (list, tuple)) and len(line) >= 2: + info = line[1] + if isinstance(info, (list, tuple)) and len(info) >= 2: + out.append((str(info[0]), float(info[1]))) + elif isinstance(info, str): + out.append((info, 0.0)) + except Exception: + continue + return out + + # 兜底 + try: + s = str(res) + 
if s: + out.append((s, 0.0)) + except Exception: + pass + return out + + +class VideoSubtitleOCR: + """字幕 OCR(自动去黑边 + 固定下30% + 英文空格修复 + 去重合并) + + params: + - preprocess_deborder: bool, default True + - sample_fps: float, default 1.0 + - max_frames: int, default 240 + - subtitle_ratio: float, default 0.30 + - ocr_lang: ch|en, default ch + - min_score: float, default 0.0 + - roi_diff_thr: float, default 4.0 + - gap_merge_sec: float, default 1.2 # ✅ 更容易合并跨帧字幕 + - fix_english_space: bool, default True # ✅ 英文空格修复开关 + + outputs: + - artifacts/subtitles.json + - artifacts/subtitles.srt + - artifacts/frames/subtitle_*.jpg + - artifacts/deborder.mp4 (if preprocess_deborder=True) + """ + + @staticmethod + def execute(sample, params): + os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True") + + in_video = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + op_name = "video_subtitle_ocr" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger(op_name, log_dir) + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found") + + # ✅ 默认自动去黑边 + if params.get("preprocess_deborder", True): + deborder_mp4 = os.path.join(art_dir, "deborder.mp4") + crop = _deborder_ffmpeg(ffmpeg_path, in_video, deborder_mp4, logger) + with open(os.path.join(art_dir, "deborder_crop.json"), "w", encoding="utf-8") as f: + json.dump({"crop": crop, "deborder_mp4": deborder_mp4}, f, ensure_ascii=False, indent=2) + src_video = deborder_mp4 + else: + src_video = in_video + + logger.info(f"video={src_video}") + logger.info(f"out_dir={out_dir}") + + from paddleocr import PaddleOCR + ocr_lang = params.get("ocr_lang", "ch") + ocr = PaddleOCR(use_angle_cls=True, lang=ocr_lang) + + fps, w, h, total = get_video_info(src_video) + 
sample_fps = float(params.get("sample_fps", 1.0)) + max_frames = int(params.get("max_frames", 240)) + subtitle_ratio = float(params.get("subtitle_ratio", 0.30)) + min_score = float(params.get("min_score", 0.0)) + roi_diff_thr = float(params.get("roi_diff_thr", 4.0)) + gap_merge = float(params.get("gap_merge_sec", 1.2)) + fix_en_space = bool(params.get("fix_english_space", True)) + + step = max(1, int(round(fps / max(sample_fps, 0.0001)))) + idxs = list(range(0, total, step)) + if max_frames and len(idxs) > max_frames: + n = max_frames + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + + cap = cv2.VideoCapture(src_video) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {src_video}") + + raw_hits = [] + last_roi = None + + for k, fi in enumerate(idxs): + cap.set(cv2.CAP_PROP_POS_FRAMES, int(fi)) + ok, frame = cap.read() + if not ok or frame is None: + continue + + t = float(fi / fps) if fps else 0.0 + y0 = int(h * (1.0 - subtitle_ratio)) + roi = frame[y0:h, 0:w] + + if not _roi_changed(roi, last_roi, diff_thr=roi_diff_thr): + continue + last_roi = roi + + jpg_path = os.path.join(frames_dir, f"subtitle_{int(fi):06d}.jpg") + cv2.imwrite(jpg_path, roi) + + res = ocr.ocr(roi) + pairs = _extract_texts_from_any(res) + texts = [txt for (txt, sc) in pairs if txt and float(sc) >= min_score] + + text = _clean_text(" ".join(texts)) + if fix_en_space: + text = _fix_english_spacing(text) + + if text: + raw_hits.append({"t": t, "text": text, "key": _norm_sub_key(text), "frame_id": int(fi), "jpg": jpg_path}) + + if (k + 1) % 20 == 0 or k == len(idxs) - 1: + logger.info(f"[{k+1}/{len(idxs)}] frame={fi} hit={1 if text else 0} len={len(text)}") + + cap.release() + + # ✅ 合并相邻相同字幕(按规范化 key 合并) + segments = [] + for hit in raw_hits: + if not segments: + segments.append({ + "start": hit["t"], + "end": hit["t"], + "text": hit["text"], + "key": hit["key"], + "evidence": [{"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}], + }) + 
continue + + last = segments[-1] + if hit["key"] == last["key"] and (hit["t"] - last["end"] <= gap_merge): + last["end"] = hit["t"] + last["evidence"].append({"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}) + else: + segments.append({ + "start": hit["t"], + "end": hit["t"], + "text": hit["text"], + "key": hit["key"], + "evidence": [{"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}], + }) + + # end 往后延一点,srt 更自然 + for seg in segments: + seg["end"] = float(seg["end"] + max(0.4, 1.0 / max(sample_fps, 0.1))) + + # 输出时不需要 key(但保留也无所谓;你想更干净就删掉) + json_path = os.path.join(art_dir, "subtitles.json") + srt_path = os.path.join(art_dir, "subtitles.srt") + with open(json_path, "w", encoding="utf-8") as f: + json.dump({"segments": segments}, f, ensure_ascii=False, indent=2) + _write_srt(segments, srt_path) + + logger.info(f"Done. subtitles={len(segments)} srt={srt_path}") + return {"out_dir": out_dir, "subtitles_json": json_path, "subtitles_srt": srt_path, "count": len(segments)} \ No newline at end of file diff --git a/runtime/ops/mapper/video_summary_qwenvl/__init__.py b/runtime/ops/mapper/video_summary_qwenvl/__init__.py new file mode 100644 index 00000000..6e9386f9 --- /dev/null +++ b/runtime/ops/mapper/video_summary_qwenvl/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSummaryQwenVL", + module_path="ops.mapper.video_summary_qwenvl.process", +) diff --git a/runtime/ops/mapper/video_summary_qwenvl/metadata.yml b/runtime/ops/mapper/video_summary_qwenvl/metadata.yml new file mode 100644 index 00000000..5f34dd41 --- /dev/null +++ b/runtime/ops/mapper/video_summary_qwenvl/metadata.yml @@ -0,0 +1,16 @@ +name: '视频摘要(QwenVL)' +name_en: 'Video Summary (QwenVL)' +description: '抽多帧拼 montage,只调用一次 QwenVL summary,输出 summary.json(含 montage.jpg 与证据帧)。' +description_en: 'Build montage from sampled frames, call QwenVL summary once; outputs summary.json with montage and evidence.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSummaryQwenVL' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_summary_qwenvl/process.py b/runtime/ops/mapper/video_summary_qwenvl/process.py new file mode 100644 index 00000000..8119d5fb --- /dev/null +++ b/runtime/ops/mapper/video_summary_qwenvl/process.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 +import math +import importlib +import re + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info + +_qwen = importlib.import_module("tools.qwen_sensitive") +qwenvl_infer = _qwen.qwenvl_infer + + +def _sample_frame_indices(total_frames: int, fps: float, sample_fps: float, max_frames: int): + """按 sample_fps 抽帧,然后均匀下采样到 max_frames。""" + if total_frames <= 0: + return [] + fps = float(fps) if fps else 25.0 + step = max(1, int(round(fps / max(float(sample_fps), 0.0001)))) + idxs = list(range(0, total_frames, step)) + if max_frames and len(idxs) > int(max_frames): + n = int(max_frames) + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + return idxs + + +def _make_montage(frames_bgr, cell_w=384, cell_h=216, max_cols=4, bg=0): + """把多帧拼成一张图(montage),用于“基于多帧生成总摘要”。 + - frames_bgr: List[np.ndarray(BGR)] + """ + if not frames_bgr: + return None + + n = len(frames_bgr) + cols = min(max_cols, n) + rows = int(math.ceil(n / cols)) + + montage = (bg * np.ones((rows * cell_h, cols * cell_w, 3), dtype=frames_bgr[0].dtype)) # noqa + + for i, fr in enumerate(frames_bgr): + r = i // cols + c = i % cols + # resize to cell + fr_r = cv2.resize(fr, (cell_w, cell_h), interpolation=cv2.INTER_AREA) + y0 = r * cell_h + x0 = c * cell_w + montage[y0:y0 + cell_h, x0:x0 + cell_w] = fr_r + + return montage + + +def _squeeze_whitespace(text: 
str) -> str: + """把多余空白(包括 \\n)压成一个空格,变成“一段话”更易读。""" + if not text: + return "" + t = re.sub(r"\s+", " ", text).strip() + return t + + +# numpy 在 montage 中用到(避免你环境里没装导致 import 报错:你现在 qwen 环境肯定有 numpy,但 datamate 环境也应有) +import numpy as np # noqa: E402 + + +class VideoSummaryQwenVL: + """视频文本概括(QwenVL,多帧总摘要) + + 核心:抽多帧 -> 拼成 montage -> 只调用一次 task=summary -> 得到“总摘要” + + params: + - sample_fps: float, default 1.0 + - max_frames: int, default 12 + - language: zh|en, default zh + - style: short|normal|detail, default normal + - max_new_tokens: int, default 160 + - montage_cell_w: int, default 384 + - montage_cell_h: int, default 216 + - montage_max_cols: int, default 4 + + outputs: + - artifacts/summary.json: {summary, evidence:[{frame_id,jpg}], montage_jpg} + """ + + @staticmethod + def execute(sample, params): + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + op_name = "video_summary_qwenvl" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + + logger = get_logger(op_name, log_dir) + logger.info(f"video={video_path}") + logger.info(f"out_dir={out_dir}") + + fps, w, h, total = get_video_info(video_path) + + sample_fps = float(params.get("sample_fps", 1.0)) + max_frames = int(params.get("max_frames", 12)) + language = params.get("language", "zh") + style = params.get("style", "normal") + max_new_tokens = int(params.get("max_new_tokens", 160)) + + cell_w = int(params.get("montage_cell_w", 384)) + cell_h = int(params.get("montage_cell_h", 216)) + max_cols = int(params.get("montage_max_cols", 4)) + + idxs = _sample_frame_indices(total, fps, sample_fps, max_frames) + logger.info(f"fps={fps:.3f}, frames={total}, idxs={len(idxs)}, style={style}, lang={language}") + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: 
{video_path}") + + frames_for_montage = [] + evidence = [] + + for k, fi in enumerate(idxs): + cap.set(cv2.CAP_PROP_POS_FRAMES, int(fi)) + ok, frame = cap.read() + if not ok or frame is None: + continue + + jpg_path = os.path.join(frames_dir, f"frame_{int(fi):06d}.jpg") + cv2.imwrite(jpg_path, frame) + + frames_for_montage.append(frame) + evidence.append({"frame_id": int(fi), "jpg": jpg_path}) + + logger.info(f"[{k+1}/{len(idxs)}] frame={fi} collected") + + cap.release() + + montage = _make_montage(frames_for_montage, cell_w=cell_w, cell_h=cell_h, max_cols=max_cols, bg=0) + if montage is None: + result = { + "summary": "", + "evidence": evidence, + "montage_jpg": "", + "meta": {"fps": float(fps), "width": int(w), "height": int(h), "total_frames": int(total)} + } + else: + montage_path = os.path.join(art_dir, "montage.jpg") + cv2.imwrite(montage_path, montage) + + # ✅ 只调用一次:基于“多帧拼图”生成总摘要 + resp = qwenvl_infer( + montage, + task="summary", + language=language, + style=style, + max_new_tokens=max_new_tokens, + timeout=180, + ) + summary = _squeeze_whitespace((resp.get("summary") or "").strip()) + + result = { + "summary": summary, + "evidence": evidence, + "montage_jpg": montage_path, + "meta": {"fps": float(fps), "width": int(w), "height": int(h), "total_frames": int(total)} + } + + json_path = os.path.join(art_dir, "summary.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
summary_json={json_path}, summary_len={len(result.get('summary',''))}") + return {"out_dir": out_dir, "summary_json": json_path, "summary": result.get("summary", "")} \ No newline at end of file diff --git a/runtime/ops/mapper/video_text_ocr/__init__.py b/runtime/ops/mapper/video_text_ocr/__init__.py new file mode 100644 index 00000000..74283ffb --- /dev/null +++ b/runtime/ops/mapper/video_text_ocr/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoTextOCR", + module_path="ops.mapper.video_text_ocr.process", +) diff --git a/runtime/ops/mapper/video_text_ocr/metadata.yml b/runtime/ops/mapper/video_text_ocr/metadata.yml new file mode 100644 index 00000000..5f911d60 --- /dev/null +++ b/runtime/ops/mapper/video_text_ocr/metadata.yml @@ -0,0 +1,16 @@ +name: '视频显著文字OCR提取' +name_en: 'Video Text OCR' +description: '对视频上方/主要区域显著文字进行OCR识别,输出 text_ocr.json;可选自动去黑边、抽帧、跳过相似帧。' +description_en: 'OCR for salient texts on main/top region, outputs text_ocr.json; optional deborder, sampling, frame skipping.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoTextOCR' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_text_ocr/process.py b/runtime/ops/mapper/video_text_ocr/process.py new file mode 100644 index 00000000..5995250c --- /dev/null +++ b/runtime/ops/mapper/video_text_ocr/process.py @@ -0,0 +1,254 @@ +# -*- coding: utf-8 -*- +import os +import json +import re +import shutil +import subprocess +import cv2 +import numpy as np +from collections import Counter + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info + + +def _clean_text(t: str) -> str: + if not t: + return "" + t = t.strip() + t = re.sub(r"\s+", " ", t) + return t + + +def _roi_changed(cur_roi, last_roi, diff_thr=6.0): + if last_roi is None: + return True + a = cv2.cvtColor(cur_roi, cv2.COLOR_BGR2GRAY) + b = cv2.cvtColor(last_roi, cv2.COLOR_BGR2GRAY) + if a.shape != b.shape: + b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_AREA) + diff = np.mean(np.abs(a.astype(np.float32) - b.astype(np.float32))) + return diff >= diff_thr + + +def _even(x: int) -> int: + return x - (x % 2) + + +def _parse_cropdetect(stderr: str): + m_last = None + for line in stderr.splitlines(): + m = re.search(r"crop=(\d+):(\d+):(\d+):(\d+)", line) + if m: + m_last = m + if not m_last: + return None + w, h, x, y = map(int, m_last.groups()) + return (_even(w), _even(h), _even(x), _even(y)) + + +def _deborder_ffmpeg(ffmpeg_path: str, in_video: str, out_video: str, logger): + cmd1 = [ + ffmpeg_path, "-hide_banner", "-y", + "-ss", "0", "-i", in_video, "-t", "2", + "-vf", "cropdetect=24:16:0", + "-f", "null", "-" + ] + logger.info("cropdetect cmd: " + " ".join(cmd1)) + p1 = subprocess.run(cmd1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + crop = 
_parse_cropdetect(p1.stderr) + if not crop: + logger.warning("cropdetect found nothing, keep original (copy).") + cmdc = [ffmpeg_path, "-hide_banner", "-y", "-i", in_video, "-c", "copy", out_video] + p = subprocess.run(cmdc, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"ffmpeg copy failed.\n{p.stderr}") + return None + + w, h, x, y = crop + cmd2 = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", in_video, + "-vf", f"crop={w}:{h}:{x}:{y}", + "-c:v", "libx264", "-preset", "veryfast", "-crf", "23", "-pix_fmt", "yuv420p", + "-c:a", "copy", + out_video + ] + logger.info("crop cmd: " + " ".join(cmd2)) + p2 = subprocess.run(cmd2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p2.returncode != 0: + raise RuntimeError(f"ffmpeg crop failed.\n{p2.stderr}") + return {"w": w, "h": h, "x": x, "y": y} + + +def _extract_texts_from_any(res): + out = [] + if isinstance(res, dict): + for kt in ["rec_texts", "texts", "text"]: + if kt in res: + texts = res[kt] + scores = res.get("rec_scores", res.get("scores", res.get("score", None))) + if isinstance(texts, str): + out.append((texts, float(scores) if scores is not None else 0.0)) + return out + if isinstance(texts, (list, tuple)): + if isinstance(scores, (list, tuple)) and len(scores) == len(texts): + for t, s in zip(texts, scores): + out.append((str(t), float(s))) + else: + for t in texts: + out.append((str(t), float(scores) if scores is not None else 0.0)) + return out + if "result" in res: + return _extract_texts_from_any(res["result"]) + + if isinstance(res, list): + if len(res) == 0: + return out + if isinstance(res[0], dict): + for item in res: + out.extend(_extract_texts_from_any(item)) + return out + lines = res[0] if isinstance(res[0], list) else res + for line in lines: + try: + if isinstance(line, (list, tuple)) and len(line) >= 2: + info = line[1] + if isinstance(info, (list, tuple)) and len(info) >= 2: + out.append((str(info[0]), float(info[1]))) + 
elif isinstance(info, str): + out.append((info, 0.0)) + except Exception: + continue + return out + + try: + s = str(res) + if s: + out.append((s, 0.0)) + except Exception: + pass + return out + + +def _is_garbage_text(t: str) -> bool: + if not t: + return True + s = t.replace(" ", "") + if len(s) < 2: + return True + letters = sum(c.isalpha() for c in s) + if letters / len(s) > 0.9: + uniq = len(set(s.lower())) + if uniq <= 5: + return True + cnt = Counter(s.lower()) + most = cnt.most_common(1)[0][1] + if most / len(s) > 0.65: + return True + return False + + +class VideoTextOCR: + """显著文字 OCR(自动去黑边 + 上70%) + + params: + - preprocess_deborder: bool, default True + - sample_fps: float, default 0.5 + - max_frames: int, default 120 + - top_ratio: float, default 0.70 + - ocr_lang: ch|en, default ch + - min_score: float, default 0.0 + - roi_diff_thr: float, default 6.0 + """ + + @staticmethod + def execute(sample, params): + os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True") + + in_video = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + op_name = "video_text_ocr" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger(op_name, log_dir) + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found") + + if params.get("preprocess_deborder", True): + deborder_mp4 = os.path.join(art_dir, "deborder.mp4") + crop = _deborder_ffmpeg(ffmpeg_path, in_video, deborder_mp4, logger) + with open(os.path.join(art_dir, "deborder_crop.json"), "w", encoding="utf-8") as f: + json.dump({"crop": crop, "deborder_mp4": deborder_mp4}, f, ensure_ascii=False, indent=2) + src_video = deborder_mp4 + else: + src_video = in_video + + logger.info(f"video={src_video}") + logger.info(f"out_dir={out_dir}") 
+ + from paddleocr import PaddleOCR + ocr_lang = params.get("ocr_lang", "ch") + ocr = PaddleOCR(use_angle_cls=True, lang=ocr_lang) + + fps, w, h, total = get_video_info(src_video) + sample_fps = float(params.get("sample_fps", 0.5)) + max_frames = int(params.get("max_frames", 120)) + top_ratio = float(params.get("top_ratio", 0.70)) + min_score = float(params.get("min_score", 0.0)) + roi_diff_thr = float(params.get("roi_diff_thr", 6.0)) + + step = max(1, int(round(fps / max(sample_fps, 0.0001)))) + idxs = list(range(0, total, step)) + if max_frames and len(idxs) > max_frames: + n = max_frames + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + + cap = cv2.VideoCapture(src_video) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {src_video}") + + hits = [] + last_roi = None + + for k, fi in enumerate(idxs): + cap.set(cv2.CAP_PROP_POS_FRAMES, int(fi)) + ok, frame = cap.read() + if not ok or frame is None: + continue + + t = float(fi / fps) if fps else 0.0 + y1 = int(h * top_ratio) + roi = frame[0:y1, 0:w] + + if not _roi_changed(roi, last_roi, diff_thr=roi_diff_thr): + continue + last_roi = roi + + jpg_path = os.path.join(frames_dir, f"text_{int(fi):06d}.jpg") + cv2.imwrite(jpg_path, roi) + + res = ocr.ocr(roi) + pairs = _extract_texts_from_any(res) + texts = [txt for (txt, sc) in pairs if txt and float(sc) >= min_score] + text = _clean_text(" ".join(texts)) + + if text and (not _is_garbage_text(text)): + hits.append({"t": t, "frame_id": int(fi), "text": text, "jpg": jpg_path}) + + if (k + 1) % 20 == 0 or k == len(idxs) - 1: + logger.info(f"[{k+1}/{len(idxs)}] frame={fi} hit={1 if text else 0} len={len(text)}") + + cap.release() + + json_path = os.path.join(art_dir, "text_ocr.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump({"hits": hits}, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
hits={len(hits)}") + return {"out_dir": out_dir, "text_ocr_json": json_path, "count": len(hits)} \ No newline at end of file From 4dfc1315215d2fd6ad712fcd669a274564b5774a Mon Sep 17 00:00:00 2001 From: guotingxuan5599 <1321352073@qq.com> Date: Fri, 6 Mar 2026 10:48:51 +0800 Subject: [PATCH 4/4] feat: video operators PR1-PR5 (model repo paths + http qwen + ocr/asr/yolo updates) --- .../ops/mapper/_video_common/model_paths.py | 28 +++ .../mapper/_video_common/qwen_http_client.py | 42 ++++ .../mapper/video_classify_qwenvl/process.py | 122 ++++++----- .../mapper/video_event_tag_qwenvl/process.py | 200 ++++++++++-------- runtime/ops/mapper/video_mot_track/process.py | 113 +++++----- .../mapper/video_sensitive_detect/process.py | 115 +++++----- .../ops/mapper/video_subtitle_ocr/process.py | 37 +++- .../mapper/video_summary_qwenvl/process.py | 179 +++++++--------- runtime/ops/mapper/video_text_ocr/process.py | 36 +++- 9 files changed, 506 insertions(+), 366 deletions(-) create mode 100644 runtime/ops/mapper/_video_common/model_paths.py create mode 100644 runtime/ops/mapper/_video_common/qwen_http_client.py diff --git a/runtime/ops/mapper/_video_common/model_paths.py b/runtime/ops/mapper/_video_common/model_paths.py new file mode 100644 index 00000000..1a9bffe2 --- /dev/null +++ b/runtime/ops/mapper/_video_common/model_paths.py @@ -0,0 +1,28 @@ +import os + +def get_model_root(params=None) -> str: + """ + 模型根目录优先级: + 1) params['model_root'] + 2) 环境变量 DATAMATE_MODEL_ROOT + 3) 默认 /mnt/models + """ + params = params or {} + return params.get("model_root") or os.environ.get("DATAMATE_MODEL_ROOT") or "/mnt/models" + + +def resolve_model_path(params, param_key: str, default_rel: str) -> str: + """ + 解析模型路径: + - 如果 params[param_key] 是绝对路径:直接用 + - 如果是相对路径:拼到 model_root + - 如果没传:用 model_root + default_rel + """ + params = params or {} + root = get_model_root(params) + + v = params.get(param_key) + if v: + return v if os.path.isabs(v) else os.path.join(root, v) + + return 
os.path.join(root, default_rel) \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/qwen_http_client.py b/runtime/ops/mapper/_video_common/qwen_http_client.py new file mode 100644 index 00000000..b6640f69 --- /dev/null +++ b/runtime/ops/mapper/_video_common/qwen_http_client.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 +import requests + +def qwenvl_infer_by_image_path( + image_path: str, + task: str, + service_url: str = "http://127.0.0.1:18080", + max_new_tokens: int = 64, + language: str = "zh", + style: str = "normal", + timeout: int = 180, +): + """ + 对齐你当前服务端 qwen_vl_server.py 的接口: + POST {service_url}/infer + JSON: {image_path, task, max_new_tokens, language, style} + + 返回:服务端 jsonify 的 dict + """ + sess = requests.Session() + sess.trust_env = False # 避免系统代理拦 localhost + + payload = { + "image_path": image_path, + "task": task, + "max_new_tokens": int(max_new_tokens), + "language": language, + "style": style, + } + r = sess.post(service_url.rstrip("/") + "/infer", json=payload, timeout=timeout) + r.raise_for_status() + return r.json() + +def save_frame_to_jpg(frame_bgr, out_path: str): + os.makedirs(os.path.dirname(out_path), exist_ok=True) + ok = cv2.imwrite(out_path, frame_bgr) + if not ok: + raise RuntimeError(f"failed to write jpg: {out_path}") + return out_path \ No newline at end of file diff --git a/runtime/ops/mapper/video_classify_qwenvl/process.py b/runtime/ops/mapper/video_classify_qwenvl/process.py index 9638043b..48a0a608 100644 --- a/runtime/ops/mapper/video_classify_qwenvl/process.py +++ b/runtime/ops/mapper/video_classify_qwenvl/process.py @@ -3,31 +3,18 @@ import json import collections import cv2 -import importlib from .._video_common.paths import make_run_dir, ensure_dir from .._video_common.log import get_logger from .._video_common.io_video import get_video_info - -_qwen = importlib.import_module("tools.qwen_sensitive") -qwenvl_infer = _qwen.qwenvl_infer - - -CLASS25 = [ - "日常生活", 
class VideoClassifyQwenVL:
    """Frame sampling + QwenVL HTTP classification (server task=classify25).

    Each sampled frame is written to JPEG and sent to the QwenVL HTTP
    service; the per-frame labels are majority-voted into a final top-1.

    params:
      - service_url: default http://127.0.0.1:18080
      - timeout_sec: default 180
      - sample_fps: default 1.0
      - max_frames: default 12
      - return_topk: default 3
      - max_new_tokens: default 16
    outputs:
      - artifacts/classification.json
    returns:
      {out_dir, classification_json, top1}
    """

    def execute(self, sample, params=None):
        params = params or {}
        video_path = sample["filePath"]
        export_path = sample.get("export_path", "./outputs")

        out_dir = make_run_dir(export_path, "video_classify_qwenvl")
        log_dir = ensure_dir(os.path.join(out_dir, "logs"))
        art_dir = ensure_dir(os.path.join(out_dir, "artifacts"))
        frames_dir = ensure_dir(os.path.join(art_dir, "frames"))
        logger = get_logger("VideoClassifyQwenVL", log_dir)

        service_url = params.get("service_url", "http://127.0.0.1:18080")
        timeout_sec = int(params.get("timeout_sec", 180))
        sample_fps = float(params.get("sample_fps", 1.0))
        max_frames = int(params.get("max_frames", 12))
        return_topk = int(params.get("return_topk", 3))
        max_new_tokens = int(params.get("max_new_tokens", 16))

        fps, W, H, total_frames = get_video_info(video_path)
        idxs = _sample_frame_indices(total_frames, fps, sample_fps, max_frames)

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: {video_path}")

        votes = collections.Counter()
        evidence = []
        try:
            for idx in idxs:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ok, frame = cap.read()
                if not ok:
                    continue

                frame_jpg = os.path.join(frames_dir, f"{idx:06d}.jpg")
                save_frame_to_jpg(frame, frame_jpg)

                try:
                    res = qwenvl_infer_by_image_path(
                        image_path=frame_jpg,
                        task="classify25",
                        service_url=service_url,
                        max_new_tokens=max_new_tokens,
                        timeout=timeout_sec,
                    )
                except Exception as e:
                    logger.error(f"classify infer failed frame={idx}: {repr(e)}")
                    continue

                # BUG FIX: the service may return class_name/class_id as null;
                # int(None) raised TypeError. Fall back to the catch-all
                # class (25 / "其他") like the pre-refactor code did.
                class_name = (res.get("class_name") or "其他").strip() or "其他"
                class_id = int(res.get("class_id") or 25)
                votes[class_name] += 1
                evidence.append(
                    {
                        "frame_idx": idx,
                        "image_path": frame_jpg,
                        "class_id": class_id,
                        "class_name": class_name,
                    }
                )
        finally:
            # Release the capture even if an unexpected error escapes the loop.
            cap.release()

        topk = [{"label": k, "vote": int(v)} for k, v in votes.most_common(return_topk)]
        top1 = topk[0]["label"] if topk else "其他"

        result = {
            "top1": top1,
            "topk": topk,
            "service_url": service_url,
            "sample_fps": sample_fps,
            "max_frames": max_frames,
            "evidence": evidence,
        }

        json_path = os.path.join(art_dir, "classification.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        logger.info(f"Done. classification_json={json_path}, top1={top1}")
        return {"out_dir": out_dir, "classification_json": json_path, "top1": top1}
classification_json={json_path}, top1={top1}") + return {"out_dir": out_dir, "classification_json": json_path, "top1": top1} \ No newline at end of file diff --git a/runtime/ops/mapper/video_event_tag_qwenvl/process.py b/runtime/ops/mapper/video_event_tag_qwenvl/process.py index 78264c88..43974f27 100644 --- a/runtime/ops/mapper/video_event_tag_qwenvl/process.py +++ b/runtime/ops/mapper/video_event_tag_qwenvl/process.py @@ -2,120 +2,134 @@ import os import json import cv2 -import importlib from .._video_common.paths import make_run_dir, ensure_dir from .._video_common.log import get_logger from .._video_common.io_video import get_video_info +from .._video_common.qwen_http_client import qwenvl_infer_by_image_path, save_frame_to_jpg -_qwen = importlib.import_module("tools.qwen_sensitive") -qwenvl_infer = _qwen.qwenvl_infer +def _make_segments(duration_sec: float, params: dict): + adaptive = bool(params.get("adaptive_segment", True)) + max_segments = int(params.get("max_segments", 60)) + max_new_tokens = int(params.get("max_new_tokens", 32)) -def _clamp(x, lo, hi): - return max(lo, min(hi, x)) + if duration_sec <= 0: + return [(0.0, 0.0)] + if not adaptive: + seg_len = float(params.get("segment_seconds", 5.0)) + else: + target = int(params.get("target_segments", 12)) + min_seg = float(params.get("min_segment_seconds", 2.0)) + max_seg = float(params.get("max_segment_seconds", 60.0)) + seg_len = duration_sec / max(1, target) + seg_len = max(min_seg, min(max_seg, seg_len)) -class VideoEventTagQwenVL: - """事件标注(自适应分段) + segs = [] + s = 0.0 + while s < duration_sec and len(segs) < max_segments: + e = min(duration_sec, s + seg_len) + segs.append((s, e)) + s = e + return segs - 目标(默认参数下): - - 8 秒视频 -> 大约 4 段(≈2s/段) - - 120 秒视频 -> 大约 12 段(≈10s/段) - - 600 秒视频 -> 大约 12 段(≈50s/段) - params: - - adaptive_segment: bool, default True - - target_segments: int, default 12 - - min_segment_seconds: float, default 2.0 - - max_segment_seconds: float, default 60.0 - - segment_seconds: 
float, optional(手动覆盖;当 adaptive_segment=False 时使用) - - max_segments: int, default 60 - - max_new_tokens: int, default 32 +class VideoEventTagQwenVL: + """ + 分段取中点帧 → QwenVL HTTP 事件标注(对齐服务端 task=event_tag): + 返回: {event} + params: + - service_url: 默认 http://127.0.0.1:18080 + - timeout_sec: 默认 180 + - adaptive_segment: 默认 True + - target_segments: 默认 12 + - min_segment_seconds: 默认 2.0 + - max_segment_seconds: 默认 60.0 + - segment_seconds: 默认 5.0(当 adaptive_segment=false 时) + - max_segments: 默认 60 + - max_new_tokens: 默认 32 outputs: - - artifacts/events.json: [{start, end, event, evidence:{frame_id,jpg}}] + - artifacts/events.json + - artifacts/frames/*.jpg """ - @staticmethod - def execute(sample, params): + def execute(self, sample, params=None): + params = params or {} video_path = sample["filePath"] export_path = sample.get("export_path", "./outputs") - op_name = "video_event_tag_qwenvl" - out_dir = make_run_dir(export_path, op_name) + out_dir = make_run_dir(export_path, "video_event_tag_qwenvl") log_dir = ensure_dir(os.path.join(out_dir, "logs")) art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger("VideoEventTagQwenVL", log_dir) - logger = get_logger(op_name, log_dir) - logger.info(f"video={video_path}") - logger.info(f"out_dir={out_dir}") - - fps, w, h, total = get_video_info(video_path) - duration = (total / fps) if fps else 0.0 - - adaptive = bool(params.get("adaptive_segment", True)) - target_segments = int(params.get("target_segments", 12)) - min_seg_s = float(params.get("min_segment_seconds", 2.0)) - max_seg_s = float(params.get("max_segment_seconds", 60.0)) - - if adaptive: - # seg_s = duration / target_segments,并 clamp 到[min_seg_s, max_seg_s] - seg_s = _clamp(duration / max(1, target_segments), min_seg_s, max_seg_s) - else: - seg_s = float(params.get("segment_seconds", 5.0)) - - max_segments = int(params.get("max_segments", 60)) + service_url = params.get("service_url", 
"http://127.0.0.1:18080") + timeout_sec = int(params.get("timeout_sec", 180)) max_new_tokens = int(params.get("max_new_tokens", 32)) - logger.info( - f"fps={fps:.3f}, frames={total}, duration={duration:.2f}s, " - f"adaptive={adaptive}, segment_seconds={seg_s:.2f}, target_segments={target_segments}" - ) - - if duration <= 0: - events = [] - else: - nseg = int(duration // seg_s) + 1 - nseg = min(nseg, max_segments) - - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - raise RuntimeError(f"Cannot open video: {video_path}") - - events = [] - for i in range(nseg): - start = i * seg_s - end = min(duration, (i + 1) * seg_s) - mid = (start + end) / 2.0 - frame_id = int(mid * fps) - - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id) - ok, frame = cap.read() - if not ok or frame is None: - continue - - jpg_path = os.path.join(frames_dir, f"seg_{i:03d}_frame_{frame_id:06d}.jpg") - cv2.imwrite(jpg_path, frame) - - resp = qwenvl_infer(frame, task="event_tag", max_new_tokens=max_new_tokens, timeout=180) - ev = (resp.get("event") or "").strip() - - events.append({ - "start": float(start), - "end": float(end), - "event": ev, - "evidence": {"frame_id": int(frame_id), "jpg": jpg_path} - }) - - logger.info(f"[{i+1}/{nseg}] {start:.2f}-{end:.2f} mid={mid:.2f}s -> {ev}") - - cap.release() - - json_path = os.path.join(art_dir, "events.json") - with open(json_path, "w", encoding="utf-8") as f: - json.dump(events, f, ensure_ascii=False, indent=2) - - logger.info(f"Done. 
events_json={json_path}, segments={len(events)}") - return {"out_dir": out_dir, "events_json": json_path, "count": len(events), "segment_seconds": float(seg_s)} \ No newline at end of file + fps, W, H, total_frames = get_video_info(video_path) + duration_sec = (float(total_frames) / float(fps)) if fps else 0.0 + segs = _make_segments(duration_sec, params) + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + events = [] + for i, (s, e) in enumerate(segs): + mid = (s + e) / 2.0 + mid_frame = int(round(mid * float(fps))) if fps else 0 + cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame) + ok, frame = cap.read() + if not ok: + continue + + frame_jpg = os.path.join(frames_dir, f"seg_{i:04d}_mid_{mid_frame:06d}.jpg") + save_frame_to_jpg(frame, frame_jpg) + + try: + res = qwenvl_infer_by_image_path( + image_path=frame_jpg, + task="event_tag", + service_url=service_url, + max_new_tokens=max_new_tokens, + timeout=timeout_sec, + ) + event = (res.get("event") or "").strip() + except Exception as ex: + logger.error(f"event_tag infer failed seg={i} mid={mid:.2f}: {repr(ex)}") + event = "" + + events.append( + { + "seg_id": i, + "start": float(s), + "end": float(e), + "mid": float(mid), + "mid_frame": int(mid_frame), + "image_path": frame_jpg, + "event": event, + } + ) + + cap.release() + + out_json = os.path.join(art_dir, "events.json") + with open(out_json, "w", encoding="utf-8") as f: + json.dump( + { + "video": video_path, + "service_url": service_url, + "duration_sec": duration_sec, + "segments": events, + }, + f, + ensure_ascii=False, + indent=2, + ) + + logger.info(f"Done. 
class VideoMotTrack:
    """Multi-object tracking (YOLO + ByteTrack).

    Weights come from the model repo (DATAMATE_MODEL_ROOT=/mnt/models,
    default weight yolo/yolov8n.pt); resolved via resolve_model_path.

    params:
      - model_root: optional, overrides DATAMATE_MODEL_ROOT
      - yolo_model: optional weight path (relative or absolute)
      - conf: default 0.3
      - iou: default 0.5
      - classes: "0,2,3" or None
      - tracker_cfg: ByteTrack yaml path (default: operator configs/bytetrack.yaml)
      - save_debug: default True
    outputs:
      - artifacts/tracks.json
      - artifacts/debug.mp4 (optional)
    """

    def execute(self, sample: dict, params: dict = None):
        params = params or {}
        video_path = sample["filePath"]
        export_path = sample.get("export_path", "./outputs")

        out_dir = make_run_dir(export_path, "video_mot_track")
        log_dir = ensure_dir(os.path.join(out_dir, "logs"))
        art_dir = ensure_dir(os.path.join(out_dir, "artifacts"))
        logger = get_logger("VideoMotTrack", log_dir)

        # Keep YOLO's config dir inside the run dir (avoid unwritable defaults).
        os.environ.setdefault("YOLO_CONFIG_DIR", os.path.join(out_dir, "yolo_cfg"))
        os.makedirs(os.environ["YOLO_CONFIG_DIR"], exist_ok=True)

        # Default weight resolved against the model repo.
        yolo_model = resolve_model_path(params, "yolo_model", "yolo/yolov8n.pt")

        conf = float(params.get("conf", 0.3))
        iou = float(params.get("iou", 0.5))
        classes = params.get("classes", None)  # "0,2,3" or None
        # NOTE(review): the next three lines are unchanged hunk context not
        # visible in the patch; reconstructed from the docstring — confirm
        # against the full file.
        cls_list = [int(c) for c in str(classes).split(",")] if classes else None
        tracker_cfg = params.get(
            "tracker_cfg", os.path.join(os.path.dirname(__file__), "configs", "bytetrack.yaml")
        )
        save_debug = bool(params.get("save_debug", True))

        fps, W, H, _ = get_video_info(video_path)
        tracks = init_tracks_schema(video_path, fps, W, H)

        debug_path = os.path.join(art_dir, "debug.mp4")
        debug_writer = None
        if save_debug:
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            debug_writer = cv2.VideoWriter(debug_path, fourcc, fps, (W, H))

        logger.info(f"Start tracking. video={video_path}, model={yolo_model}, conf={conf}, iou={iou}, classes={classes}")
        if not os.path.exists(yolo_model):
            raise RuntimeError(f"YOLO weight not found: {yolo_model}. Please download to model repo path.")

        model = YOLO(yolo_model)
        results_iter = model.track(
            source=video_path,
            conf=conf,
            iou=iou,
            classes=cls_list,
            tracker=tracker_cfg,
            stream=True,
            verbose=False,
        )

        frame_idx = 0
        for r in results_iter:
            frame = r.orig_img
            objs = []
            # BUG FIX: on frames with no tracked objects ultralytics sets
            # r.boxes.id to None; the unguarded .cpu() crashed there. The
            # guard from the pre-refactor code is restored.
            if r.boxes is not None and r.boxes.id is not None:
                ids = r.boxes.id.cpu().numpy().tolist()
                xyxy = r.boxes.xyxy.cpu().numpy().tolist()
                confs = r.boxes.conf.cpu().numpy().tolist()
                clss = r.boxes.cls.cpu().numpy().tolist()
                for tid, bb, sc, cc in zip(ids, xyxy, confs, clss):
                    x1, y1, x2, y2 = bb
                    objs.append({
                        "track_id": int(tid),
                        "bbox": [float(x1), float(y1), float(x2), float(y2)],
                        "score": float(sc),
                        "cls_id": int(cc),
                    })
                    if debug_writer is not None:
                        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
                        cv2.putText(frame, f"id={int(tid)}", (int(x1), int(y1) - 5),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            tracks["frames"].append({"frame_idx": frame_idx, "objects": objs})
            if debug_writer is not None:
                debug_writer.write(frame)
            frame_idx += 1

        if debug_writer is not None:
            debug_writer.release()

        tracks_path = os.path.join(art_dir, "tracks.json")
        with open(tracks_path, "w", encoding="utf-8") as f:
            json.dump(tracks, f, ensure_ascii=False, indent=2)

        logger.info(f"Done. tracks_json={tracks_path}")
        out = {"out_dir": out_dir, "tracks_json": tracks_path}
        if save_debug:
            out["debug_mp4"] = debug_path
        return out
tracks_json={json_path}, debug={debug_path if save_debug else None}") - - # 返回给 runner - return { - "out_dir": out_dir, - "tracks_json": json_path, - "debug_video": debug_path if save_debug else None, - } \ No newline at end of file + logger.info(f"Done. tracks_json={tracks_path}") + out = {"out_dir": out_dir, "tracks_json": tracks_path} + if save_debug: + out["debug_mp4"] = debug_path + return out \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_detect/process.py b/runtime/ops/mapper/video_sensitive_detect/process.py index fb665676..a5707065 100644 --- a/runtime/ops/mapper/video_sensitive_detect/process.py +++ b/runtime/ops/mapper/video_sensitive_detect/process.py @@ -1,15 +1,15 @@ # -*- coding: utf-8 -*- import os import json -import importlib import cv2 -from .._video_common.paths import make_run_dir +from .._video_common.paths import make_run_dir, ensure_dir from .._video_common.log import get_logger from .._video_common.io_video import get_video_info +from .._video_common.qwen_http_client import qwenvl_infer_by_image_path, save_frame_to_jpg -def merge_times_to_segments(times, gap=1.5): +def _merge_times_to_segments(times, gap=1.5, pad=0.5): if not times: return [] times = sorted(times) @@ -20,106 +20,129 @@ def merge_times_to_segments(times, gap=1.5): if t - prev <= gap: prev = t else: - segs.append([max(0.0, s - 0.5), prev + 0.5]) + segs.append([max(0.0, s - pad), prev + pad]) s = t prev = t - segs.append([max(0.0, s - 0.5), prev + 0.5]) + segs.append([max(0.0, s - pad), prev + pad]) return segs class VideoSensitiveDetect: """ - 简单敏感检测(抽帧 + Qwen判定): - 必填 params: - - qwen_module: 例如 "tools.qwen_sensitive" - - qwen_func: 例如 "sensitive_infer" + 抽帧 + QwenVL HTTP 敏感检测(对齐 qwen_vl_server.py): - 可选 params: - - sample_fps: 默认 1 + 服务端: + POST {service_url}/infer + JSON: {image_path, task="sensitive", max_new_tokens, language, style} + 返回: {is_sensitive,label,score,reason} + + params: + - service_url: 默认 http://127.0.0.1:18080 + - timeout_sec: 
默认 180 + - sample_fps: 默认 1.0 - threshold: 默认 0.5 - merge_gap: 默认 1.5 - - prompt: 要求模型输出 JSON 的提示词 + - pad_sec: 默认 0.5 + - max_new_tokens: 默认 8 + outputs: + - out_dir/sensitive_segments.json """ + def execute(self, sample: dict, params: dict = None): params = params or {} video_path = sample["filePath"] - export_path = sample["export_path"] + export_path = sample.get("export_path", "./outputs") out_dir = make_run_dir(export_path, "video_sensitive_detect") - logger = get_logger("VideoSensitiveDetect", log_dir=out_dir) - - qwen_module = params.get("qwen_module") - qwen_func = params.get("qwen_func", "sensitive_infer") - if not qwen_module: - raise RuntimeError("params['qwen_module'] is required, e.g. 'tools.qwen_sensitive'") - - mod = importlib.import_module(qwen_module) - if not hasattr(mod, qwen_func): - raise RuntimeError(f"Function '{qwen_func}' not found in module '{qwen_module}'") - infer_fn = getattr(mod, qwen_func) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger("VideoSensitiveDetect", log_dir) + service_url = params.get("service_url", "http://127.0.0.1:18080") + timeout_sec = int(params.get("timeout_sec", 180)) sample_fps = float(params.get("sample_fps", 1.0)) threshold = float(params.get("threshold", 0.5)) merge_gap = float(params.get("merge_gap", 1.5)) - prompt = params.get( - "prompt", - "内容审核:若画面含色情/裸露、明显血腥流血、尸体、爆炸火光、持枪打斗、恐怖暴力、政治人物/政治标语,则判违规。" - "只输出JSON:{\"is_sensitive\":true/false,\"label\":\"porn|violence|blood|explosion|politics|none\",\"score\":0~1,\"reason\":\"一句话\"}" - ) + pad_sec = float(params.get("pad_sec", 0.5)) + max_new_tokens = int(params.get("max_new_tokens", 8)) - fps, W, H, nframes = get_video_info(video_path) - step = max(1, int(round(fps / sample_fps))) + fps, W, H, total_frames = get_video_info(video_path) + step = max(1, int(round(float(fps) / max(sample_fps, 1e-6)))) - logger.info(f"Start 
sensitive detect. video={video_path}, fps={fps}, sample_fps={sample_fps}, step={step}") + logger.info( + f"Start sensitive detect. video={video_path}, fps={fps}, step={step}, " + f"url={service_url}, thr={threshold}, gap={merge_gap}" + ) cap = cv2.VideoCapture(video_path) if not cap.isOpened(): raise RuntimeError(f"Cannot open video: {video_path}") hits = [] - times = [] + sensitive_times = [] frame_id = 0 while True: - ret, frame = cap.read() - if not ret: + ok, frame = cap.read() + if not ok: break if frame_id % step != 0: frame_id += 1 continue - t = frame_id / float(fps) + t = frame_id / float(fps) if fps else 0.0 + frame_jpg = os.path.join(frames_dir, f"{frame_id:06d}.jpg") + save_frame_to_jpg(frame, frame_jpg) try: - res = infer_fn(frame, prompt) + res = qwenvl_infer_by_image_path( + image_path=frame_jpg, + task="sensitive", + service_url=service_url, + max_new_tokens=max_new_tokens, + timeout=timeout_sec, + ) except Exception as e: - logger.error(f"infer failed at t={t:.2f}: {e}") + logger.error(f"infer failed at t={t:.2f}s frame={frame_id}: {repr(e)}") frame_id += 1 continue is_sensitive = bool(res.get("is_sensitive", False)) score = float(res.get("score", 0.0)) - label = str(res.get("label", "unknown")) + label = str(res.get("label", "none")) reason = str(res.get("reason", "")) - hits.append({"time": t, "is_sensitive": is_sensitive, "score": score, "label": label, "reason": reason}) + hits.append( + { + "time": t, + "frame_idx": frame_id, + "image_path": frame_jpg, + "is_sensitive": is_sensitive, + "label": label, + "score": score, + "reason": reason, + } + ) if is_sensitive and score >= threshold: - times.append(t) + sensitive_times.append(t) frame_id += 1 cap.release() - segs = merge_times_to_segments(times, gap=merge_gap) + segs = _merge_times_to_segments(sensitive_times, gap=merge_gap, pad=pad_sec) result = { "out_dir": out_dir, "video": video_path, + "service_url": service_url, "sample_fps": sample_fps, "threshold": threshold, "merge_gap": merge_gap, 
def build_paddle_ocr(params, ocr_lang: str, use_angle_cls: bool):
    """Construct a PaddleOCR instance backed by the local model repository.

    Default layout under the repo root:
      <root>/ocr/det, <root>/ocr/rec, <root>/ocr/cls
    `params['ocr_model_dir']` may override the root (relative or absolute).
    A missing required model directory raises immediately so the user
    downloads the models into the fixed repo location. Keyword arguments
    are passed only when the installed PaddleOCR version accepts them.
    """
    root = resolve_model_path(params, "ocr_model_dir", "ocr")
    model_dirs = {name: os.path.join(root, name) for name in ("det", "rec", "cls")}

    required = ["det", "rec"]
    if use_angle_cls:
        required.append("cls")
    for name in required:
        path = model_dirs[name]
        if not os.path.exists(path):
            raise RuntimeError(f"PaddleOCR model dir not found: {path}. Please download OCR models into model repo path.")

    accepted = inspect.signature(PaddleOCR.__init__).parameters
    kw = {"lang": ocr_lang}
    if "use_angle_cls" in accepted:
        kw["use_angle_cls"] = use_angle_cls
    if "det_model_dir" in accepted:
        kw["det_model_dir"] = model_dirs["det"]
    if "rec_model_dir" in accepted:
        kw["rec_model_dir"] = model_dirs["rec"]
    if use_angle_cls and "cls_model_dir" in accepted:
        kw["cls_model_dir"] = model_dirs["cls"]

    return PaddleOCR(**kw)
def _make_montage(frames, cell_w=384, cell_h=216, max_cols=4):
    """Tile a list of BGR frames into one grid image on a white background.

    Returns the montage ndarray, or None for an empty list (callers guard
    with `if frames:` before using the result).
    """
    # BUG FIX: empty input previously hit ceil(0/0) -> ZeroDivisionError.
    if not frames:
        return None
    n = len(frames)
    cols = min(max_cols, n)
    rows = int(math.ceil(n / cols))

    # BUG FIX: the canvas was built by multiplying an *uninitialized*
    # cv2.UMat through cvtColor before overwriting it — non-deterministic
    # reads and wasted work. Derive a white uint8 canvas deterministically
    # from a resized frame instead (no numpy dependency needed).
    canvas = cv2.resize(frames[0], (cell_w * cols, cell_h * rows)) * 0 + 255

    for i, img in enumerate(frames):
        r, c = divmod(i, cols)
        x0, y0 = c * cell_w, r * cell_h
        canvas[y0:y0 + cell_h, x0:x0 + cell_w] = cv2.resize(img, (cell_w, cell_h))
    return canvas
class VideoSummaryQwenVL:
    """Multi-frame montage → single QwenVL HTTP summary call (task=summary).

    Sampled frames are tiled into one montage image; the service is called
    once on the montage and returns {summary}.

    params:
      - service_url: default http://127.0.0.1:18080
      - timeout_sec: default 180
      - sample_fps: default 1.0
      - max_frames: default 12
      - language: default zh
      - style: default normal
      - max_new_tokens: default 160
      - montage_cell_w: default 384
      - montage_cell_h: default 216
      - montage_max_cols: default 4
    outputs:
      - artifacts/montage.jpg
      - artifacts/summary.json
      - artifacts/frames/*.jpg
    """

    def execute(self, sample, params=None):
        params = params or {}
        video_path = sample["filePath"]
        export_path = sample.get("export_path", "./outputs")

        out_dir = make_run_dir(export_path, "video_summary_qwenvl")
        log_dir = ensure_dir(os.path.join(out_dir, "logs"))
        art_dir = ensure_dir(os.path.join(out_dir, "artifacts"))
        frames_dir = ensure_dir(os.path.join(art_dir, "frames"))
        logger = get_logger("VideoSummaryQwenVL", log_dir)

        service_url = params.get("service_url", "http://127.0.0.1:18080")
        timeout_sec = int(params.get("timeout_sec", 180))
        sample_fps = float(params.get("sample_fps", 1.0))
        max_frames = int(params.get("max_frames", 12))
        # NOTE(review): the next three assignments are unchanged hunk context
        # not visible in the patch; defaults taken from the docstring — confirm.
        language = params.get("language", "zh")
        style = params.get("style", "normal")
        max_new_tokens = int(params.get("max_new_tokens", 160))
        cell_w = int(params.get("montage_cell_w", 384))
        cell_h = int(params.get("montage_cell_h", 216))
        max_cols = int(params.get("montage_max_cols", 4))

        fps, W, H, total_frames = get_video_info(video_path)
        idxs = _sample_frame_indices(total_frames, fps, sample_fps, max_frames)

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: {video_path}")

        frames = []
        evidence = []
        try:
            for idx in idxs:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ok, frame = cap.read()
                if not ok:
                    continue
                frame_jpg = os.path.join(frames_dir, f"{idx:06d}.jpg")
                save_frame_to_jpg(frame, frame_jpg)
                frames.append(frame)
                evidence.append({"frame_idx": idx, "image_path": frame_jpg})
        finally:
            cap.release()

        montage_path = os.path.join(art_dir, "montage.jpg")
        summary = ""

        if frames:
            montage = _make_montage(frames, cell_w=cell_w, cell_h=cell_h, max_cols=max_cols)
            cv2.imwrite(montage_path, montage)

            # Single service call on the combined montage image.
            res = qwenvl_infer_by_image_path(
                image_path=montage_path,
                task="summary",
                service_url=service_url,
                max_new_tokens=max_new_tokens,
                language=language,
                style=style,
                timeout=timeout_sec,
            )
            summary = (res.get("summary") or "").strip()
        else:
            # BUG FIX: with no decodable frames the montage is never written;
            # do not report a path to a file that does not exist.
            montage_path = ""

        out_json = os.path.join(art_dir, "summary.json")
        with open(out_json, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "summary": summary,
                    "service_url": service_url,
                    "sample_fps": sample_fps,
                    "max_frames": max_frames,
                    "language": language,
                    "style": style,
                    "evidence": evidence,
                    "montage": montage_path,
                },
                f,
                ensure_ascii=False,
                indent=2,
            )

        logger.info(f"Done. summary_json={out_json}")
        return {"out_dir": out_dir, "summary_json": out_json, "montage_jpg": montage_path}
def build_paddle_ocr(params, ocr_lang: str, use_angle_cls: bool):
    """Create a PaddleOCR engine from models in the local model repository.

    Expected directories (root overridable via params['ocr_model_dir'],
    relative or absolute):
      <root>/ocr/det, <root>/ocr/rec, <root>/ocr/cls
    Raises RuntimeError when a required model directory is missing, so the
    user downloads the models to the fixed repo path. Version-dependent
    keyword arguments are only forwarded if PaddleOCR.__init__ accepts them.
    """
    ocr_root = resolve_model_path(params, "ocr_model_dir", "ocr")
    det_dir, rec_dir, cls_dir = (os.path.join(ocr_root, d) for d in ("det", "rec", "cls"))

    needed = [det_dir, rec_dir]
    if use_angle_cls:
        needed.append(cls_dir)
    for p in needed:
        if not os.path.exists(p):
            raise RuntimeError(f"PaddleOCR model dir not found: {p}. Please download OCR models into model repo path.")

    supported = inspect.signature(PaddleOCR.__init__).parameters
    kwargs = {"lang": ocr_lang}
    optional = [
        ("use_angle_cls", use_angle_cls, True),
        ("det_model_dir", det_dir, True),
        ("rec_model_dir", rec_dir, True),
        ("cls_model_dir", cls_dir, use_angle_cls),
    ]
    for key, value, enabled in optional:
        if enabled and key in supported:
            kwargs[key] = value

    return PaddleOCR(**kwargs)