ModelEngine-Group · Dallas98 · Dec 18, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/frontend/src/pages/SynthesisTask/CreateTask.tsx b/frontend/src/pages/SynthesisTask/CreateTask.tsx
diff --git a/runtime/datamate-python/app/common/text_split.py b/runtime/datamate-python/app/common/text_split.py
diff --git a/runtime/datamate-python/app/db/models/data_synthesis.py b/runtime/datamate-python/app/db/models/data_synthesis.py
@@ -1,94 +1,95 @@
 import uuid
-from xml.etree.ElementTree import tostring
 
-from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func
-from sqlalchemy.orm import relationship
+from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, func
 
 from app.db.session import Base
 from app.module.generation.schema.generation import CreateSynthesisTaskRequest
 
 
 async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
-    """保存数据合成任务。"""
-    # 转换为模型实例
+    """Persist a data synthesis task and return the stored instance.
+
+    Note: the current MySQL table `t_data_synth_instances` only has columns such as synth_type /
+    synth_config; it has no model_id, text_split_config, source_file_id or result_data_location
+    columns, so only table-aligned fields are saved; the rest is managed by upper layers or other tables.
+    """
     gid = str(uuid.uuid4())
-    synthesis_task_instance = DataSynthesisInstance(
+
+    # Compatibility with the legacy request structure: pull the required fields from the request,
+    #   - synthesis type: synthesis_type -> synth_type
+    #   - synthesis config: text_split_config + synthesis_config are meant to be merged into synth_config
+    # NOTE(review): the call below reads synthesis_task.synth_config directly; no merge is visible here - confirm.
+    synth_task_instance = DataSynthInstance(
         id=gid,
         name=synthesis_task.name,
         description=synthesis_task.description,
         status="pending",
-        model_id=synthesis_task.model_id,
-        synthesis_type=synthesis_task.synthesis_type.value,
+        synth_type=synthesis_task.synthesis_type.value,
         progress=0,
-        result_data_location=f"/dataset/synthesis_results/{gid}/",
-        text_split_config=synthesis_task.text_split_config.model_dump(),
-        synthesis_config=synthesis_task.synthesis_config.model_dump(),
-        source_file_id=synthesis_task.source_file_id,
-        total_files=len(synthesis_task.source_file_id),
+        synth_config=synthesis_task.synth_config.model_dump(),
+        total_files=len(synthesis_task.source_file_id or []),
         processed_files=0,
         total_chunks=0,
         processed_chunks=0,
-        total_synthesis_data=0,
+        total_synth_data=0,
         created_at=func.now(),
         updated_at=func.now(),
         created_by="system",
-        updated_by="system"
+        updated_by="system",
     )
-    db_session.add(synthesis_task_instance)
+    db_session.add(synth_task_instance)
     await db_session.commit()
-    await db_session.refresh(synthesis_task_instance)
-    return synthesis_task_instance
+    await db_session.refresh(synth_task_instance)
+    return synth_task_instance
 
 
-class DataSynthesisInstance(Base):
-    """数据合成任务表，对应表 t_data_synthesis_instances
+class DataSynthInstance(Base):
+    """Data synthesis task table, mapped to table t_data_synth_instances
 
-    create table if not exists t_data_synthesis_instances
+    create table if not exists t_data_synth_instances
     (
         id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
         name VARCHAR(255) NOT NULL COMMENT '任务名称',
         description TEXT COMMENT '任务描述',
         status VARCHAR(20) COMMENT '任务状态',
-        synthesis_type VARCHAR(20) NOT NULL COMMENT '合成类型',
-        model_id VARCHAR(255) NOT NULL COMMENT '模型ID',
+        synth_type VARCHAR(20) NOT NULL COMMENT '合成类型',
         progress INT DEFAULT 0 COMMENT '任务进度(百分比)',
-        result_data_location VARCHAR(1000) COMMENT '结果数据存储位置',
-        text_split_config JSON NOT NULL COMMENT '文本切片配置',
-        synthesis_config JSON NOT NULL COMMENT '合成配置',
-        source_file_id JSON NOT NULL COMMENT '原始文件ID列表',
+        synth_config JSON NOT NULL COMMENT '合成配置',
         total_files INT DEFAULT 0 COMMENT '总文件数',
         processed_files INT DEFAULT 0 COMMENT '已处理文件数',
         total_chunks INT DEFAULT 0 COMMENT '总文本块数',
         processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
-        total_synthesis_data INT DEFAULT 0 COMMENT '总合成数据量',
+        total_synth_data INT DEFAULT 0 COMMENT '总合成数据量',
         created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
         updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
         created_by VARCHAR(255) COMMENT '创建者',
         updated_by VARCHAR(255) COMMENT '更新者'
     ) COMMENT='数据合成任务表（UUID 主键）';
     """
 
-    __tablename__ = "t_data_synthesis_instances"
+    __tablename__ = "t_data_synth_instances"
 
     id = Column(String(36), primary_key=True, index=True, comment="UUID")
     name = Column(String(255), nullable=False, comment="任务名称")
     description = Column(Text, nullable=True, comment="任务描述")
     status = Column(String(20), nullable=True, comment="任务状态")
-    synthesis_type = Column(String(20), nullable=False, comment="合成类型")
-    model_id = Column(String(255), nullable=False, comment="模型ID")
+    # Keep attribute names aligned with the database columns: synth_type / synth_config
+    synth_type = Column(String(20), nullable=False, comment="合成类型")
     progress = Column(Integer, nullable=False, default=0, comment="任务进度(百分比)")
-    result_data_location = Column(String(1000), nullable=True, comment="结果数据存储位置")
-    text_split_config = Column(JSON, nullable=False, comment="文本切片配置")
-    synthesis_config = Column(JSON, nullable=False, comment="合成配置")
-    source_file_id = Column(JSON, nullable=False, comment="原始文件ID列表")
+    synth_config = Column(JSON, nullable=False, comment="合成配置")
     total_files = Column(Integer, nullable=False, default=0, comment="总文件数")
     processed_files = Column(Integer, nullable=False, default=0, comment="已处理文件数")
     total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
     processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
-    total_synthesis_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
-
-    created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
-    updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=True, comment="更新时间")
+    total_synth_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
+    created_at = Column(TIMESTAMP, nullable=False, default=func.now(), comment="创建时间")
+    updated_at = Column(
+        TIMESTAMP,
+        nullable=False,
+        default=func.now(),
+        onupdate=func.now(),
+        comment="更新时间",
+    )
     created_by = Column(String(255), nullable=True, comment="创建者")
     updated_by = Column(String(255), nullable=True, comment="更新者")
 
@@ -123,7 +124,7 @@ class DataSynthesisFileInstance(Base):
     )
     file_name = Column(String(255), nullable=False, comment="文件名")
     source_file_id = Column(String(255), nullable=False, comment="原始文件ID")
-    target_file_location = Column(String(1000), nullable=False, comment="目标文件存储位置")
+    target_file_location = Column(String(1000), nullable=True, comment="目标文件存储位置")
     status = Column(String(20), nullable=True, comment="任务状态")
     total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
     processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")

diff --git a/runtime/datamate-python/app/module/evaluation/service/evaluation.py b/runtime/datamate-python/app/module/evaluation/service/evaluation.py
@@ -13,7 +13,7 @@
 from app.db.session import AsyncSessionLocal
 from app.module.evaluation.schema.evaluation import SourceType
 from app.module.shared.schema import TaskStatus
-from app.module.shared.util.model_chat import call_openai_style_model, _extract_json_substring
+from app.module.shared.util.model_chat import call_openai_style_model, extract_json_substring
 from app.module.evaluation.schema.prompt import get_prompt
 from app.module.shared.util.structured_file import StructuredFileHandlerFactory
 from app.module.system.service.common_service import get_model_by_id
@@ -36,8 +36,8 @@ def get_eval_prompt(self, item: EvaluationItem) -> str:
                             .replace("{question}", eval_content.get("instruction")))
                            .replace("{answer}", eval_content.get("output")))
         if self.task.task_type == "COT":
-            prompt_text = ((prompt_text.replace("{question}", eval_content.get("question"))
-                            .replace("{conclusion}", eval_content.get("conclusion")))
+            prompt_text = ((prompt_text.replace("{question}", eval_content.get("instruction"))
+                            .replace("{conclusion}", eval_content.get("output")))
                            .replace("{chain_of_thought}", eval_content.get("chain_of_thought")))
         return prompt_text
 
@@ -73,7 +73,7 @@ async def evaluate_item(self, model_config, item: EvaluationItem, semaphore: asy
                     call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
                     prompt_text,
                 )
-                resp_text = _extract_json_substring(resp_text)
+                resp_text = extract_json_substring(resp_text)
                 try:
                     json.loads(resp_text)
                 except Exception as e: