Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
a6c3712
fix(chart): update Helm chart helpers and values for improved configu…
Dallas98 Dec 11, 2025
175cdb9
feat(SynthesisTaskTab): enhance task table with tooltip support and i…
Dallas98 Dec 11, 2025
5970775
feat(CreateTask, SynthFileTask): improve task creation and detail vie…
Dallas98 Dec 11, 2025
f92dd59
feat(SynthFileTask): enhance file display with progress tracking and …
Dallas98 Dec 11, 2025
01e0301
feat(SynthFileTask): enhance file display with progress tracking and …
Dallas98 Dec 11, 2025
a207659
Merge branch 'main' into dev
Dallas98 Dec 11, 2025
951b065
feat(SynthDataDetail): add delete action for chunks with confirmation…
Dallas98 Dec 11, 2025
a7463ca
feat(SynthDataDetail): update edit and delete buttons to icon-only fo…
Dallas98 Dec 11, 2025
61230fa
feat(SynthDataDetail): add confirmation modals for chunk and synthesi…
Dallas98 Dec 11, 2025
937ae72
Merge branch 'refs/heads/main' into dev
Dallas98 Dec 13, 2025
4aaf0fd
feat(DocumentSplitter): add enhanced document splitting functionality…
Dallas98 Dec 13, 2025
2ec96d9
feat(DataSynthesis): refactor data synthesis models and update task h…
Dallas98 Dec 13, 2025
efc32df
feat(DataSynthesis): streamline synthesis task handling and enhance c…
Dallas98 Dec 15, 2025
02ee5f1
Merge branch 'main' into dev
Dallas98 Dec 16, 2025
b58d561
Merge branch 'main' into dev
Dallas98 Dec 17, 2025
59f8319
feat(DataSynthesis): refactor data synthesis models and update task h…
Dallas98 Dec 17, 2025
9775425
Merge branch 'main' into dev
Dallas98 Dec 17, 2025
81d0ed8
fix(generation_service): ensure processed chunks are incremented rega…
Dallas98 Dec 17, 2025
3bcc48c
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
a847317
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
401ae45
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
97f8fb6
feat(CreateTask): enhance task creation with new synthesis templates …
Dallas98 Dec 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
560 changes: 444 additions & 116 deletions frontend/src/pages/SynthesisTask/CreateTask.tsx

Large diffs are not rendered by default.

Empty file.
81 changes: 41 additions & 40 deletions runtime/datamate-python/app/db/models/data_synthesis.py
Original file line number Diff line number Diff line change
@@ -1,94 +1,95 @@
import uuid
from xml.etree.ElementTree import tostring

from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func
from sqlalchemy.orm import relationship
from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, func

from app.db.session import Base
from app.module.generation.schema.generation import CreateSynthesisTaskRequest


async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
"""保存数据合成任务。"""
# 转换为模型实例
"""保存数据合成任务。

注意:当前 MySQL 表 `t_data_synth_instances` 结构中只包含 synth_type / synth_config 等字段,
没有 model_id、text_split_config、source_file_id、result_data_location 等列,因此这里只保存
与表结构一致的字段,其他信息由上层逻辑或其它表负责管理。
"""
gid = str(uuid.uuid4())
synthesis_task_instance = DataSynthesisInstance(

# 兼容旧请求结构:从请求对象中提取必要字段,
# - 合成类型:synthesis_type -> synth_type
# - 合成配置:text_split_config + synthesis_config 合并后写入 synth_config

synth_task_instance = DataSynthInstance(
id=gid,
name=synthesis_task.name,
description=synthesis_task.description,
status="pending",
model_id=synthesis_task.model_id,
synthesis_type=synthesis_task.synthesis_type.value,
synth_type=synthesis_task.synthesis_type.value,
progress=0,
result_data_location=f"/dataset/synthesis_results/{gid}/",
text_split_config=synthesis_task.text_split_config.model_dump(),
synthesis_config=synthesis_task.synthesis_config.model_dump(),
source_file_id=synthesis_task.source_file_id,
total_files=len(synthesis_task.source_file_id),
synth_config=synthesis_task.synth_config.model_dump(),
total_files=len(synthesis_task.source_file_id or []),
processed_files=0,
total_chunks=0,
processed_chunks=0,
total_synthesis_data=0,
total_synth_data=0,
created_at=func.now(),
updated_at=func.now(),
created_by="system",
updated_by="system"
updated_by="system",
)
db_session.add(synthesis_task_instance)
db_session.add(synth_task_instance)
await db_session.commit()
await db_session.refresh(synthesis_task_instance)
return synthesis_task_instance
await db_session.refresh(synth_task_instance)
return synth_task_instance


class DataSynthesisInstance(Base):
"""数据合成任务表,对应表 t_data_synthesis_instances
class DataSynthInstance(Base):
"""数据合成任务表,对应表 t_data_synth_instances

create table if not exists t_data_synthesis_instances
create table if not exists t_data_synth_instances
(
id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
name VARCHAR(255) NOT NULL COMMENT '任务名称',
description TEXT COMMENT '任务描述',
status VARCHAR(20) COMMENT '任务状态',
synthesis_type VARCHAR(20) NOT NULL COMMENT '合成类型',
model_id VARCHAR(255) NOT NULL COMMENT '模型ID',
synth_type VARCHAR(20) NOT NULL COMMENT '合成类型',
progress INT DEFAULT 0 COMMENT '任务进度(百分比)',
result_data_location VARCHAR(1000) COMMENT '结果数据存储位置',
text_split_config JSON NOT NULL COMMENT '文本切片配置',
synthesis_config JSON NOT NULL COMMENT '合成配置',
source_file_id JSON NOT NULL COMMENT '原始文件ID列表',
synth_config JSON NOT NULL COMMENT '合成配置',
total_files INT DEFAULT 0 COMMENT '总文件数',
processed_files INT DEFAULT 0 COMMENT '已处理文件数',
total_chunks INT DEFAULT 0 COMMENT '总文本块数',
processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
total_synthesis_data INT DEFAULT 0 COMMENT '总合成数据量',
total_synth_data INT DEFAULT 0 COMMENT '总合成数据量',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者'
) COMMENT='数据合成任务表(UUID 主键)';
"""

__tablename__ = "t_data_synthesis_instances"
__tablename__ = "t_data_synth_instances"

id = Column(String(36), primary_key=True, index=True, comment="UUID")
name = Column(String(255), nullable=False, comment="任务名称")
description = Column(Text, nullable=True, comment="任务描述")
status = Column(String(20), nullable=True, comment="任务状态")
synthesis_type = Column(String(20), nullable=False, comment="合成类型")
model_id = Column(String(255), nullable=False, comment="模型ID")
# 与数据库字段保持一致:synth_type / synth_config
synth_type = Column(String(20), nullable=False, comment="合成类型")
progress = Column(Integer, nullable=False, default=0, comment="任务进度(百分比)")
result_data_location = Column(String(1000), nullable=True, comment="结果数据存储位置")
text_split_config = Column(JSON, nullable=False, comment="文本切片配置")
synthesis_config = Column(JSON, nullable=False, comment="合成配置")
source_file_id = Column(JSON, nullable=False, comment="原始文件ID列表")
synth_config = Column(JSON, nullable=False, comment="合成配置")
total_files = Column(Integer, nullable=False, default=0, comment="总文件数")
processed_files = Column(Integer, nullable=False, default=0, comment="已处理文件数")
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
total_synthesis_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")

created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=True, comment="更新时间")
total_synth_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
created_at = Column(TIMESTAMP, nullable=False, default=func.now(), comment="创建时间")
updated_at = Column(
TIMESTAMP,
nullable=False,
default=func.now(),
onupdate=func.now(),
comment="更新时间",
)
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")

Expand Down Expand Up @@ -123,7 +124,7 @@ class DataSynthesisFileInstance(Base):
)
file_name = Column(String(255), nullable=False, comment="文件名")
source_file_id = Column(String(255), nullable=False, comment="原始文件ID")
target_file_location = Column(String(1000), nullable=False, comment="目标文件存储位置")
target_file_location = Column(String(1000), nullable=True, comment="目标文件存储位置")
status = Column(String(20), nullable=True, comment="任务状态")
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from app.db.session import AsyncSessionLocal
from app.module.evaluation.schema.evaluation import SourceType
from app.module.shared.schema import TaskStatus
from app.module.shared.util.model_chat import call_openai_style_model, _extract_json_substring
from app.module.shared.util.model_chat import call_openai_style_model, extract_json_substring
from app.module.evaluation.schema.prompt import get_prompt
from app.module.shared.util.structured_file import StructuredFileHandlerFactory
from app.module.system.service.common_service import get_model_by_id
Expand All @@ -36,8 +36,8 @@ def get_eval_prompt(self, item: EvaluationItem) -> str:
.replace("{question}", eval_content.get("instruction")))
.replace("{answer}", eval_content.get("output")))
if self.task.task_type == "COT":
prompt_text = ((prompt_text.replace("{question}", eval_content.get("question"))
.replace("{conclusion}", eval_content.get("conclusion")))
prompt_text = ((prompt_text.replace("{question}", eval_content.get("instruction"))
.replace("{conclusion}", eval_content.get("output")))
.replace("{chain_of_thought}", eval_content.get("chain_of_thought")))
return prompt_text

Expand Down Expand Up @@ -73,7 +73,7 @@ async def evaluate_item(self, model_config, item: EvaluationItem, semaphore: asy
call_openai_style_model, model_config.base_url, model_config.api_key, model_config.model_name,
prompt_text,
)
resp_text = _extract_json_substring(resp_text)
resp_text = extract_json_substring(resp_text)
try:
json.loads(resp_text)
except Exception as e:
Expand Down
Loading