Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
5ffa075
feat: extend UniversalDocLoader to support additional file formats in…
Dallas98 Feb 12, 2026
ecfa7e6
Merge remote-tracking branch 'origin/main'
Dallas98 Feb 13, 2026
bfdedd5
Merge remote-tracking branch 'origin/main'
Dallas98 Feb 24, 2026
f2f7626
feat: implement RAG module with document loading, splitting, and proc…
Dallas98 Feb 25, 2026
01349c0
feat: implement RAG module with document loading, splitting, and proc…
Dallas98 Feb 25, 2026
77de551
feat: update Milvus configuration and enhance file processing logic i…
Dallas98 Feb 26, 2026
bcad0d1
Merge branch 'main' into feat/rag
Dallas98 Feb 26, 2026
50f48b0
feat: enhance RAG infrastructure with document processing, vector sto…
Dallas98 Feb 26, 2026
cdfea22
feat: add progress tracking for RAG file processing and enhance worke…
Dallas98 Feb 26, 2026
b3fa57b
feat: enhance retrieval service with advanced search ranking and filt…
Dallas98 Feb 28, 2026
d1ade8c
Merge branch 'main' into feat/rag
Dallas98 Mar 2, 2026
592defd
feat: enhance retrieval service with BM25 indexing and improved ranki…
Dallas98 Mar 2, 2026
381761f
feat: enhance retrieval service with BM25 indexing and improved ranki…
Dallas98 Mar 2, 2026
9d6b359
feat: implement Milvus client singleton management and refactor vecto…
Dallas98 Mar 2, 2026
5c37b78
feat: refactor file processing to use async session for database inte…
Dallas98 Mar 2, 2026
d61027f
feat: enhance chunk processing with filtering, cleaning, and batch st…
Dallas98 Mar 2, 2026
cd98c33
feat: enhance API response models with additional fields and configur…
Dallas98 Mar 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
581 changes: 556 additions & 25 deletions .claude/skills/backend-architect/SKILL.md

Large diffs are not rendered by default.

568 changes: 568 additions & 0 deletions .claude/skills/fastapi-templates/SKILL.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,11 @@ const KnowledgeBaseDetailPage: React.FC = () => {
handleKeywordChange,
} = useFetchData<KBFile>(
(params) => id ? queryKnowledgeBaseFilesUsingGet(id, params) : Promise.resolve({ data: [] }),
(file) => mapFileData(file, t)
(file) => mapFileData(file, t),
30000, // 30秒轮询间隔
false, // 不自动轮询
[], // 额外的轮询函数
0 // pageOffset: Python 后端期望 page 从 1 开始,前端 current=1 时传 page=1
);

// File table logic
Expand Down
6 changes: 5 additions & 1 deletion frontend/src/pages/KnowledgeBase/Home/KnowledgeBasePage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ export default function KnowledgeBasePage() {
handleKeywordChange,
} = useFetchData<KnowledgeBaseItem>(
queryKnowledgeBasesUsingPost,
(kb) => mapKnowledgeBase(kb, false, t) // 在首页不显示索引模型和文本理解模型字段
(kb) => mapKnowledgeBase(kb, false, t), // 在首页不显示索引模型和文本理解模型字段
30000, // 30秒轮询间隔
false, // 不自动轮询
[], // 额外的轮询函数
0 // pageOffset: Python 后端期望 page 从 1 开始,前端 current=1 时传 page=1
);

useEffect(() => {
Expand Down
29 changes: 24 additions & 5 deletions frontend/src/pages/KnowledgeBase/knowledge-base.api.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import { get, post, put, del } from "@/utils/request";

// 获取知识库列表
export function queryKnowledgeBasesUsingPost(params: object) {
return post("/api/knowledge-base/list", params);
/**
 * Query the knowledge base list.
 *
 * The frontend pagination field `size` is sent to the backend as `page_size`;
 * every other field of `params` is forwarded unchanged.
 *
 * @param params Query/pagination parameters produced by the caller.
 * @returns Whatever the shared `post` request helper returns.
 */
export function queryKnowledgeBasesUsingPost(params: any) {
  // Pull `size` out so it is not forwarded under its frontend name.
  const { size: pageSize, ...others } = params;
  const payload = { ...others, page_size: pageSize };
  return post("/api/knowledge-base/list", payload);
}

// 创建知识库
Expand All @@ -26,8 +31,22 @@ export function deleteKnowledgeBaseByIdUsingDelete(baseId: string) {
}

// 获取知识生成文件列表
export function queryKnowledgeBaseFilesUsingGet(baseId: string, params?: Record<string, string>) {
return get(`/api/knowledge-base/${baseId}/files${params ? `?${new URLSearchParams(params).toString()}` : ""}`);
/**
 * Query the files that belong to a knowledge base.
 *
 * When `params` is omitted the bare endpoint is requested. Otherwise the
 * frontend fields `page` / `size` are sent as `page` / `page_size`
 * (falling back to 1 and 10 when falsy), remaining fields are forwarded as-is,
 * and entries whose value is `undefined` or `null` are left out of the query
 * string.
 *
 * @param baseId Knowledge base ID interpolated into the URL path.
 * @param params Optional pagination and filter parameters.
 * @returns Whatever the shared `get` request helper returns.
 */
export function queryKnowledgeBaseFilesUsingGet(baseId: string, params?: Record<string, any>) {
  const endpoint = `/api/knowledge-base/${baseId}/files`;
  if (!params) {
    return get(endpoint);
  }

  // `size` is the frontend name; the backend expects `page_size`.
  const { size, page, ...filters } = params;
  const merged: Record<string, any> = {
    page: page || 1,
    page_size: size || 10,
    ...filters,
  };

  const search = new URLSearchParams();
  for (const [key, value] of Object.entries(merged)) {
    if (value === undefined || value === null) {
      continue;
    }
    search.append(key, String(value));
  }
  return get(`${endpoint}?${search.toString()}`);
}

// 添加文件到知识库
Expand Down Expand Up @@ -62,5 +81,5 @@ export function queryKnowledgeBaseFileDetailUsingGet(
) {
const page = params.page ?? 1;
const size = params.size ?? 20;
return get(`/api/knowledge-base/${knowledgeBaseId}/files/${ragFileId}?page=${page}&size=${size}`);
return get(`/api/knowledge-base/${knowledgeBaseId}/files/${ragFileId}?page=${page}&page_size=${size}`);
}
59 changes: 44 additions & 15 deletions frontend/vite.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,54 @@ export default defineConfig({
},
server: {
host: "0.0.0.0",
proxy: {
"^/api": {
target: "http://localhost:8080", // 本地后端服务地址
proxy: (() => {
const pythonProxyConfig = {
target: "http://localhost:18000",
changeOrigin: true,
secure: false,
rewrite: (path) => path.replace(/^\/api/, "/api"),
configure: (proxy, options) => {
// proxy 是 'http-proxy' 的实例
proxy.on("proxyReq", (proxyReq, req, res) => {
// 可以在这里修改请求头
proxyReq.removeHeader("referer");
proxyReq.removeHeader("origin");
configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => {
proxy.on("proxyReq", (proxyReq: unknown) => {
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer");
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin");
});
proxy.on("proxyRes", (proxyRes, req, res) => {
delete proxyRes.headers["set-cookie"];
proxyRes.headers["cookies"] = ""; // 清除 cookies 头
proxy.on("proxyRes", (proxyRes: unknown) => {
const res = proxyRes as { headers: Record<string, unknown> };
delete res.headers["set-cookie"];
res.headers["cookies"] = "";
});
},
},
},
};

const javaProxyConfig = {
target: "http://localhost:8080",
changeOrigin: true,
secure: false,
configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => {
proxy.on("proxyReq", (proxyReq: unknown) => {
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer");
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin");
});
proxy.on("proxyRes", (proxyRes: unknown) => {
const res = proxyRes as { headers: Record<string, unknown> };
delete res.headers["set-cookie"];
res.headers["cookies"] = "";
});
},
};

// Python services: rag, synthesis, annotation, knowledge-base, data-collection, evaluation, models
const pythonPaths = ["rag", "synthesis", "annotation", "knowledge-base", "data-collection", "evaluation", "models"];
// Java services: data-management, operators, cleansing
const javaPaths = ["data-management", "operators", "cleansing"];

const proxy: Record<string, object> = {};
for (const p of pythonPaths) {
proxy[`/api/${p}`] = pythonProxyConfig;
}
for (const p of javaPaths) {
proxy[`/api/${p}`] = javaProxyConfig;
}
return proxy;
})(),
},
});
7 changes: 7 additions & 0 deletions runtime/datamate-python/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,12 @@ def build_database_url(self):

datamate_jwt_enable: bool = False

# Milvus 配置
milvus_uri: str = "http://localhost:19530"
milvus_token: str = ""

# 文件存储配置(共享文件系统)
file_storage_path: str = "/data/files"

# 全局设置实例
settings = Settings()
13 changes: 11 additions & 2 deletions runtime/datamate-python/app/core/exception/codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,17 @@ def __init__(self):
# ========== RAG 模块 ==========
RAG_CONFIG_ERROR: Final = ErrorCode("rag.0001", "RAG configuration error", 400)
RAG_KNOWLEDGE_BASE_NOT_FOUND: Final = ErrorCode("rag.0002", "Knowledge base not found", 404)
RAG_MODEL_NOT_FOUND: Final = ErrorCode("rag.0003", "RAG model not found", 404)
RAG_QUERY_FAILED: Final = ErrorCode("rag.0004", "RAG query failed", 500)
RAG_KNOWLEDGE_BASE_ALREADY_EXISTS: Final = ErrorCode("rag.0003", "Knowledge base already exists", 400)
RAG_KNOWLEDGE_BASE_NAME_INVALID: Final = ErrorCode("rag.0004", "Knowledge base name is invalid", 400)
RAG_FILE_NOT_FOUND: Final = ErrorCode("rag.0005", "RAG file not found", 404)
RAG_FILE_PROCESS_FAILED: Final = ErrorCode("rag.0006", "File processing failed", 500)
RAG_FILE_PARSE_FAILED: Final = ErrorCode("rag.0007", "File parsing failed", 500)
RAG_CHUNK_NOT_FOUND: Final = ErrorCode("rag.0008", "Chunk not found", 404)
RAG_MODEL_NOT_FOUND: Final = ErrorCode("rag.0009", "RAG model not found", 404)
RAG_QUERY_FAILED: Final = ErrorCode("rag.0010", "RAG query failed", 500)
RAG_MILVUS_ERROR: Final = ErrorCode("rag.0011", "Milvus operation failed", 500)
RAG_COLLECTION_NOT_FOUND: Final = ErrorCode("rag.0012", "Milvus collection not found", 404)
RAG_EMBEDDING_FAILED: Final = ErrorCode("rag.0013", "Embedding generation failed", 500)

# ========== 配比模块 ==========
RATIO_TASK_NOT_FOUND: Final = ErrorCode("ratio.0001", "Ratio task not found", 404)
Expand Down
4 changes: 4 additions & 0 deletions runtime/datamate-python/app/db/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
ChunkUploadPreRequest
)

from .knowledge_gen import KnowledgeBase, RagFile

__all__ = [
"Dataset",
"DatasetTag",
Expand All @@ -48,4 +50,6 @@
"CategoryRelation",
"OperatorRelease",
"ChunkUploadPreRequest",
"KnowledgeBase",
"RagFile",
]
2 changes: 1 addition & 1 deletion runtime/datamate-python/app/db/models/base_entity.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from sqlalchemy import Column, String, TIMESTAMP, Text, JSON
from sqlalchemy import Column, String, TIMESTAMP, Text
from sqlalchemy.orm import declarative_base
from sqlalchemy.sql import func

Expand Down
86 changes: 65 additions & 21 deletions runtime/datamate-python/app/db/models/knowledge_gen.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,82 @@
"""
Tables of RAG Management Module
知识库(RAG)相关 ORM 模型

表: t_rag_knowledge_base, t_rag_file
与 Java 实体保持一致。
"""
import uuid
from sqlalchemy import Column, String, TIMESTAMP, Text, Integer, JSON
from sqlalchemy.sql import func
from enum import Enum
from sqlalchemy import Column, String, Integer, JSON
from app.db.models.base_entity import BaseEntity


class RagKnowledgeBase(BaseEntity):
"""知识库模型"""
class RagType(str, Enum):
    """RAG type enumeration.

    Corresponds to the Java type com.datamate.rag.indexer.interfaces.dto.RagType.
    Members are also ``str`` instances, so they compare equal to their raw
    string values.
    """

    DOCUMENT = "DOCUMENT"  # document-based RAG (vector retrieval)
    GRAPH = "GRAPH"        # knowledge-graph-based RAG (LightRAG)


class FileStatus(str, Enum):
    """File processing status enumeration.

    Corresponds to the Java type com.datamate.rag.indexer.domain.model.FileStatus.
    Members are also ``str`` instances, so they compare equal to their raw
    string values.
    """

    UNPROCESSED = "UNPROCESSED"        # not processed yet
    PROCESSING = "PROCESSING"          # processing in progress
    PROCESSED = "PROCESSED"            # processed
    PROCESS_FAILED = "PROCESS_FAILED"  # processing failed


class KnowledgeBase(BaseEntity):
    """Knowledge base entity.

    Corresponds to the Java entity com.datamate.rag.indexer.domain.model.KnowledgeBase.
    Table name: t_rag_knowledge_base
    """

    __tablename__ = "t_rag_knowledge_base"
    # NOTE(review): presumably opts this table out of data-scope filtering —
    # confirm against how BaseEntity / the session layer reads this flag.
    __ignore_data_scope__ = True

    # No column-level default is declared for the primary key here.
    id = Column(String(36), primary_key=True, comment="知识库ID")
    # Knowledge base names are unique at the column level.
    name = Column(String(255), nullable=False, unique=True, comment="知识库名称")
    description = Column(String(512), nullable=True, comment="知识库描述")
    # Stored as a plain string column; defaults to the DOCUMENT RAG type.
    type = Column(
        String(50),
        nullable=False,
        default=RagType.DOCUMENT,
        comment="RAG类型",
    )
    embedding_model = Column(String(255), nullable=False, comment="嵌入模型ID")
    chat_model = Column(String(255), nullable=True, comment="聊天模型ID")

    def __repr__(self):
        """Return a short debug representation with id, name and type."""
        return f"<KnowledgeBase(id={self.id}, name={self.name}, type={self.type})>"


class RagFile(BaseEntity):
    """RAG file entity.

    Corresponds to the Java entity com.datamate.rag.indexer.domain.model.RagFile.
    Table name: t_rag_file
    """

    __tablename__ = "t_rag_file"
    # NOTE(review): presumably opts this table out of data-scope filtering —
    # confirm against how BaseEntity / the session layer reads this flag.
    __ignore_data_scope__ = True

    # No column-level default is declared for the primary key here.
    id = Column(String(36), primary_key=True, comment="RAG文件ID")
    # Indexed column holding the owning knowledge base's ID.
    knowledge_base_id = Column(String(36), nullable=False, index=True, comment="知识库ID")
    file_name = Column(String(512), nullable=False, comment="文件名")
    file_id = Column(String(36), nullable=False, comment="原始文件ID")
    chunk_count = Column(Integer, nullable=True, comment="分块数量")
    # The attribute is named file_metadata, but the database column is "metadata".
    file_metadata = Column("metadata", JSON, nullable=True, comment="元数据(JSON格式)")
    # Stored as a plain string column; new rows default to UNPROCESSED.
    status = Column(
        String(50),
        nullable=False,
        default=FileStatus.UNPROCESSED,
        comment="处理状态",
    )
    err_msg = Column(String(2048), nullable=True, comment="错误信息")
    # Processing progress as an integer; the column comment gives the range as 0-100.
    progress = Column(Integer, default=0, nullable=False, comment="处理进度(0-100)")

    def __repr__(self):
        """Return a short debug representation with id, file name and status."""
        return f"<RagFile(id={self.id}, file_name={self.file_name}, status={self.status})>"
3 changes: 3 additions & 0 deletions runtime/datamate-python/app/module/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .operator.interface import operator_router
from .operator.interface import category_router
from .cleaning.interface import router as cleaning_router
from .rag.interface.knowledge_base import router as knowledge_base_router

router = APIRouter(
prefix="/api"
Expand All @@ -26,4 +27,6 @@
router.include_router(category_router)
router.include_router(cleaning_router)

router.include_router(knowledge_base_router)

__all__ = ["router"]
23 changes: 23 additions & 0 deletions runtime/datamate-python/app/module/rag/infra/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
RAG 基础设施层

提供文档处理、向量存储、嵌入模型和后台任务功能。

使用示例:
from app.module.rag.infra.document import ingest_file_to_chunks
from app.module.rag.infra.vectorstore import VectorStoreFactory
from app.module.rag.infra.task import get_global_pool
"""
from app.module.rag.infra.document import (
SplitOptions,
default_split_options,
ingest_file_to_chunks,
load_and_split,
)

__all__ = [
"load_and_split",
"ingest_file_to_chunks",
"SplitOptions",
"default_split_options",
]
42 changes: 42 additions & 0 deletions runtime/datamate-python/app/module/rag/infra/document/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
文档处理模块

提供文档加载、分块和处理管道功能。

使用示例:
from app.module.rag.infra.document import (
ingest_file_to_chunks,
SplitOptions,
DocumentChunk,
)

chunks = await ingest_file_to_chunks(
"/path/to/doc.pdf",
chunk_size=500,
overlap_size=50,
)
"""
from app.module.rag.infra.document.processor import (
SplitOptions,
default_split_options,
ingest_file_to_chunks,
load_and_split,
)
from app.module.rag.infra.document.types import (
DocumentChunk,
ParsedDocument,
langchain_documents_to_parsed,
)

__all__ = [
# 处理管道入口
"load_and_split",
"ingest_file_to_chunks",
# 选项
"SplitOptions",
"default_split_options",
# 类型
"DocumentChunk",
"ParsedDocument",
"langchain_documents_to_parsed",
]
Loading
Loading