# Reconstructed from the (whitespace-mangled) patch: the Excel-import section
# appended to backend/apps/terminology/api/terminology.py.
import hashlib
import os
import uuid

import pandas as pd
# NOTE(review): the patch imported HTTPException from http.client. FastAPI does
# not convert that exception into an HTTP response, so the intended 400 would
# surface as an unhandled 500. It must come from fastapi instead.
from fastapi import File, HTTPException, UploadFile
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlmodel import Session

from apps.chat.models.chat_model import AxisObj
from apps.terminology.curd.terminology import batch_create_terminology
from apps.terminology.models.terminology_model import TerminologyInfo
from common.core.config import settings
from common.core.db import engine
from common.utils.data_format import DataFormat

# Directory where uploaded workbooks and generated error reports are stored.
path = settings.EXCEL_PATH

# Thread-local session factory: the import pipeline runs in a worker thread
# (see asyncio.to_thread below), so the request-scoped session cannot be used.
session_maker = scoped_session(sessionmaker(bind=engine, class_=Session))


@router.post("/uploadExcel")
async def upload_excel(trans: Trans, current_user: CurrentUser, file: UploadFile = File(...)):
    """Import terminologies from an uploaded Excel workbook.

    Every sheet is read; the first five columns are interpreted as
    term / synonyms / description / datasource names / all-datasources flag.
    Rows that fail validation are written to a companion error workbook whose
    filename is returned so the client can download the report afterwards.

    Returns a dict with success/failed/duplicate/original counts plus the
    error workbook filename (None when every row imported cleanly).
    """
    ALLOWED_EXTENSIONS = {"xlsx", "xls"}
    # Validate the real extension (text after the last dot). The original
    # endswith() test also accepted dot-less names like "reportxls", and
    # split('.')[0] / [1] picked the wrong parts for names with extra dots.
    stem, dot_ext = os.path.splitext(file.filename or "")
    ext = dot_ext.lstrip(".").lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(400, "Only support .xlsx/.xls")

    os.makedirs(path, exist_ok=True)
    # Random suffix keeps concurrent uploads of the same filename apart.
    base_filename = f"{stem}_{hashlib.sha256(uuid.uuid4().bytes).hexdigest()[:10]}"
    save_path = os.path.join(path, f"{base_filename}.{ext}")
    with open(save_path, "wb") as f:
        f.write(await file.read())

    oid = current_user.oid
    use_cols = [0, 1, 2, 3, 4]

    def inner():
        # scoped_session hands out a thread-local session; remove() releases
        # it afterwards — the original never did, leaking one connection per
        # upload.
        session = session_maker()
        try:
            sheet_names = pd.ExcelFile(save_path).sheet_names

            import_data = []
            for sheet_name in sheet_names:
                df = pd.read_excel(
                    save_path,
                    sheet_name=sheet_name,
                    engine='calamine',
                    header=0,
                    usecols=use_cols,
                    dtype=str,
                ).fillna("")

                # After fillna("") every cell is a string, so truthiness of the
                # stripped value replaces the original pd.notna checks (always
                # True here) and the dead row.isnull().all() skip, which let
                # fully empty rows through as invalid records.
                for _, row in df.iterrows():
                    cells = [str(c).strip() for c in row.iloc[:5]]
                    if not any(cells):
                        continue  # genuinely empty row

                    word = cells[0] or None
                    other_words = [w.strip() for w in cells[1].split(',')] if cells[1] else []
                    description = cells[2] or None
                    datasource_names = [d.strip() for d in cells[3].split(',')] if cells[3] else []
                    all_datasource = cells[4].lower() in ('y', 'yes', 'true')

                    import_data.append(TerminologyInfo(
                        word=word,
                        description=description,
                        other_words=other_words,
                        datasource_names=datasource_names,
                        specific_ds=not all_datasource,
                    ))

            res = batch_create_terminology(session, import_data, oid, trans)
        finally:
            session_maker.remove()

        failed_records = res['failed_records']
        error_excel_filename = None

        if failed_records:
            data_list = []
            for obj in failed_records:
                info = obj['data']
                data_list.append({
                    "word": info.word,
                    "other_words": ', '.join(info.other_words) if info.other_words else '',
                    "description": info.description,
                    "all_data_sources": 'N' if info.specific_ds else 'Y',
                    "datasource": ', '.join(info.datasource_names)
                    if info.datasource_names and info.specific_ds else '',
                    "errors": obj['errors'],
                })

            fields = [
                AxisObj(name=trans('i18n_terminology.term_name'), value='word'),
                AxisObj(name=trans('i18n_terminology.synonyms'), value='other_words'),
                AxisObj(name=trans('i18n_terminology.term_description'), value='description'),
                AxisObj(name=trans('i18n_terminology.effective_data_sources'), value='datasource'),
                AxisObj(name=trans('i18n_terminology.all_data_sources'), value='all_data_sources'),
                AxisObj(name=trans('i18n_data_training.error_info'), value='errors'),
            ]

            md_data, _fields_list = DataFormat.convert_object_array_for_pandas(fields, data_list)
            err_df = pd.DataFrame(md_data, columns=_fields_list)
            error_excel_filename = f"{base_filename}_error.xlsx"
            # Persist the per-row errors so the client can fetch the report.
            err_df.to_excel(os.path.join(path, error_excel_filename), index=False)

        return {
            'success_count': res['success_count'],
            'failed_count': len(failed_records),
            'duplicate_count': res['duplicate_count'],
            'original_count': res['original_count'],
            'error_excel_filename': error_excel_filename,
        }

    # The whole parse+insert pipeline is blocking; keep it off the event loop.
    return await asyncio.to_thread(inner)
def create_terminology(session: SessionDep, info: TerminologyInfo, oid: int, trans: Trans): + """ + 创建单个术语记录 + """ + # 基本验证 + if not info.word or not info.word.strip(): + raise Exception(trans("i18n_terminology.word_cannot_be_empty")) + + if not info.description or not info.description.strip(): + raise Exception(trans("i18n_terminology.description_cannot_be_empty")) + create_time = datetime.datetime.now() specific_ds = info.specific_ds if info.specific_ds is not None else False @@ -210,16 +220,26 @@ def create_terminology(session: SessionDep, info: TerminologyInfo, oid: int, tra if not datasource_ids: raise Exception(trans("i18n_terminology.datasource_cannot_be_none")) - parent = Terminology(word=info.word, create_time=create_time, description=info.description, oid=oid, - specific_ds=specific_ds, enabled=info.enabled, - datasource_ids=datasource_ids) + parent = Terminology( + word=info.word, + create_time=create_time, + description=info.description, + oid=oid, + specific_ds=specific_ds, + enabled=info.enabled, + datasource_ids=datasource_ids + ) words = [info.word] - for child in info.other_words: - if child in words: + for child_word in info.other_words: + # 先检查是否为空字符串 + if not child_word or child_word.strip() == "": + continue + + if child_word in words: raise Exception(trans("i18n_terminology.cannot_be_repeated")) else: - words.append(child) + words.append(child_word) # 基础查询条件(word 和 oid 必须满足) base_query = and_( @@ -243,7 +263,7 @@ def create_terminology(session: SessionDep, info: TerminologyInfo, oid: int, tra SELECT 1 FROM jsonb_array_elements(datasource_ids) AS elem WHERE elem::text::int = ANY(:datasource_ids) ) - """) # 检查是否包含任意目标值 + """) ) ) ) @@ -255,31 +275,206 @@ def create_terminology(session: SessionDep, info: TerminologyInfo, oid: int, tra if exists: raise Exception(trans("i18n_terminology.exists_in_db")) - result = Terminology(**parent.model_dump()) - session.add(parent) session.flush() session.refresh(parent) - result.id = parent.id - 
def batch_create_terminology(session: SessionDep, info_list: List[TerminologyInfo], oid: int, trans: Trans):
    """Bulk-create terminology records, reusing the single-insert logic.

    Pipeline:
      1. drop in-batch duplicates,
      2. validate each record and resolve datasource names to ids,
      3. insert valid records one by one via create_terminology, collecting
         per-record errors instead of aborting the whole batch.

    Returns a summary dict: success_count, failed_records (each entry holds
    the offending TerminologyInfo and its error messages), duplicate_count,
    original_count and deduplicated_count.
    """
    if not info_list:
        return {
            'success_count': 0,
            'failed_records': [],
            'duplicate_count': 0,
            'original_count': 0,
            'deduplicated_count': 0,
        }

    failed_records = []
    success_count = 0
    inserted_ids = []

    # --- Step 1: in-batch de-duplication -----------------------------------
    # Two rows describe the same terminology when their combined word set
    # (main word + synonyms, case-insensitive, order-ignored), their effective
    # datasource names and their specific_ds flag all match.
    unique_records = {}
    duplicate_records = []
    for info in info_list:
        specific_ds = info.specific_ds if info.specific_ds is not None else False

        word_pool = [info.word.strip().lower()] if info.word else []
        word_pool.extend(w.strip().lower() for w in info.other_words if w and w.strip())

        ds_pool = []
        if specific_ds and info.datasource_names:
            # Datasource names only matter when the term is datasource-specific.
            ds_pool = [d.strip().lower() for d in info.datasource_names if d and d.strip()]

        unique_key = (
            ','.join(sorted(word_pool)),
            ','.join(sorted(ds_pool)),
            str(specific_ds),
        )
        if unique_key in unique_records:
            duplicate_records.append(info)
        else:
            unique_records[unique_key] = info

    deduplicated_list = list(unique_records.values())

    # --- Step 2: validation and datasource-name resolution -----------------
    # Preload the org's datasource name -> id mapping once for the batch.
    datasource_name_to_id = {}
    datasource_stmt = select(CoreDatasource.id, CoreDatasource.name).where(CoreDatasource.oid == oid)
    for ds in session.execute(datasource_stmt).all():
        datasource_name_to_id[ds.name.strip()] = ds.id

    valid_records = []
    for info in deduplicated_list:
        error_messages = []

        if not info.word or not info.word.strip():
            failed_records.append({'data': info, 'errors': [trans("i18n_terminology.word_cannot_be_empty")]})
            continue

        if not info.description or not info.description.strip():
            failed_records.append({'data': info, 'errors': [trans("i18n_terminology.description_cannot_be_empty")]})
            continue

        specific_ds = info.specific_ds if info.specific_ds is not None else False
        datasource_ids = []
        if specific_ds:
            if info.datasource_names:
                for ds_name in info.datasource_names:
                    if not ds_name or not ds_name.strip():
                        continue  # ignore blank datasource cells
                    if ds_name.strip() in datasource_name_to_id:
                        datasource_ids.append(datasource_name_to_id[ds_name.strip()])
                    else:
                        error_messages.append(trans("i18n_terminology.datasource_not_found").format(ds_name))
            if not datasource_ids:
                # A datasource-specific term must resolve at least one datasource.
                error_messages.append(trans("i18n_terminology.datasource_cannot_be_none"))

        # The main word and its synonyms must be pairwise distinct
        # (case-insensitive; blank synonyms are skipped).
        seen_words = [info.word.strip().lower()]
        for other_word in info.other_words or []:
            if not other_word or not other_word.strip():
                continue
            lowered = other_word.strip().lower()
            if lowered in seen_words:
                error_messages.append(trans("i18n_terminology.cannot_be_repeated"))
            else:
                seen_words.append(lowered)

        if error_messages:
            failed_records.append({'data': info, 'errors': error_messages})
            continue

        valid_records.append(TerminologyInfo(
            word=info.word.strip(),
            description=info.description.strip(),
            other_words=[w for w in info.other_words if w and w.strip()],
            datasource_ids=datasource_ids,
            datasource_names=info.datasource_names,
            specific_ds=specific_ds,
            enabled=info.enabled if info.enabled is not None else True,
        ))

    # --- Step 3: insert record by record -----------------------------------
    for info in valid_records:
        try:
            # Reuse the single-record path so all uniqueness checks apply.
            inserted_ids.append(create_terminology(session, info, oid, trans))
            success_count += 1
        except Exception as e:
            # Roll back the failed record and keep going with the rest.
            # NOTE(review): create_terminology commits before computing
            # embeddings, so a failure in that final step leaves the record
            # persisted while it is reported here as failed — confirm intent.
            session.rollback()
            failed_records.append({'data': info, 'errors': [str(e)]})

    return {
        'success_count': success_count,
        'failed_records': failed_records,
        'duplicate_count': len(duplicate_records),
        'original_count': len(info_list),
        'deduplicated_count': len(deduplicated_list),
    }
"description_cannot_be_empty": "Term description cannot be empty", + "datasource_not_found": "Datasource not found" }, "i18n_data_training": { "datasource_cannot_be_none": "Datasource cannot be empty", diff --git a/backend/locales/ko-KR.json b/backend/locales/ko-KR.json index c9c339af..a0743a35 100644 --- a/backend/locales/ko-KR.json +++ b/backend/locales/ko-KR.json @@ -47,7 +47,10 @@ "term_description": "용어 설명", "effective_data_sources": "유효 데이터 소스", "all_data_sources": "모든 데이터 소스", - "synonyms": "동의어" + "synonyms": "동의어", + "word_cannot_be_empty": "용어는 비울 수 없습니다", + "description_cannot_be_empty": "용어 설명은 비울 수 없습니다", + "datasource_not_found": "데이터 소스를 찾을 수 없음" }, "i18n_data_training": { "datasource_cannot_be_none": "데이터 소스는 비울 수 없습니다", diff --git a/backend/locales/zh-CN.json b/backend/locales/zh-CN.json index 577ef8c2..2e136e20 100644 --- a/backend/locales/zh-CN.json +++ b/backend/locales/zh-CN.json @@ -47,7 +47,10 @@ "term_description": "术语描述", "effective_data_sources": "生效数据源", "all_data_sources": "所有数据源", - "synonyms": "同义词" + "synonyms": "同义词", + "word_cannot_be_empty": "术语不能为空", + "description_cannot_be_empty": "术语描述不能为空", + "datasource_not_found": "找不到数据源" }, "i18n_data_training": { "datasource_cannot_be_none": "数据源不能为空", diff --git a/frontend/src/api/professional.ts b/frontend/src/api/professional.ts index 64408d3a..36599f47 100644 --- a/frontend/src/api/professional.ts +++ b/frontend/src/api/professional.ts @@ -16,4 +16,13 @@ export const professionalApi = { responseType: 'blob', requestOptions: { customError: true }, }), + downloadError: (path: any) => + request.post( + `/system/terminology/download-fail-info`, + { file: path }, + { + responseType: 'blob', + requestOptions: { customError: true }, + } + ), } diff --git a/frontend/src/views/system/professional/index.vue b/frontend/src/views/system/professional/index.vue index a1e14229..4d5d4f7a 100644 --- a/frontend/src/views/system/professional/index.vue +++ 
b/frontend/src/views/system/professional/index.vue @@ -12,6 +12,9 @@ import icon_searchOutline_outlined from '@/assets/svg/icon_search-outline_outlin import EmptyBackground from '@/views/dashboard/common/EmptyBackground.vue' import { useI18n } from 'vue-i18n' import { cloneDeep } from 'lodash-es' +import { genFileId, type UploadInstance, type UploadProps, type UploadRawFile } from 'element-plus' +import { trainingApi } from '@/api/training.ts' +import { useCache } from '@/utils/useCache.ts' interface Form { id?: string | null @@ -24,6 +27,7 @@ interface Form { } const { t } = useI18n() +const { wsCache } = useCache() const multipleSelectionAll = ref([]) const allDsList = ref([]) const keywords = ref('') @@ -67,7 +71,91 @@ const cancelDelete = () => { isIndeterminate.value = false } -const uploadExcel = () => {} +const uploadRef = ref() +const uploadLoading = ref(false) + +const token = wsCache.get('user.token') +const headers = ref({ 'X-SQLBOT-TOKEN': `Bearer ${token}` }) +const getUploadURL = import.meta.env.VITE_API_BASE_URL + '/system/terminology/uploadExcel' + +const handleExceed: UploadProps['onExceed'] = (files) => { + uploadRef.value!.clearFiles() + const file = files[0] as UploadRawFile + file.uid = genFileId() + uploadRef.value!.handleStart(file) +} + +const beforeUpload = (rawFile: any) => { + if (rawFile.size / 1024 / 1024 > 50) { + ElMessage.error(t('common.not_exceed_50mb')) + return false + } + uploadLoading.value = true + return true +} +const onSuccess = (response: any) => { + uploadRef.value!.clearFiles() + search() + + if (response?.data?.failed_count > 0 && response?.data?.error_excel_filename) { + ElMessage.error( + t('training.upload_failed', { + success: response.data.success_count, + fail: response.data.failed_count, + fail_info: response.data.error_excel_filename, + }) + ) + trainingApi + .downloadError(response.data.error_excel_filename) + .then((res) => { + const blob = new Blob([res], { + type: 
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + }) + const link = document.createElement('a') + link.href = URL.createObjectURL(blob) + link.download = response.data.error_excel_filename + document.body.appendChild(link) + link.click() + document.body.removeChild(link) + }) + .catch(async (error) => { + if (error.response) { + try { + let text = await error.response.data.text() + try { + text = JSON.parse(text) + } finally { + ElMessage({ + message: text, + type: 'error', + showClose: true, + }) + } + } catch (e) { + console.error('Error processing error response:', e) + } + } else { + console.error('Other error:', error) + ElMessage({ + message: error, + type: 'error', + showClose: true, + }) + } + }) + .finally(() => { + uploadLoading.value = false + }) + } else { + ElMessage.success(t('training.upload_success')) + uploadLoading.value = false + } +} + +const onError = () => { + uploadLoading.value = false + uploadRef.value!.clearFiles() +} const exportExcel = () => { ElMessageBox.confirm(t('professional.all_236_terms', { msg: pageInfo.total }), { @@ -357,7 +445,7 @@ const changeStatus = (id: any, val: any) => {
{{ $t('professional.professional_terminology') }} -
+
{ {{ $t('professional.export_all') }} - - - {{ $t('user.batch_import') }} - + + + + {{ $t('user.batch_import') }} + +