From 34b8988a6d556f74e8091e3e28d26fcaf1cdfc40 Mon Sep 17 00:00:00 2001 From: Thai Hua Date: Tue, 14 Apr 2026 18:09:05 +0700 Subject: [PATCH] chore: optimize requirements for Render deployment (remove sklearn, keep numpy & requests for Colab MiniLM microservice) --- app/routers/cv_router.py | 9 +++-- app/routers/job_router.py | 22 ++++++++++--- app/services/nlp_engine.py | 24 ++++---------- app/services/vector_engine.py | 62 +++++++++++++++++++++++++++++++++++ requirements.txt | 4 +-- 5 files changed, 95 insertions(+), 26 deletions(-) create mode 100644 app/services/vector_engine.py diff --git a/app/routers/cv_router.py b/app/routers/cv_router.py index 2b38dc7..9c56fdc 100644 --- a/app/routers/cv_router.py +++ b/app/routers/cv_router.py @@ -7,6 +7,7 @@ from app.database.models import CVUpdate from app.services.nlp_engine import extract_text, analyze_cv_text, score_cv +from app.services.vector_engine import compress_cv_data, get_embedding from app.auth import get_current_user router = APIRouter(prefix="/api/v1/cv", tags=["CV Processing & Talent Pool"]) @@ -49,11 +50,15 @@ async def upload_cv_to_pool( "candidate_email": candidate_email, "is_existing": True } + + compressed_text = compress_cv_data(cv_data, cv_data.get("skills", [])) + cv_vector = get_embedding(compressed_text) pool_record = { "hr_email": current_hr, "filename": file.filename, "raw_text": raw_text, + "cv_vector": cv_vector, "candidate_info": { "email": cv_data.get("email"), "phone": cv_data.get("phone"), @@ -124,11 +129,11 @@ async def map_cv_to_job( raise HTTPException(status_code=400, detail="Hồ sơ này đã được đưa vào chiến dịch này rồi!") cv_data_for_scoring = { - "raw_text": cv_record.get("raw_text", ""), "skills": cv_record.get("extracted_skills", []), "years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0), "skill_experience": cv_record["candidate_info"].get("skill_experience", {}), - "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập") + "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"), + "cv_vector": cv_record.get("cv_vector", []) } scoring_result = score_cv(cv_data_for_scoring, jd_data) diff --git a/app/routers/job_router.py b/app/routers/job_router.py index 219c058..fd8e94f 100644 --- a/app/routers/job_router.py +++ b/app/routers/job_router.py @@ -7,6 +7,7 @@ from app.database.config import get_db from app.database.models import JobCreateEnterprise, JobResponse from app.services.nlp_engine import score_cv +from app.services.vector_engine import compress_jd_data, get_embedding router = APIRouter(prefix="/api/v1/jobs", tags=["Job Management & Ranking"]) @@ -22,11 +23,11 @@ async def rescore_all_applications_for_job(job_id: str, jd_data: dict, current_h continue cv_data_for_scoring = { - "raw_text": cv_record.get("raw_text", ""), "skills": cv_record.get("extracted_skills", []), "years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0), "skill_experience": cv_record["candidate_info"].get("skill_experience", {}), - "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập") + "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"), + "cv_vector": cv_record.get("cv_vector", []) } new_score = score_cv(cv_data_for_scoring, jd_data) @@ -41,13 +42,18 @@ async def rescore_all_applications_for_job(job_id: str, jd_data: dict, current_h async def create_job(job: JobCreateEnterprise, current_hr: str = Depends(get_current_user)): db = get_db() job_dict = job.model_dump() + + compressed_jd = compress_jd_data(job_dict) + jd_vector = get_embedding(compressed_jd) + jd_search_text = f"{job.description} {job.requirements} {job.benefits or ''} {job.other_info or ''}".lower() job_dict.update({ "created_by": current_hr, "created_at": datetime.now(timezone.utc), "status": "open", - "jd_search_text": jd_search_text + "jd_search_text": jd_search_text, + "jd_vector": jd_vector }) result = await db["hr_jobs"].insert_one(job_dict) @@ -84,8 +90,16 @@ async def update_job( raise HTTPException(status_code=404, detail="Không tìm thấy Job hoặc bạn không có quyền chỉnh sửa") update_data = job_update.model_dump() + + compressed_jd = compress_jd_data(update_data) + new_jd_vector = get_embedding(compressed_jd) + jd_search_text = f"{job_update.description} {job_update.requirements} {job_update.benefits or ''} {job_update.other_info or ''}".lower() - update_data.update({"updated_at": datetime.now(timezone.utc), "jd_search_text": jd_search_text}) + update_data.update({ + "updated_at": datetime.now(timezone.utc), + "jd_search_text": jd_search_text, + "jd_vector": new_jd_vector + }) await db["hr_jobs"].update_one({"_id": ObjectId(job_id)}, {"$set": update_data}) diff --git a/app/services/nlp_engine.py b/app/services/nlp_engine.py index 36bcc37..2468dbb 100644 --- a/app/services/nlp_engine.py +++ b/app/services/nlp_engine.py @@ -10,8 +10,7 @@ from fastapi import UploadFile, HTTPException from fastapi.concurrency import run_in_threadpool -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.metrics.pairwise import cosine_similarity +from app.services.vector_engine import calculate_cosine_similarity import logging from datetime import datetime @@ -288,19 +287,6 @@ def calculate_experience_score(cv_yoe: int, jd_min_yoe: int) -> float: ratio = cv_yoe / jd_min_yoe return round(ratio * 100, 2) -def calculate_nlp_similarity(cv_text: str, jd_text: str) -> float: - if not cv_text or not jd_text: - return 0.0 - - try: - local_vectorizer = TfidfVectorizer(stop_words='english') - tfidf_matrix = local_vectorizer.fit_transform([cv_text, jd_text]) - similarity_score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] - return round(float(similarity_score) * 100, 2) - except Exception as e: - logger.error(f"Lỗi khi tính TF-IDF: {str(e)}") - return 0.0 - def calculate_education_score(cv_edu: str, jd_min_edu: str) -> float: if not jd_min_edu or jd_min_edu.lower() == "không yêu cầu": return 100.0 @@ -335,21 +321,23 @@ def score_cv(cv_data: dict, jd_data: dict) -> dict: jd_required_skills = jd_data.get("required_skills", []) jd_preferred_skills = jd_data.get("preferred_skills", []) jd_min_yoe = jd_data.get("min_yoe", 0) - jd_search_text = jd_data.get("jd_search_text", "") jd_education = jd_data.get("education", {}) jd_min_edu = jd_education.get("min_level", "Không yêu cầu") - cv_text = cv_data.get("raw_text", "") cv_skills = set(cv_data.get("skills", [])) cv_skill_exp = cv_data.get("skill_experience", {}) cv_yoe = cv_data.get("years_of_experience", 0) cv_edu = cv_data.get("education_level", "Không đề cập") + jd_vector = jd_data.get("jd_vector", []) + cv_vector = cv_data.get("cv_vector", []) + skill_score, matched_skills, missing_required_skills = calculate_skill_score(cv_skills, cv_skill_exp, jd_required_skills, jd_preferred_skills) experience_score = calculate_experience_score(cv_yoe, jd_min_yoe) education_score = calculate_education_score(cv_edu, jd_min_edu) - nlp_score = calculate_nlp_similarity(cv_text, jd_search_text) + + nlp_score = calculate_cosine_similarity(cv_vector, jd_vector) WEIGHT_SKILL = 0.40 WEIGHT_NLP = 0.30 diff --git a/app/services/vector_engine.py b/app/services/vector_engine.py new file mode 100644 index 0000000..d4de417 --- /dev/null +++ b/app/services/vector_engine.py @@ -0,0 +1,62 @@ +import os +import requests +import numpy as np +from dotenv import load_dotenv + +load_dotenv() +COLAB_API_URL = os.getenv("COLAB_API_URL") + +def compress_cv_data(candidate_info: dict, extracted_skills: list) -> str: + edu = candidate_info.get("education_level", "Không có thông tin học vấn") + yoe = candidate_info.get("years_of_experience", 0) + skills_str = ", ".join(extracted_skills) if extracted_skills else "Không có kỹ năng rõ ràng" + + return f"Ứng viên trình độ {edu}, có {yoe} năm kinh nghiệm làm việc. Kỹ năng chuyên môn bao gồm: {skills_str}." + +def compress_jd_data(jd_data: dict) -> str: + title = jd_data.get("title", "") + yoe = jd_data.get("min_yoe", 0) + edu = jd_data.get("education", {}).get("min_level", "") + req_skills = [s.get("name") for s in jd_data.get("required_skills", [])] + skills_str = ", ".join(req_skills) if req_skills else "Không yêu cầu kỹ năng cụ thể" + + return f"Tuyển dụng vị trí {title}. Yêu cầu trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Yêu cầu kỹ năng chuyên môn: {skills_str}." + +def get_embedding(text: str) -> list: + if not COLAB_API_URL: + print("CẢNH BÁO: Chưa cấu hình COLAB_API_URL trong file .env") + return [] + + try: + response = requests.post(COLAB_API_URL, json={"text": text}, timeout=30) + + if response.status_code == 200: + return response.json().get("embedding", []) + else: + print(f"Lỗi từ Colab API: {response.text}") + return [] + + except requests.exceptions.RequestException as e: + print(f"Lỗi kết nối đến Colab Microservice: {e}") + return [] + +def calculate_cosine_similarity(vec1: list, vec2: list) -> float: + if not vec1 or not vec2: + return 0.0 + + v1 = np.array(vec1).flatten() + v2 = np.array(vec2).flatten() + + if len(v1) == 0 or len(v2) == 0 or len(v1) != len(v2): + return 0.0 + + dot_product = np.dot(v1, v2) + norm_v1 = np.linalg.norm(v1) + norm_v2 = np.linalg.norm(v2) + + if norm_v1 == 0 or norm_v2 == 0: + return 0.0 + + similarity = dot_product / (norm_v1 * norm_v2) + score = max(0.0, float(similarity) * 100) + return round(score, 2) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4e2d362..ce0cfaf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,9 +7,9 @@ motor pydantic[email] dnspython python-dotenv -scikit-learn PyJWT passlib[bcrypt] bcrypt==4.0.1 httpx -requests \ No newline at end of file +requests +numpy \ No newline at end of file