Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions app/routers/cv_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from app.database.models import CVUpdate

from app.services.nlp_engine import extract_text, analyze_cv_text, score_cv
from app.services.vector_engine import compress_cv_data, get_embedding
from app.auth import get_current_user

router = APIRouter(prefix="/api/v1/cv", tags=["CV Processing & Talent Pool"])
Expand Down Expand Up @@ -49,11 +50,15 @@ async def upload_cv_to_pool(
"candidate_email": candidate_email,
"is_existing": True
}

compressed_text = compress_cv_data(cv_data, cv_data.get("skills", []))
cv_vector = get_embedding(compressed_text)

pool_record = {
"hr_email": current_hr,
"filename": file.filename,
"raw_text": raw_text,
"cv_vector": cv_vector,
"candidate_info": {
"email": cv_data.get("email"),
"phone": cv_data.get("phone"),
Expand Down Expand Up @@ -124,11 +129,11 @@ async def map_cv_to_job(
raise HTTPException(status_code=400, detail="Hồ sơ này đã được đưa vào chiến dịch này rồi!")

cv_data_for_scoring = {
"raw_text": cv_record.get("raw_text", ""),
"skills": cv_record.get("extracted_skills", []),
"years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0),
"skill_experience": cv_record["candidate_info"].get("skill_experience", {}),
"education_level": cv_record["candidate_info"].get("education_level", "Không đề cập")
"education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"),
"cv_vector": cv_record.get("cv_vector", [])
}

scoring_result = score_cv(cv_data_for_scoring, jd_data)
Expand Down
22 changes: 18 additions & 4 deletions app/routers/job_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from app.database.config import get_db
from app.database.models import JobCreateEnterprise, JobResponse
from app.services.nlp_engine import score_cv
from app.services.vector_engine import compress_jd_data, get_embedding

router = APIRouter(prefix="/api/v1/jobs", tags=["Job Management & Ranking"])

Expand All @@ -22,11 +23,11 @@ async def rescore_all_applications_for_job(job_id: str, jd_data: dict, current_h
continue

cv_data_for_scoring = {
"raw_text": cv_record.get("raw_text", ""),
"skills": cv_record.get("extracted_skills", []),
"years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0),
"skill_experience": cv_record["candidate_info"].get("skill_experience", {}),
"education_level": cv_record["candidate_info"].get("education_level", "Không đề cập")
"education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"),
"cv_vector": cv_record.get("cv_vector", [])
}

new_score = score_cv(cv_data_for_scoring, jd_data)
Expand All @@ -41,13 +42,18 @@ async def rescore_all_applications_for_job(job_id: str, jd_data: dict, current_h
async def create_job(job: JobCreateEnterprise, current_hr: str = Depends(get_current_user)):
db = get_db()
job_dict = job.model_dump()

compressed_jd = compress_jd_data(job_dict)
jd_vector = get_embedding(compressed_jd)

jd_search_text = f"{job.description} {job.requirements} {job.benefits or ''} {job.other_info or ''}".lower()

job_dict.update({
"created_by": current_hr,
"created_at": datetime.now(timezone.utc),
"status": "open",
"jd_search_text": jd_search_text
"jd_search_text": jd_search_text,
"jd_vector": jd_vector
})

result = await db["hr_jobs"].insert_one(job_dict)
Expand Down Expand Up @@ -84,8 +90,16 @@ async def update_job(
raise HTTPException(status_code=404, detail="Không tìm thấy Job hoặc bạn không có quyền chỉnh sửa")

update_data = job_update.model_dump()

compressed_jd = compress_jd_data(update_data)
new_jd_vector = get_embedding(compressed_jd)

jd_search_text = f"{job_update.description} {job_update.requirements} {job_update.benefits or ''} {job_update.other_info or ''}".lower()
update_data.update({"updated_at": datetime.now(timezone.utc), "jd_search_text": jd_search_text})
update_data.update({
"updated_at": datetime.now(timezone.utc),
"jd_search_text": jd_search_text,
"jd_vector": new_jd_vector
})

await db["hr_jobs"].update_one({"_id": ObjectId(job_id)}, {"$set": update_data})

Expand Down
24 changes: 6 additions & 18 deletions app/services/nlp_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
from fastapi import UploadFile, HTTPException
from fastapi.concurrency import run_in_threadpool

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from app.services.vector_engine import calculate_cosine_similarity
import logging

from datetime import datetime
Expand Down Expand Up @@ -288,19 +287,6 @@ def calculate_experience_score(cv_yoe: int, jd_min_yoe: int) -> float:
ratio = cv_yoe / jd_min_yoe
return round(ratio * 100, 2)

def calculate_nlp_similarity(cv_text: str, jd_text: str) -> float:
    """Score the textual similarity between a CV and a job description.

    Both texts are vectorized together with a TF-IDF vectorizer (English
    stop words removed) fitted on just this pair, and the cosine similarity
    of the two resulting rows is scaled by 100 and rounded to two decimals.

    Args:
        cv_text: Text of the CV.
        jd_text: Text of the job description.

    Returns:
        The scaled similarity, or 0.0 when either text is empty or when
        vectorization/similarity computation raises.
    """
    # Nothing to compare if either side is missing.
    if not (cv_text and jd_text):
        return 0.0

    try:
        # A fresh vectorizer per call: the vocabulary is built from this pair only.
        matrix = TfidfVectorizer(stop_words='english').fit_transform([cv_text, jd_text])
        cv_row = matrix[0:1]
        jd_row = matrix[1:2]
        raw_similarity = cosine_similarity(cv_row, jd_row)[0][0]
        return round(float(raw_similarity) * 100, 2)
    except Exception as e:
        # Best-effort scoring: log the failure and fall back to a zero score.
        logger.error(f"Lỗi khi tính TF-IDF: {str(e)}")
        return 0.0

def calculate_education_score(cv_edu: str, jd_min_edu: str) -> float:
if not jd_min_edu or jd_min_edu.lower() == "không yêu cầu":
return 100.0
Expand Down Expand Up @@ -335,21 +321,23 @@ def score_cv(cv_data: dict, jd_data: dict) -> dict:
jd_required_skills = jd_data.get("required_skills", [])
jd_preferred_skills = jd_data.get("preferred_skills", [])
jd_min_yoe = jd_data.get("min_yoe", 0)
jd_search_text = jd_data.get("jd_search_text", "")

jd_education = jd_data.get("education", {})
jd_min_edu = jd_education.get("min_level", "Không yêu cầu")

cv_text = cv_data.get("raw_text", "")
cv_skills = set(cv_data.get("skills", []))
cv_skill_exp = cv_data.get("skill_experience", {})
cv_yoe = cv_data.get("years_of_experience", 0)
cv_edu = cv_data.get("education_level", "Không đề cập")

jd_vector = jd_data.get("jd_vector", [])
cv_vector = cv_data.get("cv_vector", [])

skill_score, matched_skills, missing_required_skills = calculate_skill_score(cv_skills, cv_skill_exp, jd_required_skills, jd_preferred_skills)
experience_score = calculate_experience_score(cv_yoe, jd_min_yoe)
education_score = calculate_education_score(cv_edu, jd_min_edu)
nlp_score = calculate_nlp_similarity(cv_text, jd_search_text)

nlp_score = calculate_cosine_similarity(cv_vector, jd_vector)

WEIGHT_SKILL = 0.40
WEIGHT_NLP = 0.30
Expand Down
62 changes: 62 additions & 0 deletions app/services/vector_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import logging
import os

import numpy as np
import requests
from dotenv import load_dotenv

# Load settings from the project's .env file (if any) into the process
# environment before the configuration values below are read.
load_dotenv()
# Endpoint of the external embedding service that get_embedding() posts to.
# os.getenv returns None when the variable is not set.
COLAB_API_URL = os.getenv("COLAB_API_URL")

def compress_cv_data(candidate_info: dict, extracted_skills: list) -> str:
    """Summarize a candidate as one short sentence suitable for embedding.

    Args:
        candidate_info: Mapping read for ``education_level`` and
            ``years_of_experience``. A missing key and a key stored as
            ``None`` are treated the same way (the fallback is used).
        extracted_skills: Skill names. ``None``/empty entries are dropped and
            any remaining non-string entry is converted with ``str`` so the
            join can never raise.

    Returns:
        A Vietnamese sentence describing education level, years of
        experience and skills.
    """
    edu = candidate_info.get("education_level")
    if edu is None:
        # `.get(key, default)` would not apply the default to a stored None.
        edu = "Không có thông tin học vấn"

    yoe = candidate_info.get("years_of_experience")
    if yoe is None:
        yoe = 0

    # Keep only usable skill names; str() guards against non-string entries.
    skill_names = [str(skill) for skill in (extracted_skills or []) if skill]
    skills_str = ", ".join(skill_names) if skill_names else "Không có kỹ năng rõ ràng"

    return f"Ứng viên trình độ {edu}, có {yoe} năm kinh nghiệm làm việc. Kỹ năng chuyên môn bao gồm: {skills_str}."

def compress_jd_data(jd_data: dict) -> str:
    """Summarize a job description as one short sentence suitable for embedding.

    Args:
        jd_data: Job mapping read for ``title``, ``min_yoe``,
            ``education["min_level"]`` and ``required_skills``. Each
            required skill is normally a mapping with a ``name`` key; plain
            strings are accepted as well. Keys that are missing or stored as
            ``None`` fall back to empty/zero values instead of raising.

    Returns:
        A Vietnamese sentence describing the position, minimum education,
        minimum years of experience and required skills.
    """
    title = jd_data.get("title") or ""

    yoe = jd_data.get("min_yoe")
    if yoe is None:
        yoe = 0

    # `education` may be present but None (e.g. an unset optional field),
    # in which case `.get(..., {})` would hand back None and `.get` would fail.
    education = jd_data.get("education") or {}
    edu = education.get("min_level") or ""

    req_skills = []
    for skill in jd_data.get("required_skills") or []:
        name = skill.get("name") if isinstance(skill, dict) else skill
        if name:
            # Skip nameless entries; str() keeps the join below safe.
            req_skills.append(str(name))
    skills_str = ", ".join(req_skills) if req_skills else "Không yêu cầu kỹ năng cụ thể"

    return f"Tuyển dụng vị trí {title}. Yêu cầu trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Yêu cầu kỹ năng chuyên môn: {skills_str}."

def get_embedding(text: str) -> list:
    """Request the embedding vector for ``text`` from the embedding service.

    The text is POSTed as ``{"text": text}`` to ``COLAB_API_URL`` with a
    30-second timeout, and the ``embedding`` field of the JSON reply is
    returned.

    This is best-effort: a missing URL, a connection error, a non-200
    status, a non-JSON body, or a reply without a list under ``embedding``
    is logged and reported to the caller as an empty list, never as an
    exception.

    NOTE(review): ``requests.post`` blocks the calling thread; the async
    route handlers that call this may want to run it in a thread pool.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a list, or ``[]`` on any failure.
    """
    logger = logging.getLogger(__name__)

    if not COLAB_API_URL:
        logger.warning("CẢNH BÁO: Chưa cấu hình COLAB_API_URL trong file .env")
        return []

    try:
        response = requests.post(COLAB_API_URL, json={"text": text}, timeout=30)
    except requests.exceptions.RequestException as e:
        logger.error("Lỗi kết nối đến Colab Microservice: %s", e)
        return []

    if response.status_code != 200:
        logger.error("Lỗi từ Colab API: %s", response.text)
        return []

    try:
        payload = response.json()
    except ValueError:
        # A 200 reply whose body is not valid JSON; every JSON decode error
        # raised by requests is a ValueError subclass.
        logger.error("Lỗi từ Colab API: %s", response.text)
        return []

    embedding = payload.get("embedding", []) if isinstance(payload, dict) else None
    if not isinstance(embedding, list):
        # Covers a non-object payload and `"embedding": null`, so callers
        # always receive a list as the signature promises.
        logger.error("Lỗi từ Colab API: %s", response.text)
        return []
    return embedding

def calculate_cosine_similarity(vec1: list, vec2: list) -> float:
    """Return the cosine similarity of two vectors as a score from 0 to 100.

    Inputs are flattened to 1-D before comparison, so a nested
    ``[[...]]`` vector is accepted. Negative similarities are clamped to 0.

    Vectors may come from storage or an external service, so malformed
    input never raises. The result is 0.0 when either vector is ``None``,
    empty, non-numeric or ragged, when the lengths differ, when either
    norm is zero, or when the result is not finite (NaN/inf components).

    Args:
        vec1: First vector (list or array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The similarity multiplied by 100, clamped to [0, 100] and rounded
        to two decimals.
    """
    if vec1 is None or vec2 is None:
        return 0.0

    try:
        # dtype=float rejects non-numeric/ragged data here rather than
        # failing later inside np.dot.
        v1 = np.asarray(vec1, dtype=float).ravel()
        v2 = np.asarray(vec2, dtype=float).ravel()
    except (TypeError, ValueError):
        return 0.0

    if v1.size == 0 or v2.size == 0 or v1.size != v2.size:
        return 0.0

    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    if norm_v1 == 0 or norm_v2 == 0:
        # Cosine similarity is undefined for a zero vector.
        return 0.0

    similarity = float(np.dot(v1, v2) / (norm_v1 * norm_v2))
    if not np.isfinite(similarity):
        return 0.0

    # Clamp: opposite vectors score 0, and float error cannot exceed 100.
    score = min(100.0, max(0.0, similarity * 100))
    return round(score, 2)
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ motor
pydantic[email]
dnspython
python-dotenv
scikit-learn
PyJWT
passlib[bcrypt]
bcrypt==4.0.1
httpx
requests
requests
numpy
Loading