From d4e596ae3f2e493be01cf8d31ee6ab2be244368b Mon Sep 17 00:00:00 2001 From: Thai Hua Date: Sun, 26 Apr 2026 19:53:29 +0700 Subject: [PATCH 1/3] fix(nlp): resolve cv_yoe scope & prevent vector semantic dilution --- app/services/nlp_engine.py | 8 ++++++-- app/services/vector_engine.py | 7 ++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/app/services/nlp_engine.py b/app/services/nlp_engine.py index 2468dbb..6a09a39 100644 --- a/app/services/nlp_engine.py +++ b/app/services/nlp_engine.py @@ -20,6 +20,7 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +logging.getLogger("pdfminer").setLevel(logging.ERROR) def load_skills(file_path: str) -> Dict[str, List[str]]: skill_map = {} @@ -223,7 +224,7 @@ def get_normalized_skill(raw_skill: str) -> str: return root return raw_lower -def calculate_skill_score(cv_skills: set, cv_skill_exp: dict, jd_required: list, jd_preferred: list): +def calculate_skill_score(cv_skills: set, cv_skill_exp: dict, cv_yoe: float, jd_required: list, jd_preferred: list): score = 0.0 total_weight = sum(s.get('weight', 1.0) for s in jd_required) + sum(s.get('weight', 0.5) for s in jd_preferred) @@ -243,6 +244,9 @@ def evaluate_skill(skill_dict, default_weight): if norm_name in cv_skills: cv_years = cv_skill_exp.get(norm_name, 0.0) + if cv_years == 0.0 and cv_yoe > 0: + cv_years = cv_yoe * 0.5 + if req_years > 0: if cv_years >= req_years: bonus = min((cv_years - req_years) * 0.1, 0.2) * weight @@ -333,7 +337,7 @@ def score_cv(cv_data: dict, jd_data: dict) -> dict: jd_vector = jd_data.get("jd_vector", []) cv_vector = cv_data.get("cv_vector", []) - skill_score, matched_skills, missing_required_skills = calculate_skill_score(cv_skills, cv_skill_exp, jd_required_skills, jd_preferred_skills) + skill_score, matched_skills, missing_required_skills = calculate_skill_score(cv_skills, cv_skill_exp, cv_yoe, jd_required_skills, jd_preferred_skills) experience_score = calculate_experience_score(cv_yoe, jd_min_yoe) education_score = calculate_education_score(cv_edu, jd_min_edu) diff --git a/app/services/vector_engine.py b/app/services/vector_engine.py index d4de417..7f9202e 100644 --- a/app/services/vector_engine.py +++ b/app/services/vector_engine.py @@ -11,16 +11,17 @@ def compress_cv_data(candidate_info: dict, extracted_skills: list) -> str: yoe = candidate_info.get("years_of_experience", 0) skills_str = ", ".join(extracted_skills) if extracted_skills else "Không có kỹ năng rõ ràng" - return f"Ứng viên trình độ {edu}, có {yoe} năm kinh nghiệm làm việc. Kỹ năng chuyên môn bao gồm: {skills_str}." + return f"Ứng viên trình độ {edu}, có {yoe} năm kinh nghiệm làm việc. Kỹ năng chuyên môn: {skills_str}." def compress_jd_data(jd_data: dict) -> str: title = jd_data.get("title", "") yoe = jd_data.get("min_yoe", 0) - edu = jd_data.get("education", {}).get("min_level", "") + edu = jd_data.get("education", {}).get("min_level", "Không yêu cầu") + req_skills = [s.get("name") for s in jd_data.get("required_skills", [])] skills_str = ", ".join(req_skills) if req_skills else "Không yêu cầu kỹ năng cụ thể" - return f"Tuyển dụng vị trí {title}. Yêu cầu trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Yêu cầu kỹ năng chuyên môn: {skills_str}." + return f"Tuyển dụng vị trí {title}. Yêu cầu trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Kỹ năng chuyên môn: {skills_str}." def get_embedding(text: str) -> list: if not COLAB_API_URL: From 73150932f0c63eaec33ef2d1789ef63e2ec1d2df Mon Sep 17 00:00:00 2001 From: Thai Hua Date: Sun, 26 Apr 2026 22:16:20 +0700 Subject: [PATCH 2/3] feat(nlp): implement tiered penalty for spammy CVs based on word count --- app/routers/cv_router.py | 3 ++- app/routers/job_router.py | 3 ++- app/services/nlp_engine.py | 13 ++++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/app/routers/cv_router.py b/app/routers/cv_router.py index 9c56fdc..912a517 100644 --- a/app/routers/cv_router.py +++ b/app/routers/cv_router.py @@ -133,7 +133,8 @@ async def map_cv_to_job( "years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0), "skill_experience": cv_record["candidate_info"].get("skill_experience", {}), "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"), - "cv_vector": cv_record.get("cv_vector", []) + "cv_vector": cv_record.get("cv_vector", []), + "word_count": len((cv_record.get("raw_text", "") or "").split()) } scoring_result = score_cv(cv_data_for_scoring, jd_data) diff --git a/app/routers/job_router.py b/app/routers/job_router.py index fd8e94f..00f82db 100644 --- a/app/routers/job_router.py +++ b/app/routers/job_router.py @@ -27,7 +27,8 @@ async def rescore_all_applications_for_job(job_id: str, jd_data: dict, current_h "years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0), "skill_experience": cv_record["candidate_info"].get("skill_experience", {}), "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"), - "cv_vector": cv_record.get("cv_vector", []) + "cv_vector": cv_record.get("cv_vector", []), + "word_count": len((cv_record.get("raw_text", "") or "").split()) } new_score = score_cv(cv_data_for_scoring, jd_data) diff --git a/app/services/nlp_engine.py b/app/services/nlp_engine.py index 6a09a39..af02d29 100644 --- a/app/services/nlp_engine.py +++ b/app/services/nlp_engine.py @@ -357,13 +357,24 @@ def score_cv(cv_data: dict, jd_data: dict) -> dict: total_score = min(100.0, total_score) + word_count = cv_data.get("word_count", 500) + penalty_score = 0.0 + + if word_count < 200: + penalty_score = 20.0 + elif word_count < 300: + penalty_score = 10.0 + + total_score = max(0.0, total_score - penalty_score) + return { "total_score": round(total_score, 2), "score_breakdown": { "skills_score": skill_score, "experience_score": experience_score, "education_score": education_score, - "nlp_score": nlp_score + "nlp_score": nlp_score, + "penalty_score": penalty_score }, "matched_skills": matched_skills, "missing_required_skills": missing_required_skills From 1751f4fe830490814c9868cef9a75af206a4f07b Mon Sep 17 00:00:00 2001 From: Thai Hua Date: Mon, 27 Apr 2026 11:37:54 +0700 Subject: [PATCH 3/3] feat(ai): maximize BGE-M3 context & implement adaptive NLP penalty --- app/routers/cv_router.py | 5 +++-- app/routers/job_router.py | 3 ++- app/services/nlp_engine.py | 22 ++++++++++++++++++---- app/services/vector_engine.py | 21 ++++++++++++++++++--- 4 files changed, 41 insertions(+), 10 deletions(-) diff --git a/app/routers/cv_router.py b/app/routers/cv_router.py index 912a517..74c957d 100644 --- a/app/routers/cv_router.py +++ b/app/routers/cv_router.py @@ -51,7 +51,7 @@ async def upload_cv_to_pool( "is_existing": True } - compressed_text = compress_cv_data(cv_data, cv_data.get("skills", [])) + compressed_text = compress_cv_data(raw_text, cv_data, cv_data.get("skills", [])) cv_vector = get_embedding(compressed_text) pool_record = { @@ -134,7 +134,8 @@ async def map_cv_to_job( "skill_experience": cv_record["candidate_info"].get("skill_experience", {}), "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"), "cv_vector": cv_record.get("cv_vector", []), - "word_count": len((cv_record.get("raw_text", "") or "").split()) + "word_count": len((cv_record.get("raw_text", "") or "").split()), + "raw_text": cv_record.get("raw_text", "") } scoring_result = score_cv(cv_data_for_scoring, jd_data) diff --git a/app/routers/job_router.py b/app/routers/job_router.py index 00f82db..303d8ae 100644 --- a/app/routers/job_router.py +++ b/app/routers/job_router.py @@ -28,7 +28,8 @@ async def rescore_all_applications_for_job(job_id: str, jd_data: dict, current_h "skill_experience": cv_record["candidate_info"].get("skill_experience", {}), "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"), "cv_vector": cv_record.get("cv_vector", []), - "word_count": len((cv_record.get("raw_text", "") or "").split()) + "word_count": len((cv_record.get("raw_text", "") or "").split()), + "raw_text": cv_record.get("raw_text", "") } new_score = score_cv(cv_data_for_scoring, jd_data) diff --git a/app/services/nlp_engine.py b/app/services/nlp_engine.py index af02d29..2a89716 100644 --- a/app/services/nlp_engine.py +++ b/app/services/nlp_engine.py @@ -68,7 +68,7 @@ def extract_skills(text: str) -> List[str]: found.add(main) break - return list(found) + return sorted(list(found)) def extract_basic_info(text: str) -> Dict: email = re.search(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", text) @@ -96,6 +96,9 @@ def extract_social_links(text: str) -> dict: for match in matches: url = match.group(0).rstrip('.,;)]') + if url.endswith(('.js', '.ts', '.php', '.py', '.html', '.css', '.cpp')): + continue + if '@' in url and not url.startswith('http'): continue @@ -357,12 +360,23 @@ def score_cv(cv_data: dict, jd_data: dict) -> dict: total_score = min(100.0, total_score) - word_count = cv_data.get("word_count", 500) + raw_text = cv_data.get("raw_text", "").lower() + eng_words = [" the ", " and ", " in ", " to ", " of ", " for ", " with "] + vie_words = [" và ", " của ", " trong ", " cho ", " với ", " tại ", " là ", " các ", " người "] + + eng_count = sum(raw_text.count(w) for w in eng_words) + vie_count = sum(raw_text.count(w) for w in vie_words) + is_english = eng_count > vie_count + + threshold_severe = 100 if is_english else 200 + threshold_light = 150 if is_english else 300 + + word_count = cv_data.get("word_count") penalty_score = 0.0 - if word_count < 200: + if word_count < threshold_severe: penalty_score = 20.0 - elif word_count < 300: + elif word_count < threshold_light: penalty_score = 10.0 total_score = max(0.0, total_score - penalty_score) diff --git a/app/services/vector_engine.py b/app/services/vector_engine.py index 7f9202e..9856283 100644 --- a/app/services/vector_engine.py +++ b/app/services/vector_engine.py @@ -6,12 +6,16 @@ load_dotenv() COLAB_API_URL = os.getenv("COLAB_API_URL") -def compress_cv_data(candidate_info: dict, extracted_skills: list) -> str: +def compress_cv_data(raw_text: str, candidate_info: dict, extracted_skills: list) -> str: edu = candidate_info.get("education_level", "Không có thông tin học vấn") yoe = candidate_info.get("years_of_experience", 0) skills_str = ", ".join(extracted_skills) if extracted_skills else "Không có kỹ năng rõ ràng" - return f"Ứng viên trình độ {edu}, có {yoe} năm kinh nghiệm làm việc. Kỹ năng chuyên môn: {skills_str}." + full_cv_context = ( + f"Thông tin tóm tắt: Trình độ {edu}, {yoe} năm kinh nghiệm. Kỹ năng: {skills_str}.\n" + f"Chi tiết Hồ sơ:\n{raw_text}" + ) + return full_cv_context def compress_jd_data(jd_data: dict) -> str: title = jd_data.get("title", "") @@ -21,7 +25,18 @@ def compress_jd_data(jd_data: dict) -> str: req_skills = [s.get("name") for s in jd_data.get("required_skills", [])] skills_str = ", ".join(req_skills) if req_skills else "Không yêu cầu kỹ năng cụ thể" - return f"Tuyển dụng vị trí {title}. Yêu cầu trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Kỹ năng chuyên môn: {skills_str}." + desc = jd_data.get("description", "") + reqs = jd_data.get("requirements", "") + benefits = jd_data.get("benefits", "") + + full_jd_context = ( + f"Vị trí tuyển dụng: {title}\n" + f"Yêu cầu tối thiểu: Trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Kỹ năng: {skills_str}.\n" + f"Mô tả công việc:\n{desc}\n" + f"Yêu cầu chi tiết:\n{reqs}\n" + f"Quyền lợi:\n{benefits}" + ) + return full_jd_context def get_embedding(text: str) -> list: if not COLAB_API_URL: