Short-CV penalty, full-text embeddings and deterministic skill ordering.

Summary of the changes below:
  * cv_router / job_router: pass raw_text and its word count to score_cv.
  * nlp_engine: silence pdfminer logging below ERROR, return skills sorted,
    skip links ending in source-file extensions, fall back to half of the
    overall years of experience when a matched skill has no recorded years,
    and subtract a length penalty (20 or 10 points) for short CVs using
    language-dependent word-count thresholds.
  * vector_engine: embed the raw CV text and the JD description,
    requirements and benefits alongside the summary sentence.

Review fixes folded into this revision (hunk line counts unchanged):
  * score_cv no longer raises TypeError when cv_data has no "word_count"
    (None < int), nor AttributeError when "raw_text" is None; the count is
    derived from raw_text when it is absent.
  * The routers normalise a None raw_text to "" before scoring.
  * sorted(found) replaces sorted(list(found)).

Note: the blob ids on the "index" lines come from the original patch, and
the indentation of context lines was restored from a whitespace-collapsed
copy, so verify it against the working tree before applying.

diff --git a/app/routers/cv_router.py b/app/routers/cv_router.py
index 9c56fdc..74c957d 100644
--- a/app/routers/cv_router.py
+++ b/app/routers/cv_router.py
@@ -51,7 +51,7 @@ async def upload_cv_to_pool(
             "is_existing": True
         }
 
-    compressed_text = compress_cv_data(cv_data, cv_data.get("skills", []))
+    compressed_text = compress_cv_data(raw_text, cv_data, cv_data.get("skills", []))
     cv_vector = get_embedding(compressed_text)
 
     pool_record = {
@@ -133,7 +133,9 @@ async def map_cv_to_job(
         "years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0),
         "skill_experience": cv_record["candidate_info"].get("skill_experience", {}),
         "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"),
-        "cv_vector": cv_record.get("cv_vector", [])
+        "cv_vector": cv_record.get("cv_vector", []),
+        "word_count": len((cv_record.get("raw_text", "") or "").split()),
+        "raw_text": cv_record.get("raw_text") or ""
     }
 
     scoring_result = score_cv(cv_data_for_scoring, jd_data)
diff --git a/app/routers/job_router.py b/app/routers/job_router.py
index fd8e94f..303d8ae 100644
--- a/app/routers/job_router.py
+++ b/app/routers/job_router.py
@@ -27,7 +27,9 @@ async def rescore_all_applications_for_job(job_id: str, jd_data: dict, current_h
             "years_of_experience": cv_record["candidate_info"].get("years_of_experience", 0),
             "skill_experience": cv_record["candidate_info"].get("skill_experience", {}),
             "education_level": cv_record["candidate_info"].get("education_level", "Không đề cập"),
-            "cv_vector": cv_record.get("cv_vector", [])
+            "cv_vector": cv_record.get("cv_vector", []),
+            "word_count": len((cv_record.get("raw_text", "") or "").split()),
+            "raw_text": cv_record.get("raw_text") or ""
         }
 
         new_score = score_cv(cv_data_for_scoring, jd_data)
diff --git a/app/services/nlp_engine.py b/app/services/nlp_engine.py
index 2468dbb..2a89716 100644
--- a/app/services/nlp_engine.py
+++ b/app/services/nlp_engine.py
@@ -20,6 +20,7 @@
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+logging.getLogger("pdfminer").setLevel(logging.ERROR)
 
 def load_skills(file_path: str) -> Dict[str, List[str]]:
     skill_map = {}
@@ -67,7 +68,7 @@ def extract_skills(text: str) -> List[str]:
                 found.add(main)
                 break
 
-    return list(found)
+    return sorted(found)
 
 def extract_basic_info(text: str) -> Dict:
     email = re.search(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", text)
@@ -95,6 +96,9 @@ def extract_social_links(text: str) -> dict:
     for match in matches:
         url = match.group(0).rstrip('.,;)]')
 
+        if url.endswith(('.js', '.ts', '.php', '.py', '.html', '.css', '.cpp')):
+            continue
+
         if '@' in url and not url.startswith('http'):
             continue
 
@@ -223,7 +227,7 @@ def get_normalized_skill(raw_skill: str) -> str:
             return root
     return raw_lower
 
-def calculate_skill_score(cv_skills: set, cv_skill_exp: dict, jd_required: list, jd_preferred: list):
+def calculate_skill_score(cv_skills: set, cv_skill_exp: dict, cv_yoe: float, jd_required: list, jd_preferred: list):
     score = 0.0
     total_weight = sum(s.get('weight', 1.0) for s in jd_required) + sum(s.get('weight', 0.5) for s in jd_preferred)
 
@@ -243,6 +247,9 @@ def evaluate_skill(skill_dict, default_weight):
         if norm_name in cv_skills:
             cv_years = cv_skill_exp.get(norm_name, 0.0)
 
+            if cv_years == 0.0 and cv_yoe > 0:
+                cv_years = cv_yoe * 0.5
+
             if req_years > 0:
                 if cv_years >= req_years:
                     bonus = min((cv_years - req_years) * 0.1, 0.2) * weight
@@ -333,7 +340,7 @@ def score_cv(cv_data: dict, jd_data: dict) -> dict:
     jd_vector = jd_data.get("jd_vector", [])
     cv_vector = cv_data.get("cv_vector", [])
 
-    skill_score, matched_skills, missing_required_skills = calculate_skill_score(cv_skills, cv_skill_exp, jd_required_skills, jd_preferred_skills)
+    skill_score, matched_skills, missing_required_skills = calculate_skill_score(cv_skills, cv_skill_exp, cv_yoe, jd_required_skills, jd_preferred_skills)
     experience_score = calculate_experience_score(cv_yoe, jd_min_yoe)
     education_score = calculate_education_score(cv_edu, jd_min_edu)
 
@@ -353,13 +360,35 @@ def score_cv(cv_data: dict, jd_data: dict) -> dict:
 
     total_score = min(100.0, total_score)
 
+    raw_text = (cv_data.get("raw_text") or "").lower()
+    eng_words = [" the ", " and ", " in ", " to ", " of ", " for ", " with "]
+    vie_words = [" và ", " của ", " trong ", " cho ", " với ", " tại ", " là ", " các ", " người "]
+
+    eng_count = sum(raw_text.count(w) for w in eng_words)
+    vie_count = sum(raw_text.count(w) for w in vie_words)
+    is_english = eng_count > vie_count
+
+    threshold_severe = 100 if is_english else 200
+    threshold_light = 150 if is_english else 300
+
+    word_count = cv_data.get("word_count") or len(raw_text.split())
+    penalty_score = 0.0
+
+    if word_count < threshold_severe:
+        penalty_score = 20.0
+    elif word_count < threshold_light:
+        penalty_score = 10.0
+
+    total_score = max(0.0, total_score - penalty_score)
+
     return {
         "total_score": round(total_score, 2),
         "score_breakdown": {
             "skills_score": skill_score,
             "experience_score": experience_score,
             "education_score": education_score,
-            "nlp_score": nlp_score
+            "nlp_score": nlp_score,
+            "penalty_score": penalty_score
         },
         "matched_skills": matched_skills,
         "missing_required_skills": missing_required_skills
diff --git a/app/services/vector_engine.py b/app/services/vector_engine.py
index d4de417..9856283 100644
--- a/app/services/vector_engine.py
+++ b/app/services/vector_engine.py
@@ -6,21 +6,37 @@
 load_dotenv()
 COLAB_API_URL = os.getenv("COLAB_API_URL")
 
-def compress_cv_data(candidate_info: dict, extracted_skills: list) -> str:
+def compress_cv_data(raw_text: str, candidate_info: dict, extracted_skills: list) -> str:
     edu = candidate_info.get("education_level", "Không có thông tin học vấn")
     yoe = candidate_info.get("years_of_experience", 0)
     skills_str = ", ".join(extracted_skills) if extracted_skills else "Không có kỹ năng rõ ràng"
 
-    return f"Ứng viên trình độ {edu}, có {yoe} năm kinh nghiệm làm việc. Kỹ năng chuyên môn bao gồm: {skills_str}."
+    full_cv_context = (
+        f"Thông tin tóm tắt: Trình độ {edu}, {yoe} năm kinh nghiệm. Kỹ năng: {skills_str}.\n"
+        f"Chi tiết Hồ sơ:\n{raw_text}"
+    )
+    return full_cv_context
 
 def compress_jd_data(jd_data: dict) -> str:
     title = jd_data.get("title", "")
     yoe = jd_data.get("min_yoe", 0)
-    edu = jd_data.get("education", {}).get("min_level", "")
+    edu = jd_data.get("education", {}).get("min_level", "Không yêu cầu")
+
     req_skills = [s.get("name") for s in jd_data.get("required_skills", [])]
     skills_str = ", ".join(req_skills) if req_skills else "Không yêu cầu kỹ năng cụ thể"
 
-    return f"Tuyển dụng vị trí {title}. Yêu cầu trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Yêu cầu kỹ năng chuyên môn: {skills_str}."
+    desc = jd_data.get("description", "")
+    reqs = jd_data.get("requirements", "")
+    benefits = jd_data.get("benefits", "")
+
+    full_jd_context = (
+        f"Vị trí tuyển dụng: {title}\n"
+        f"Yêu cầu tối thiểu: Trình độ {edu}, tối thiểu {yoe} năm kinh nghiệm. Kỹ năng: {skills_str}.\n"
+        f"Mô tả công việc:\n{desc}\n"
+        f"Yêu cầu chi tiết:\n{reqs}\n"
+        f"Quyền lợi:\n{benefits}"
+    )
+    return full_jd_context
 
 def get_embedding(text: str) -> list:
     if not COLAB_API_URL: