|
| 1 | +import re |
| 2 | +import nltk |
| 3 | +from .heuristics import ( |
| 4 | + SENSATIONALIST_KEYWORDS, |
| 5 | + CLICKBAIT_PATTERNS, |
| 6 | + HEURISTIC_WEIGHTS |
| 7 | +) |
| 8 | + |
def _download_nltk_data():
    """Ensure the NLTK resources this module relies on are available.

    Each missing resource is downloaded quietly; resources already present
    on disk are left untouched.
    """
    # (lookup path, download package) pairs.  The lookup path must point at
    # the resource the package actually installs; otherwise nltk.data.find()
    # keeps raising LookupError and the package is re-downloaded every call.
    # BUG FIX: the original probed 'help/tagsets' (an unrelated resource)
    # for the perceptron tagger, so its presence was never detected.
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
        ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
        ('corpora/words', 'words'),
    )
    for lookup_path, package in required:
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package, quiet=True)

def analyze_text_for_fake_content(text):
    """Analyze a block of text for indicators of fake or misleading content.

    Args:
        text: The text to analyze.  Falsy input (None, "") short-circuits
            to a zero score.

    Returns:
        dict with keys:
            score: heuristic suspicion score, rounded to 2 decimal places.
            indicators_found: human-readable descriptions of each indicator.
            named_entities: best-effort 'organizations'/'persons' lists.
            assessment: "Low"/"Moderate"/"High suspicion" bucket for score.
    """
    _download_nltk_data()
    if not text:
        # BUG FIX: previously this path omitted 'named_entities' and
        # 'assessment', so callers indexing those keys crashed on empty input.
        return {
            "score": 0.0,
            "indicators_found": [],
            "named_entities": {"organizations": [], "persons": []},
            "assessment": "Low suspicion",
        }

    text_lower = text.lower()
    score = 0.0
    indicators_found = []

    # 1. Sensationalist keywords (case-insensitive substring match).
    for keyword in SENSATIONALIST_KEYWORDS:
        if keyword in text_lower:
            score += HEURISTIC_WEIGHTS.get("SENSATIONALIST_KEYWORD", 1.0)
            indicators_found.append(f"Found sensationalist keyword: '{keyword}'")

    # 2. Clickbait regex patterns, matched case-insensitively on raw text.
    for pattern in CLICKBAIT_PATTERNS:
        if re.search(pattern, text, re.IGNORECASE):
            score += HEURISTIC_WEIGHTS.get("CLICKBAIT_PATTERN", 1.5)
            indicators_found.append(f"Found clickbait pattern: '{pattern}'")

    # 3. Excessive punctuation (common in fake news/clickbait).
    # Simplified from r'!!|!!\?|\?{2,}': the '!!\?' alternative was dead code
    # because '!!' always matched first at the same position.
    if re.search(r'!{2,}|\?{2,}', text):
        score += 1.0
        indicators_found.append("Found excessive punctuation (e.g., '!!', '???').")

    # 4. Multiple all-caps words of 4+ letters (short acronyms are ignored).
    all_caps_words = re.findall(r'\b[A-Z]{4,}\b', text)
    if len(all_caps_words) >= 2:
        score += 1.0
        indicators_found.append(f"Found multiple words in all caps: {', '.join(all_caps_words[:3])}...")

    # 5. Named-entity extraction (optional context; does not affect score).
    named_entities = {"organizations": [], "persons": []}
    try:
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)
        # ne_chunk yields a flat mix of (word, tag) pairs and Tree subtrees;
        # only the subtrees are labelled entities.
        for entity in nltk.ne_chunk(tagged):
            if not isinstance(entity, nltk.Tree):
                continue
            entity_text = " ".join(word for word, _tag in entity.leaves())
            if entity.label() == 'ORGANIZATION':
                if entity_text not in named_entities["organizations"]:
                    named_entities["organizations"].append(entity_text)
            elif entity.label() == 'PERSON':
                if entity_text not in named_entities["persons"]:
                    named_entities["persons"].append(entity_text)
    except Exception:
        # Deliberate best-effort: entity extraction is supplementary context
        # and must not break scoring when NLTK data is missing or
        # tokenization/tagging fails for any reason.
        pass

    return {
        "score": round(score, 2),
        "indicators_found": indicators_found,
        "named_entities": named_entities,
        "assessment": (
            "High suspicion" if score > 5.0
            else "Moderate suspicion" if score >= 3.0
            else "Low suspicion"
        ),
    }
0 commit comments