|
| 1 | +import re |
| 2 | +import nltk |
| 3 | +from .heuristics import ( |
| 4 | + SENSATIONALIST_KEYWORDS, |
| 5 | + CLICKBAIT_PATTERNS, |
| 6 | + HEURISTIC_WEIGHTS |
| 7 | +) |
| 8 | + |
def _download_nltk_data():
    """Ensure the NLTK resources this module relies on are available.

    Each missing resource is downloaded quietly; resources already present
    on disk are left untouched.
    """
    # (lookup path, download package) pairs.  The lookup path must point at
    # the resource the package actually installs; otherwise nltk.data.find()
    # keeps raising LookupError and the package is re-downloaded every call.
    # BUG FIX: the original probed 'help/tagsets' (an unrelated resource)
    # for the perceptron tagger, so its presence was never detected.
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
        ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
        ('corpora/words', 'words'),
    )
    for lookup_path, package in required:
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package, quiet=True)

def analyze_text_for_fake_content(text):
    """Analyze a block of text for indicators of fake or misleading content.

    Args:
        text: The text to analyze.  Falsy input (None, "") short-circuits
            to a zero score.

    Returns:
        dict with keys:
            score: heuristic suspicion score, rounded to 2 decimal places.
            indicators_found: human-readable descriptions of each indicator.
            named_entities: best-effort 'organizations'/'persons' lists.
            assessment: "Low"/"Moderate"/"High suspicion" bucket for score.
    """
    _download_nltk_data()
    if not text:
        # BUG FIX: previously this path omitted 'named_entities' and
        # 'assessment', so callers indexing those keys crashed on empty input.
        return {
            "score": 0.0,
            "indicators_found": [],
            "named_entities": {"organizations": [], "persons": []},
            "assessment": "Low suspicion",
        }

    text_lower = text.lower()
    score = 0.0
    indicators_found = []

    # 1. Sensationalist keywords (case-insensitive substring match).
    for keyword in SENSATIONALIST_KEYWORDS:
        if keyword in text_lower:
            score += HEURISTIC_WEIGHTS.get("SENSATIONALIST_KEYWORD", 1.0)
            indicators_found.append(f"Found sensationalist keyword: '{keyword}'")

    # 2. Clickbait regex patterns, matched case-insensitively on raw text.
    for pattern in CLICKBAIT_PATTERNS:
        if re.search(pattern, text, re.IGNORECASE):
            score += HEURISTIC_WEIGHTS.get("CLICKBAIT_PATTERN", 1.5)
            indicators_found.append(f"Found clickbait pattern: '{pattern}'")

    # 3. Excessive punctuation (common in fake news/clickbait).
    # Simplified from r'!!|!!\?|\?{2,}': the '!!\?' alternative was dead code
    # because '!!' always matched first at the same position.
    if re.search(r'!{2,}|\?{2,}', text):
        score += 1.0
        indicators_found.append("Found excessive punctuation (e.g., '!!', '???').")

    # 4. Multiple all-caps words of 4+ letters (short acronyms are ignored).
    all_caps_words = re.findall(r'\b[A-Z]{4,}\b', text)
    if len(all_caps_words) >= 2:
        score += 1.0
        indicators_found.append(f"Found multiple words in all caps: {', '.join(all_caps_words[:3])}...")

    # 5. Named-entity extraction (optional context; does not affect score).
    named_entities = {"organizations": [], "persons": []}
    try:
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)
        # ne_chunk yields a flat mix of (word, tag) pairs and Tree subtrees;
        # only the subtrees are labelled entities.
        for entity in nltk.ne_chunk(tagged):
            if not isinstance(entity, nltk.Tree):
                continue
            entity_text = " ".join(word for word, _tag in entity.leaves())
            if entity.label() == 'ORGANIZATION':
                if entity_text not in named_entities["organizations"]:
                    named_entities["organizations"].append(entity_text)
            elif entity.label() == 'PERSON':
                if entity_text not in named_entities["persons"]:
                    named_entities["persons"].append(entity_text)
    except Exception:
        # Deliberate best-effort: entity extraction is supplementary context
        # and must not break scoring when NLTK data is missing or
        # tokenization/tagging fails for any reason.
        pass

    return {
        "score": round(score, 2),
        "indicators_found": indicators_found,
        "named_entities": named_entities,
        "assessment": (
            "High suspicion" if score > 5.0
            else "Moderate suspicion" if score >= 3.0
            else "Low suspicion"
        ),
    }
0 commit comments