From f000aee7cb677f9b0b1738f2d46ca9da24b0e447 Mon Sep 17 00:00:00 2001
From: Larry Stewart <larry@cognitivecode.ai>
Date: Mon, 8 Jun 2026 11:27:28 -0400
Subject: [PATCH 1/3] refactor(qa): extract RerankerGate to its own module (E2
 1/3)

Relocate the already-standalone RerankerGate / RerankerGateContext /
RerankerGateResult (no QARunner coupling) from runner.py to
app/qa/reranker_gate.py, re-exported from runner.py so existing imports
keep working (app/qa/gate_metrics.py and the bare RerankerGate call in run()).

Pure relocation, zero logic change. Adds tests/qa/test_reranker_gate.py
(should_rerank decision matrix + re-export identity) since this logic had
NO prior test coverage. Full default suite: 319 passed.

Refs #21
---
 backend/app/qa/reranker_gate.py        | 157 +++++++++++++++++++++++++
 backend/app/qa/runner.py               | 153 ++----------------------
 backend/tests/qa/test_reranker_gate.py |  81 +++++++++++++
 3 files changed, 247 insertions(+), 144 deletions(-)
 create mode 100644 backend/app/qa/reranker_gate.py
 create mode 100644 backend/tests/qa/test_reranker_gate.py

diff --git a/backend/app/qa/reranker_gate.py b/backend/app/qa/reranker_gate.py
new file mode 100644
index 0000000..9164355
--- /dev/null
+++ b/backend/app/qa/reranker_gate.py
@@ -0,0 +1,157 @@
+"""Metric-gated reranking eligibility (E2 extraction from QARunner).
+
+Relocated verbatim from app/qa/runner.py — these classes had no QARunner
+instance coupling. Re-exported from runner.py for backward-compatible imports.
+"""
+
+import logging
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class RerankerGateContext:
+    """Context for reranker gate decision."""
+    # Baseline metrics (from recent evals)
+    baseline_seed_precision_at_5: float = 0.0
+    baseline_evidence_recall: float = 0.0
+
+    # A/B comparison metrics
+    rerank_seed_precision_at_5: float = 0.0
+    rerank_evidence_recall: float = 0.0
+    rerank_ab_improvement_at_5: float = 0.0
+
+    # Performance metrics
+    rerank_latency_overhead_pct: float = 0.0
+    rerank_degeneracy_rate: float = 0.0  # % of questions with degenerate output
+
+    # Baseline improvement trend (from N eval runs)
+    baseline_precision_trend: float = 0.0  # Change in precision over recent runs
+
+
+@dataclass
+class RerankerGateResult:
+    """Result of reranker gate evaluation."""
+    allowed: bool = False
+    reason: str = ""
+    mode: str = "baseline"  # "baseline" | "gated" | "forced"
+
+    # Individual gate checks
+    baseline_plateau_check: bool = False
+    ab_improvement_check: bool = False
+    recall_regression_check: bool = False
+    latency_check: bool = False
+    stability_check: bool = False
+
+
+class RerankerGate:
+    """Metric-gated reranking eligibility checker.
+
+    Reranking only runs when ALL conditions are met:
+    1. Baseline plateau (precision improvement < +0.02)
+    2. A/B improvement (rerank improves precision by ≥ +0.05)
+    3. No recall regression
+    4. Latency overhead ≤ 20%
+    5. Stability (degeneracy rate ≤ 10%)
+    """
+
+    # Gate thresholds (from directive)
+    BASELINE_PLATEAU_THRESHOLD = 0.02     # Max precision improvement before plateau
+    AB_IMPROVEMENT_THRESHOLD = 0.05       # Min required rerank improvement
+    LATENCY_OVERHEAD_MAX_PCT = 20.0       # Max latency overhead %
+    DEGENERACY_RATE_MAX_PCT = 10.0        # Max degeneracy/fallback rate %
+
+    # Fast mode limits (when gate passes)
+    FAST_MODE_MAX_CANDIDATES = 12
+    FAST_MODE_TIMEOUT_S = 2
+
+    @classmethod
+    def should_rerank(
+        cls,
+        context: Optional[RerankerGateContext] = None,
+        force: bool = False
+    ) -> RerankerGateResult:
+        """Determine if reranking should run.
+
+        Args:
+            context: Gate context with metrics (None = use defaults/deny)
+            force: Manual override via --rerank-force flag
+
+        Returns:
+            RerankerGateResult with decision and reasoning
+        """
+        result = RerankerGateResult()
+
+        # Manual override
+        if force:
+            result.allowed = True
+            result.mode = "forced"
+            result.reason = "forced_by_flag"
+            logger.warning("[RerankerGate] Reranking FORCED via --rerank-force flag")
+            return result
+
+        # No context = no metrics = deny
+        if context is None:
+            result.allowed = False
+            result.mode = "baseline"
+            result.reason = "no_gate_context"
+            logger.info("[RerankerGate] Reranking DENIED: no gate context provided")
+            return result
+
+        # Check all gate conditions
+        failures = []
+
+        # 1. Baseline plateau check
+        # Baseline is still improving if trend >= threshold
+        if context.baseline_precision_trend >= cls.BASELINE_PLATEAU_THRESHOLD:
+            failures.append(f"baseline_still_improving({context.baseline_precision_trend:.3f}>={cls.BASELINE_PLATEAU_THRESHOLD})")
+        else:
+            result.baseline_plateau_check = True
+
+        # 2. A/B improvement check
+        if context.rerank_ab_improvement_at_5 < cls.AB_IMPROVEMENT_THRESHOLD:
+            failures.append(f"ab_improvement_too_low({context.rerank_ab_improvement_at_5:.3f}<{cls.AB_IMPROVEMENT_THRESHOLD})")
+        else:
+            result.ab_improvement_check = True
+
+        # 3. No recall regression
+        if context.rerank_evidence_recall < context.baseline_evidence_recall:
+            failures.append(f"recall_regression({context.rerank_evidence_recall:.2f}<{context.baseline_evidence_recall:.2f})")
+        else:
+            result.recall_regression_check = True
+
+        # 4. Latency overhead check
+        if context.rerank_latency_overhead_pct > cls.LATENCY_OVERHEAD_MAX_PCT:
+            failures.append(f"latency_too_high({context.rerank_latency_overhead_pct:.1f}%>{cls.LATENCY_OVERHEAD_MAX_PCT}%)")
+        else:
+            result.latency_check = True
+
+        # 5. Stability check
+        if context.rerank_degeneracy_rate > cls.DEGENERACY_RATE_MAX_PCT:
+            failures.append(f"degeneracy_too_high({context.rerank_degeneracy_rate:.1f}%>{cls.DEGENERACY_RATE_MAX_PCT}%)")
+        else:
+            result.stability_check = True
+
+        # All checks must pass
+        if failures:
+            result.allowed = False
+            result.mode = "baseline"
+            result.reason = "; ".join(failures)
+            logger.info(f"[RerankerGate] Reranking DENIED: {result.reason}")
+        else:
+            result.allowed = True
+            result.mode = "gated"
+            result.reason = "all_gates_passed"
+            logger.info("[RerankerGate] Reranking ALLOWED: all gate conditions met")
+
+        return result
+
+    @classmethod
+    def get_fast_mode_config(cls) -> Dict[str, Any]:
+        """Get fast mode configuration when gate passes."""
+        return {
+            "max_candidates": cls.FAST_MODE_MAX_CANDIDATES,
+            "timeout_s": cls.FAST_MODE_TIMEOUT_S,
+        }
diff --git a/backend/app/qa/runner.py b/backend/app/qa/runner.py
index 908bd6a..2f3e87a 100644
--- a/backend/app/qa/runner.py
+++ b/backend/app/qa/runner.py
@@ -53,150 +53,15 @@
 # RERANKER GATE - Metric-based eligibility for reranking
 # =============================================================================
 
-@dataclass
-class RerankerGateContext:
-    """Context for reranker gate decision."""
-    # Baseline metrics (from recent evals)
-    baseline_seed_precision_at_5: float = 0.0
-    baseline_evidence_recall: float = 0.0
-    
-    # A/B comparison metrics
-    rerank_seed_precision_at_5: float = 0.0
-    rerank_evidence_recall: float = 0.0
-    rerank_ab_improvement_at_5: float = 0.0
-    
-    # Performance metrics
-    rerank_latency_overhead_pct: float = 0.0
-    rerank_degeneracy_rate: float = 0.0  # % of questions with degenerate output
-    
-    # Baseline improvement trend (from N eval runs)
-    baseline_precision_trend: float = 0.0  # Change in precision over recent runs
-
-
-@dataclass
-class RerankerGateResult:
-    """Result of reranker gate evaluation."""
-    allowed: bool = False
-    reason: str = ""
-    mode: str = "baseline"  # "baseline" | "gated" | "forced"
-    
-    # Individual gate checks
-    baseline_plateau_check: bool = False
-    ab_improvement_check: bool = False
-    recall_regression_check: bool = False
-    latency_check: bool = False
-    stability_check: bool = False
-
-
-class RerankerGate:
-    """Metric-gated reranking eligibility checker.
-    
-    Reranking only runs when ALL conditions are met:
-    1. Baseline plateau (precision improvement < +0.02)
-    2. A/B improvement (rerank improves precision by ≥ +0.05)
-    3. No recall regression
-    4. Latency overhead ≤ 20%
-    5. Stability (degeneracy rate ≤ 10%)
-    """
-    
-    # Gate thresholds (from directive)
-    BASELINE_PLATEAU_THRESHOLD = 0.02     # Max precision improvement before plateau
-    AB_IMPROVEMENT_THRESHOLD = 0.05       # Min required rerank improvement
-    LATENCY_OVERHEAD_MAX_PCT = 20.0       # Max latency overhead %
-    DEGENERACY_RATE_MAX_PCT = 10.0        # Max degeneracy/fallback rate %
-    
-    # Fast mode limits (when gate passes)
-    FAST_MODE_MAX_CANDIDATES = 12
-    FAST_MODE_TIMEOUT_S = 2
-    
-    @classmethod
-    def should_rerank(
-        cls,
-        context: Optional[RerankerGateContext] = None,
-        force: bool = False
-    ) -> RerankerGateResult:
-        """Determine if reranking should run.
-        
-        Args:
-            context: Gate context with metrics (None = use defaults/deny)
-            force: Manual override via --rerank-force flag
-            
-        Returns:
-            RerankerGateResult with decision and reasoning
-        """
-        result = RerankerGateResult()
-        
-        # Manual override
-        if force:
-            result.allowed = True
-            result.mode = "forced"
-            result.reason = "forced_by_flag"
-            logger.warning("[RerankerGate] Reranking FORCED via --rerank-force flag")
-            return result
-        
-        # No context = no metrics = deny
-        if context is None:
-            result.allowed = False
-            result.mode = "baseline"
-            result.reason = "no_gate_context"
-            logger.info("[RerankerGate] Reranking DENIED: no gate context provided")
-            return result
-        
-        # Check all gate conditions
-        failures = []
-        
-        # 1. Baseline plateau check
-        # Baseline is still improving if trend >= threshold
-        if context.baseline_precision_trend >= cls.BASELINE_PLATEAU_THRESHOLD:
-            failures.append(f"baseline_still_improving({context.baseline_precision_trend:.3f}>={cls.BASELINE_PLATEAU_THRESHOLD})")
-        else:
-            result.baseline_plateau_check = True
-        
-        # 2. A/B improvement check
-        if context.rerank_ab_improvement_at_5 < cls.AB_IMPROVEMENT_THRESHOLD:
-            failures.append(f"ab_improvement_too_low({context.rerank_ab_improvement_at_5:.3f}<{cls.AB_IMPROVEMENT_THRESHOLD})")
-        else:
-            result.ab_improvement_check = True
-        
-        # 3. No recall regression
-        if context.rerank_evidence_recall < context.baseline_evidence_recall:
-            failures.append(f"recall_regression({context.rerank_evidence_recall:.2f}<{context.baseline_evidence_recall:.2f})")
-        else:
-            result.recall_regression_check = True
-        
-        # 4. Latency overhead check
-        if context.rerank_latency_overhead_pct > cls.LATENCY_OVERHEAD_MAX_PCT:
-            failures.append(f"latency_too_high({context.rerank_latency_overhead_pct:.1f}%>{cls.LATENCY_OVERHEAD_MAX_PCT}%)")
-        else:
-            result.latency_check = True
-        
-        # 5. Stability check
-        if context.rerank_degeneracy_rate > cls.DEGENERACY_RATE_MAX_PCT:
-            failures.append(f"degeneracy_too_high({context.rerank_degeneracy_rate:.1f}%>{cls.DEGENERACY_RATE_MAX_PCT}%)")
-        else:
-            result.stability_check = True
-        
-        # All checks must pass
-        if failures:
-            result.allowed = False
-            result.mode = "baseline"
-            result.reason = "; ".join(failures)
-            logger.info(f"[RerankerGate] Reranking DENIED: {result.reason}")
-        else:
-            result.allowed = True
-            result.mode = "gated"
-            result.reason = "all_gates_passed"
-            logger.info("[RerankerGate] Reranking ALLOWED: all gate conditions met")
-        
-        return result
-    
-    @classmethod
-    def get_fast_mode_config(cls) -> Dict[str, Any]:
-        """Get fast mode configuration when gate passes."""
-        return {
-            "max_candidates": cls.FAST_MODE_MAX_CANDIDATES,
-            "timeout_s": cls.FAST_MODE_TIMEOUT_S,
-        }
+# RerankerGate and its context/result dataclasses live in app/qa/reranker_gate.py
+# (E2 extraction). Re-exported here so existing imports keep working, e.g.
+# `from app.qa.runner import RerankerGate, RerankerGateContext` (used by
+# app/qa/gate_metrics.py) and the bare RerankerGate.should_rerank() call in run().
+from app.qa.reranker_gate import (  # noqa: E402
+    RerankerGate,
+    RerankerGateContext,
+    RerankerGateResult,
+)
 
 
 @dataclass
diff --git a/backend/tests/qa/test_reranker_gate.py b/backend/tests/qa/test_reranker_gate.py
new file mode 100644
index 0000000..2621522
--- /dev/null
+++ b/backend/tests/qa/test_reranker_gate.py
@@ -0,0 +1,81 @@
+"""Characterization tests for RerankerGate (E2 extraction safety net).
+
+These did not exist before the extraction; they pin should_rerank()'s decision
+logic so the relocation to app/qa/reranker_gate.py is provably behavior-preserving.
+"""
+
+from app.qa.reranker_gate import RerankerGate, RerankerGateContext
+# The re-export path must keep working (gate_metrics.py + run() rely on it).
+from app.qa.runner import RerankerGate as RG_via_runner
+from app.qa.runner import RerankerGateContext as RGC_via_runner
+
+
+def _passing_ctx():
+    return RerankerGateContext(
+        baseline_precision_trend=0.0,        # < 0.02 plateau -> pass
+        rerank_ab_improvement_at_5=0.10,     # >= 0.05 -> pass
+        baseline_evidence_recall=0.5,
+        rerank_evidence_recall=0.6,          # no regression
+        rerank_latency_overhead_pct=10.0,    # <= 20 -> pass
+        rerank_degeneracy_rate=5.0,          # <= 10 -> pass
+    )
+
+
+def test_reexport_is_identical_object():
+    assert RG_via_runner is RerankerGate
+    assert RGC_via_runner is RerankerGateContext
+
+
+def test_force_allows():
+    r = RerankerGate.should_rerank(force=True)
+    assert r.allowed and r.mode == "forced" and r.reason == "forced_by_flag"
+
+
+def test_no_context_denies():
+    r = RerankerGate.should_rerank(context=None)
+    assert not r.allowed and r.reason == "no_gate_context"
+
+
+def test_all_gates_pass_allows():
+    r = RerankerGate.should_rerank(context=_passing_ctx())
+    assert r.allowed and r.mode == "gated" and r.reason == "all_gates_passed"
+
+
+def test_baseline_still_improving_denies():
+    ctx = _passing_ctx()
+    ctx.baseline_precision_trend = 0.05  # >= 0.02 plateau threshold
+    r = RerankerGate.should_rerank(context=ctx)
+    assert not r.allowed and "baseline_still_improving" in r.reason
+
+
+def test_ab_improvement_too_low_denies():
+    ctx = _passing_ctx()
+    ctx.rerank_ab_improvement_at_5 = 0.01  # < 0.05
+    r = RerankerGate.should_rerank(context=ctx)
+    assert not r.allowed and "ab_improvement_too_low" in r.reason
+
+
+def test_recall_regression_denies():
+    ctx = _passing_ctx()
+    ctx.rerank_evidence_recall = 0.4  # < baseline 0.5
+    r = RerankerGate.should_rerank(context=ctx)
+    assert not r.allowed and "recall_regression" in r.reason
+
+
+def test_latency_too_high_denies():
+    ctx = _passing_ctx()
+    ctx.rerank_latency_overhead_pct = 30.0  # > 20
+    r = RerankerGate.should_rerank(context=ctx)
+    assert not r.allowed and "latency_too_high" in r.reason
+
+
+def test_degeneracy_too_high_denies():
+    ctx = _passing_ctx()
+    ctx.rerank_degeneracy_rate = 50.0  # > 10
+    r = RerankerGate.should_rerank(context=ctx)
+    assert not r.allowed and "degeneracy_too_high" in r.reason
+
+
+def test_fast_mode_config():
+    cfg = RerankerGate.get_fast_mode_config()
+    assert cfg == {"max_candidates": 12, "timeout_s": 2}

From c58686a056e0b01492d9f2a701be43e1c7d60777 Mon Sep 17 00:00:00 2001
From: Larry Stewart <larry@cognitivecode.ai>
Date: Mon, 8 Jun 2026 11:29:02 -0400
Subject: [PATCH 2/3] refactor(qa): extract metadata query helpers (E2 2/3)

Move the three pure db-only helpers (_get_doc_total_pages,
_get_doc_collection_version, _get_nodes_metadata) from QARunner into
app/qa/metadata_queries.py as functions taking db; QARunner keeps thin
delegating wrappers (signatures/call sites unchanged).

Pure relocation. Adds tests/qa/test_metadata_queries.py (incl. the
None/empty short-circuits that avoid a DB hit). Full suite: 325 passed.

Refs #21
---
 backend/app/qa/metadata_queries.py        | 40 +++++++++++++
 backend/app/qa/runner.py                  | 68 +++--------------------
 backend/tests/qa/test_metadata_queries.py | 51 +++++++++++++++++
 3 files changed, 100 insertions(+), 59 deletions(-)
 create mode 100644 backend/app/qa/metadata_queries.py
 create mode 100644 backend/tests/qa/test_metadata_queries.py

diff --git a/backend/app/qa/metadata_queries.py b/backend/app/qa/metadata_queries.py
new file mode 100644
index 0000000..e24a8de
--- /dev/null
+++ b/backend/app/qa/metadata_queries.py
@@ -0,0 +1,40 @@
+"""Pure document/node metadata queries (E2 extraction from QARunner).
+
+These are db-only helpers with no other QARunner coupling; QARunner keeps thin
+wrappers that delegate here, so call sites and behavior are unchanged.
+"""
+
+from typing import Dict, List, Optional
+
+
+def get_doc_total_pages(db, doc_id: str) -> int:
+    """Total pages for a document (0 if unknown)."""
+    from app.db.graph_models import DocumentGraph
+    doc = db.query(DocumentGraph).filter(DocumentGraph.doc_id == doc_id).first()
+    if doc and doc.meta:
+        return doc.meta.get("total_pages", 0)
+    return 0
+
+
+def get_doc_collection_version(db, doc_id: Optional[str]) -> Optional[str]:
+    """Milvus collection version a document was embedded into.
+
+    Returns None when doc_id is None or empty (search-all uses the default
+    collection) or when the version is unknown.
+    """
+    if not doc_id:
+        return None
+    from app.db.graph_models import DocumentGraph
+    doc = db.query(DocumentGraph).filter(DocumentGraph.doc_id == doc_id).first()
+    if doc and doc.embedded_collection_version:
+        return doc.embedded_collection_version
+    return None
+
+
+def get_nodes_metadata(db, node_ids: List[str]) -> Dict[str, Dict]:
+    """Return {node_id -> meta dict} for the given node ids."""
+    from app.db.graph_models import Node as NodeModel
+    if not node_ids:
+        return {}
+    nodes = db.query(NodeModel).filter(NodeModel.node_id.in_(node_ids)).all()
+    return {n.node_id: n.meta or {} for n in nodes}
diff --git a/backend/app/qa/runner.py b/backend/app/qa/runner.py
index 2f3e87a..fa1bd26 100644
--- a/backend/app/qa/runner.py
+++ b/backend/app/qa/runner.py
@@ -39,6 +39,7 @@
     verify_evidence_span,
 )
 from app.storage.minio_client import get_storage_client
+from app.qa import metadata_queries
 from .evidence_span import build_evidence_spans
 from .normalizer import normalize_query, NormalizedQuery
 from .section_booster import SectionBooster, SectionBoostResult
@@ -1829,67 +1830,16 @@ def _call_llm_reranker(
         return rerank_map
     
     def _get_doc_total_pages(self, doc_id: str) -> int:
-        """Get total pages for a document.
-        
-        Args:
-            doc_id: Document ID
-            
-        Returns:
-            Total pages (0 if unknown)
-        """
-        from app.db.graph_models import DocumentGraph
-        doc = self.db.query(DocumentGraph).filter(
-            DocumentGraph.doc_id == doc_id
-        ).first()
-        
-        if doc and doc.meta:
-            return doc.meta.get("total_pages", 0)
-        return 0
-    
+        """Total pages for a document (delegates to metadata_queries)."""
+        return metadata_queries.get_doc_total_pages(self.db, doc_id)
+
     def _get_doc_collection_version(self, doc_id: Optional[str]) -> Optional[str]:
-        """Get the Milvus collection version used when document was embedded.
-        
-        This enables querying the correct collection (v1 or v2) based on
-        where the document's vectors were actually stored.
-        
-        Args:
-            doc_id: Document ID (None means search all documents)
-            
-        Returns:
-            Collection version ("v1" or "v2") or None if unknown/all docs
-        """
-        if not doc_id:
-            # Searching all documents - use default collection (v2)
-            return None
-            
-        from app.db.graph_models import DocumentGraph
-        doc = self.db.query(DocumentGraph).filter(
-            DocumentGraph.doc_id == doc_id
-        ).first()
-        
-        if doc and doc.embedded_collection_version:
-            return doc.embedded_collection_version
-        return None
-    
+        """Milvus collection version a doc was embedded into (delegates)."""
+        return metadata_queries.get_doc_collection_version(self.db, doc_id)
+
     def _get_nodes_metadata(self, node_ids: List[str]) -> Dict[str, Dict]:
-        """Get metadata for nodes.
-        
-        Args:
-            node_ids: List of node IDs
-            
-        Returns:
-            Dict of node_id -> meta dict
-        """
-        from app.db.graph_models import Node as NodeModel
-        
-        if not node_ids:
-            return {}
-        
-        nodes = self.db.query(NodeModel).filter(
-            NodeModel.node_id.in_(node_ids)
-        ).all()
-        
-        return {n.node_id: n.meta or {} for n in nodes}
+        """Metadata for nodes as {node_id -> meta} (delegates)."""
+        return metadata_queries.get_nodes_metadata(self.db, node_ids)
     
     # ==========================================================================
     # STRUCTURED-OBJECT SEED INJECTION (deterministic, no LLM)
diff --git a/backend/tests/qa/test_metadata_queries.py b/backend/tests/qa/test_metadata_queries.py
new file mode 100644
index 0000000..f20f4a9
--- /dev/null
+++ b/backend/tests/qa/test_metadata_queries.py
@@ -0,0 +1,51 @@
+"""Characterization tests for the extracted metadata query helpers (E2 2/3)."""
+
+from unittest.mock import MagicMock
+
+from app.qa.metadata_queries import (
+    get_doc_collection_version,
+    get_doc_total_pages,
+    get_nodes_metadata,
+)
+
+
+def _db_first(obj):
+    db = MagicMock()
+    db.query.return_value.filter.return_value.first.return_value = obj
+    return db
+
+
+def test_total_pages_from_meta():
+    doc = MagicMock(meta={"total_pages": 7})
+    assert get_doc_total_pages(_db_first(doc), "d") == 7
+
+
+def test_total_pages_defaults_zero():
+    assert get_doc_total_pages(_db_first(None), "d") == 0
+    assert get_doc_total_pages(_db_first(MagicMock(meta=None)), "d") == 0
+
+
+def test_collection_version_none_short_circuits_without_query():
+    db = MagicMock()
+    assert get_doc_collection_version(db, None) is None
+    db.query.assert_not_called()  # search-all must not hit the DB
+
+
+def test_collection_version_returns_stored():
+    doc = MagicMock(embedded_collection_version="v2")
+    assert get_doc_collection_version(_db_first(doc), "d") == "v2"
+    assert get_doc_collection_version(_db_first(None), "d") is None
+
+
+def test_nodes_metadata_empty_short_circuits():
+    db = MagicMock()
+    assert get_nodes_metadata(db, []) == {}
+    db.query.assert_not_called()
+
+
+def test_nodes_metadata_maps_ids_to_meta():
+    n1 = MagicMock(node_id="a", meta={"x": 1})
+    n2 = MagicMock(node_id="b", meta=None)
+    db = MagicMock()
+    db.query.return_value.filter.return_value.all.return_value = [n1, n2]
+    assert get_nodes_metadata(db, ["a", "b"]) == {"a": {"x": 1}, "b": {}}

From 3a37d6969b74b0fa0d49e161b712a3a9038350fd Mon Sep 17 00:00:00 2001
From: Larry Stewart <larry@cognitivecode.ai>
Date: Mon, 8 Jun 2026 11:30:38 -0400
Subject: [PATCH 3/3] refactor(qa): extract structured-target detection (E2
 3/3)

Move the pure regex _detect_structured_targets + its 4 patterns from
QARunner into app/qa/structured_targets.py; QARunner aliases the patterns
(back-compat) and delegates the method. MAX_INJECTED_SEEDS stays on
QARunner (used by inject_structured_seeds, not extracted).

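Relocation with one caveat: detection now reads the module-level patterns
instead of self.*_PATTERN, so overriding the QARunner aliases no longer
changes what is detected. Adds tests/qa/test_structured_targets.py
(figure/table/appendix/section detection + delegation parity). Full
suite: 330 passed.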

Refs #21
---
 backend/app/qa/runner.py                    | 57 ++++-----------------
 backend/app/qa/structured_targets.py        | 47 +++++++++++++++++
 backend/tests/qa/test_structured_targets.py | 32 ++++++++++++
 3 files changed, 89 insertions(+), 47 deletions(-)
 create mode 100644 backend/app/qa/structured_targets.py
 create mode 100644 backend/tests/qa/test_structured_targets.py

diff --git a/backend/app/qa/runner.py b/backend/app/qa/runner.py
index fa1bd26..005ac16 100644
--- a/backend/app/qa/runner.py
+++ b/backend/app/qa/runner.py
@@ -40,6 +40,7 @@
 )
 from app.storage.minio_client import get_storage_client
 from app.qa import metadata_queries
+from app.qa import structured_targets
 from .evidence_span import build_evidence_spans
 from .normalizer import normalize_query, NormalizedQuery
 from .section_booster import SectionBooster, SectionBoostResult
@@ -1848,54 +1849,16 @@ def _get_nodes_metadata(self, node_ids: List[str]) -> Dict[str, Dict]:
     # Max injected seeds to force into top-K
     MAX_INJECTED_SEEDS = 2
     
-    # Regex patterns for structured object mentions
-    FIGURE_PATTERN = re.compile(r'(?:Figure|Fig\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE)
-    TABLE_PATTERN = re.compile(r'(?:Table|Tab\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE)
-    APPENDIX_PATTERN = re.compile(r'Appendix\s*([A-Za-z])', re.IGNORECASE)
-    SECTION_PATTERN = re.compile(
-        r'\b(Conclusion|Methodology|Introduction|Discussion|Evaluation|Results|Appendix)\b',
-        re.IGNORECASE
-    )
-    
+    # Patterns + detection live in app/qa/structured_targets.py (E2 extraction);
+    # aliased here for back-compat with any external references.
+    FIGURE_PATTERN = structured_targets.FIGURE_PATTERN
+    TABLE_PATTERN = structured_targets.TABLE_PATTERN
+    APPENDIX_PATTERN = structured_targets.APPENDIX_PATTERN
+    SECTION_PATTERN = structured_targets.SECTION_PATTERN
+
     def _detect_structured_targets(self, question: str) -> Dict[str, List[str]]:
-        """Detect Figure/Table/Appendix/Section mentions in question.
-        
-        Args:
-            question: User question text
-            
-        Returns:
-            Dict with keys 'figures', 'tables', 'appendices', 'sections'
-            containing normalized target strings
-        """
-        targets = {
-            "figures": [],
-            "tables": [],
-            "appendices": [],
-            "sections": []
-        }
-        
-        # Detect Figure X
-        for match in self.FIGURE_PATTERN.finditer(question):
-            num = match.group(1)
-            targets["figures"].append(f"Figure {num}")
-        
-        # Detect Table X
-        for match in self.TABLE_PATTERN.finditer(question):
-            num = match.group(1)
-            targets["tables"].append(f"Table {num}")
-        
-        # Detect Appendix X
-        for match in self.APPENDIX_PATTERN.finditer(question):
-            letter = match.group(1).upper()
-            targets["appendices"].append(f"Appendix {letter}")
-        
-        # Detect section mentions
-        for match in self.SECTION_PATTERN.finditer(question):
-            section = match.group(1).title()  # "conclusion" -> "Conclusion"
-            if section not in targets["sections"]:
-                targets["sections"].append(section)
-        
-        return targets
+        """Detect Figure/Table/Appendix/Section mentions (delegates)."""
+        return structured_targets.detect_structured_targets(question)
     
     def _query_nodes_by_label(
         self,
diff --git a/backend/app/qa/structured_targets.py b/backend/app/qa/structured_targets.py
new file mode 100644
index 0000000..7dc388d
--- /dev/null
+++ b/backend/app/qa/structured_targets.py
@@ -0,0 +1,47 @@
+"""Detect Figure/Table/Appendix/Section mentions in a question (E2 extraction).
+
+Pure, deterministic string -> dict. Relocated from QARunner, which keeps alias
+attributes + a delegating method; detection uses the patterns defined here.
+"""
+
+import re
+from typing import Dict, List
+
+# Regex patterns for structured object mentions
+FIGURE_PATTERN = re.compile(r'(?:Figure|Fig\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE)
+TABLE_PATTERN = re.compile(r'(?:Table|Tab\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE)
+APPENDIX_PATTERN = re.compile(r'Appendix\s*([A-Za-z])', re.IGNORECASE)
+SECTION_PATTERN = re.compile(
+    r'\b(Conclusion|Methodology|Introduction|Discussion|Evaluation|Results|Appendix)\b',
+    re.IGNORECASE,
+)
+
+
+def detect_structured_targets(question: str) -> Dict[str, List[str]]:
+    """Detect Figure/Table/Appendix/Section mentions in a question.
+
+    Returns a dict with keys 'figures', 'tables', 'appendices', 'sections'
+    containing normalized target strings.
+    """
+    targets = {
+        "figures": [],
+        "tables": [],
+        "appendices": [],
+        "sections": [],
+    }
+
+    for match in FIGURE_PATTERN.finditer(question):
+        targets["figures"].append(f"Figure {match.group(1)}")
+
+    for match in TABLE_PATTERN.finditer(question):
+        targets["tables"].append(f"Table {match.group(1)}")
+
+    for match in APPENDIX_PATTERN.finditer(question):
+        targets["appendices"].append(f"Appendix {match.group(1).upper()}")
+
+    for match in SECTION_PATTERN.finditer(question):
+        section = match.group(1).title()  # "conclusion" -> "Conclusion"
+        if section not in targets["sections"]:
+            targets["sections"].append(section)
+
+    return targets
diff --git a/backend/tests/qa/test_structured_targets.py b/backend/tests/qa/test_structured_targets.py
new file mode 100644
index 0000000..d490055
--- /dev/null
+++ b/backend/tests/qa/test_structured_targets.py
@@ -0,0 +1,32 @@
+"""Characterization tests for structured-target detection (E2 3/3)."""
+
+from app.qa.structured_targets import detect_structured_targets
+
+
+def test_detects_figures_and_tables():
+    out = detect_structured_targets("Compare Figure 1.5 with Table 4 and Fig. 2")
+    assert out["figures"] == ["Figure 1.5", "Figure 2"]
+    assert out["tables"] == ["Table 4"]
+
+
+def test_detects_appendix_uppercased():
+    out = detect_structured_targets("See Appendix b for details")
+    assert out["appendices"] == ["Appendix B"]
+
+
+def test_detects_sections_titlecased_and_deduped():
+    out = detect_structured_targets("the conclusion and the Conclusion and methodology")
+    assert out["sections"] == ["Conclusion", "Methodology"]
+
+
+def test_no_match_returns_empty_lists():
+    out = detect_structured_targets("what is the revenue?")
+    assert out == {"figures": [], "tables": [], "appendices": [], "sections": []}
+
+
+def test_runner_method_delegates_identically():
+    # The QARunner wrapper must produce the same result as the module function.
+    from app.qa.runner import QARunner
+    runner = QARunner.__new__(QARunner)
+    q = "Explain Figure 3 in the Results section"
+    assert runner._detect_structured_targets(q) == detect_structured_targets(q)