From f000aee7cb677f9b0b1738f2d46ca9da24b0e447 Mon Sep 17 00:00:00 2001 From: Larry Stewart Date: Mon, 8 Jun 2026 11:27:28 -0400 Subject: [PATCH 1/3] refactor(qa): extract RerankerGate to its own module (E2 1/3) Relocate the already-standalone RerankerGate / RerankerGateContext / RerankerGateResult (no QARunner coupling) from runner.py to app/qa/reranker_gate.py, re-exported from runner.py so existing imports keep working (app/qa/gate_metrics.py and the bare RerankerGate call in run()). Pure relocation, zero logic change. Adds tests/qa/test_reranker_gate.py (should_rerank decision matrix + re-export identity) since this logic had NO prior test coverage. Full default suite: 319 passed. Refs #21 --- backend/app/qa/reranker_gate.py | 157 +++++++++++++++++++++++++ backend/app/qa/runner.py | 153 ++---------------------- backend/tests/qa/test_reranker_gate.py | 81 +++++++++++++ 3 files changed, 247 insertions(+), 144 deletions(-) create mode 100644 backend/app/qa/reranker_gate.py create mode 100644 backend/tests/qa/test_reranker_gate.py diff --git a/backend/app/qa/reranker_gate.py b/backend/app/qa/reranker_gate.py new file mode 100644 index 0000000..9164355 --- /dev/null +++ b/backend/app/qa/reranker_gate.py @@ -0,0 +1,157 @@ +"""Metric-gated reranking eligibility (E2 extraction from QARunner). + +Relocated verbatim from app/qa/runner.py — these classes had no QARunner +instance coupling. Re-exported from runner.py for backward-compatible imports. +""" + +import logging +from dataclasses import dataclass +from typing import Any, Dict, Optional + +logger = logging.getLogger(__name__) + + +@dataclass +class RerankerGateContext: + """Context for reranker gate decision.""" + # Baseline metrics (from recent evals) + baseline_seed_precision_at_5: float = 0.0 + baseline_evidence_recall: float = 0.0 + + # A/B comparison metrics + rerank_seed_precision_at_5: float = 0.0 + rerank_evidence_recall: float = 0.0 + rerank_ab_improvement_at_5: float = 0.0 + + # Performance metrics + rerank_latency_overhead_pct: float = 0.0 + rerank_degeneracy_rate: float = 0.0 # % of questions with degenerate output + + # Baseline improvement trend (from N eval runs) + baseline_precision_trend: float = 0.0 # Change in precision over recent runs + + +@dataclass +class RerankerGateResult: + """Result of reranker gate evaluation.""" + allowed: bool = False + reason: str = "" + mode: str = "baseline" # "baseline" | "gated" | "forced" + + # Individual gate checks + baseline_plateau_check: bool = False + ab_improvement_check: bool = False + recall_regression_check: bool = False + latency_check: bool = False + stability_check: bool = False + + +class RerankerGate: + """Metric-gated reranking eligibility checker. + + Reranking only runs when ALL conditions are met: + 1. Baseline plateau (precision improvement < +0.02) + 2. A/B improvement (rerank improves precision by ≥ +0.05) + 3. No recall regression + 4. Latency overhead ≤ 20% + 5. Stability (degeneracy rate ≤ 10%) + """ + + # Gate thresholds (from directive) + BASELINE_PLATEAU_THRESHOLD = 0.02 # Max precision improvement before plateau + AB_IMPROVEMENT_THRESHOLD = 0.05 # Min required rerank improvement + LATENCY_OVERHEAD_MAX_PCT = 20.0 # Max latency overhead % + DEGENERACY_RATE_MAX_PCT = 10.0 # Max degeneracy/fallback rate % + + # Fast mode limits (when gate passes) + FAST_MODE_MAX_CANDIDATES = 12 + FAST_MODE_TIMEOUT_S = 2 + + @classmethod + def should_rerank( + cls, + context: Optional[RerankerGateContext] = None, + force: bool = False + ) -> RerankerGateResult: + """Determine if reranking should run. + + Args: + context: Gate context with metrics (None = use defaults/deny) + force: Manual override via --rerank-force flag + + Returns: + RerankerGateResult with decision and reasoning + """ + result = RerankerGateResult() + + # Manual override + if force: + result.allowed = True + result.mode = "forced" + result.reason = "forced_by_flag" + logger.warning("[RerankerGate] Reranking FORCED via --rerank-force flag") + return result + + # No context = no metrics = deny + if context is None: + result.allowed = False + result.mode = "baseline" + result.reason = "no_gate_context" + logger.info("[RerankerGate] Reranking DENIED: no gate context provided") + return result + + # Check all gate conditions + failures = [] + + # 1. Baseline plateau check + # Baseline is still improving if trend >= threshold + if context.baseline_precision_trend >= cls.BASELINE_PLATEAU_THRESHOLD: + failures.append(f"baseline_still_improving({context.baseline_precision_trend:.3f}>={cls.BASELINE_PLATEAU_THRESHOLD})") + else: + result.baseline_plateau_check = True + + # 2. A/B improvement check + if context.rerank_ab_improvement_at_5 < cls.AB_IMPROVEMENT_THRESHOLD: + failures.append(f"ab_improvement_too_low({context.rerank_ab_improvement_at_5:.3f}<{cls.AB_IMPROVEMENT_THRESHOLD})") + else: + result.ab_improvement_check = True + + # 3. No recall regression + if context.rerank_evidence_recall < context.baseline_evidence_recall: + failures.append(f"recall_regression({context.rerank_evidence_recall:.2f}<{context.baseline_evidence_recall:.2f})") + else: + result.recall_regression_check = True + + # 4. Latency overhead check + if context.rerank_latency_overhead_pct > cls.LATENCY_OVERHEAD_MAX_PCT: + failures.append(f"latency_too_high({context.rerank_latency_overhead_pct:.1f}%>{cls.LATENCY_OVERHEAD_MAX_PCT}%)") + else: + result.latency_check = True + + # 5. Stability check + if context.rerank_degeneracy_rate > cls.DEGENERACY_RATE_MAX_PCT: + failures.append(f"degeneracy_too_high({context.rerank_degeneracy_rate:.1f}%>{cls.DEGENERACY_RATE_MAX_PCT}%)") + else: + result.stability_check = True + + # All checks must pass + if failures: + result.allowed = False + result.mode = "baseline" + result.reason = "; ".join(failures) + logger.info(f"[RerankerGate] Reranking DENIED: {result.reason}") + else: + result.allowed = True + result.mode = "gated" + result.reason = "all_gates_passed" + logger.info("[RerankerGate] Reranking ALLOWED: all gate conditions met") + + return result + + @classmethod + def get_fast_mode_config(cls) -> Dict[str, Any]: + """Get fast mode configuration when gate passes.""" + return { + "max_candidates": cls.FAST_MODE_MAX_CANDIDATES, + "timeout_s": cls.FAST_MODE_TIMEOUT_S, + } diff --git a/backend/app/qa/runner.py b/backend/app/qa/runner.py index 908bd6a..2f3e87a 100644 --- a/backend/app/qa/runner.py +++ b/backend/app/qa/runner.py @@ -53,150 +53,15 @@ # RERANKER GATE - Metric-based eligibility for reranking # ============================================================================= -@dataclass -class RerankerGateContext: - """Context for reranker gate decision.""" - # Baseline metrics (from recent evals) - baseline_seed_precision_at_5: float = 0.0 - baseline_evidence_recall: float = 0.0 - - # A/B comparison metrics - rerank_seed_precision_at_5: float = 0.0 - rerank_evidence_recall: float = 0.0 - rerank_ab_improvement_at_5: float = 0.0 - - # Performance metrics - rerank_latency_overhead_pct: float = 0.0 - rerank_degeneracy_rate: float = 0.0 # % of questions with degenerate output - - # Baseline improvement trend (from N eval runs) - baseline_precision_trend: float = 0.0 # Change in precision over recent runs - - -@dataclass -class RerankerGateResult: - """Result of reranker gate evaluation.""" - allowed: bool = False - reason: str = "" - mode: str = "baseline" # "baseline" | "gated" | "forced" - - # Individual gate checks - baseline_plateau_check: bool = False - ab_improvement_check: bool = False - recall_regression_check: bool = False - latency_check: bool = False - stability_check: bool = False - - -class RerankerGate: - """Metric-gated reranking eligibility checker. - - Reranking only runs when ALL conditions are met: - 1. Baseline plateau (precision improvement < +0.02) - 2. A/B improvement (rerank improves precision by ≥ +0.05) - 3. No recall regression - 4. Latency overhead ≤ 20% - 5. Stability (degeneracy rate ≤ 10%) - """ - - # Gate thresholds (from directive) - BASELINE_PLATEAU_THRESHOLD = 0.02 # Max precision improvement before plateau - AB_IMPROVEMENT_THRESHOLD = 0.05 # Min required rerank improvement - LATENCY_OVERHEAD_MAX_PCT = 20.0 # Max latency overhead % - DEGENERACY_RATE_MAX_PCT = 10.0 # Max degeneracy/fallback rate % - - # Fast mode limits (when gate passes) - FAST_MODE_MAX_CANDIDATES = 12 - FAST_MODE_TIMEOUT_S = 2 - - @classmethod - def should_rerank( - cls, - context: Optional[RerankerGateContext] = None, - force: bool = False - ) -> RerankerGateResult: - """Determine if reranking should run. - - Args: - context: Gate context with metrics (None = use defaults/deny) - force: Manual override via --rerank-force flag - - Returns: - RerankerGateResult with decision and reasoning - """ - result = RerankerGateResult() - - # Manual override - if force: - result.allowed = True - result.mode = "forced" - result.reason = "forced_by_flag" - logger.warning("[RerankerGate] Reranking FORCED via --rerank-force flag") - return result - - # No context = no metrics = deny - if context is None: - result.allowed = False - result.mode = "baseline" - result.reason = "no_gate_context" - logger.info("[RerankerGate] Reranking DENIED: no gate context provided") - return result - - # Check all gate conditions - failures = [] - - # 1. Baseline plateau check - # Baseline is still improving if trend >= threshold - if context.baseline_precision_trend >= cls.BASELINE_PLATEAU_THRESHOLD: - failures.append(f"baseline_still_improving({context.baseline_precision_trend:.3f}>={cls.BASELINE_PLATEAU_THRESHOLD})") - else: - result.baseline_plateau_check = True - - # 2. A/B improvement check - if context.rerank_ab_improvement_at_5 < cls.AB_IMPROVEMENT_THRESHOLD: - failures.append(f"ab_improvement_too_low({context.rerank_ab_improvement_at_5:.3f}<{cls.AB_IMPROVEMENT_THRESHOLD})") - else: - result.ab_improvement_check = True - - # 3. No recall regression - if context.rerank_evidence_recall < context.baseline_evidence_recall: - failures.append(f"recall_regression({context.rerank_evidence_recall:.2f}<{context.baseline_evidence_recall:.2f})") - else: - result.recall_regression_check = True - - # 4. Latency overhead check - if context.rerank_latency_overhead_pct > cls.LATENCY_OVERHEAD_MAX_PCT: - failures.append(f"latency_too_high({context.rerank_latency_overhead_pct:.1f}%>{cls.LATENCY_OVERHEAD_MAX_PCT}%)") - else: - result.latency_check = True - - # 5. Stability check - if context.rerank_degeneracy_rate > cls.DEGENERACY_RATE_MAX_PCT: - failures.append(f"degeneracy_too_high({context.rerank_degeneracy_rate:.1f}%>{cls.DEGENERACY_RATE_MAX_PCT}%)") - else: - result.stability_check = True - - # All checks must pass - if failures: - result.allowed = False - result.mode = "baseline" - result.reason = "; ".join(failures) - logger.info(f"[RerankerGate] Reranking DENIED: {result.reason}") - else: - result.allowed = True - result.mode = "gated" - result.reason = "all_gates_passed" - logger.info("[RerankerGate] Reranking ALLOWED: all gate conditions met") - - return result - - @classmethod - def get_fast_mode_config(cls) -> Dict[str, Any]: - """Get fast mode configuration when gate passes.""" - return { - "max_candidates": cls.FAST_MODE_MAX_CANDIDATES, - "timeout_s": cls.FAST_MODE_TIMEOUT_S, - } +# RerankerGate and its context/result dataclasses live in app/qa/reranker_gate.py +# (E2 extraction). Re-exported here so existing imports keep working, e.g. +# `from app.qa.runner import RerankerGate, RerankerGateContext` (used by +# app/qa/gate_metrics.py) and the bare RerankerGate.should_rerank() call in run(). +from app.qa.reranker_gate import ( # noqa: E402 + RerankerGate, + RerankerGateContext, + RerankerGateResult, +) @dataclass diff --git a/backend/tests/qa/test_reranker_gate.py b/backend/tests/qa/test_reranker_gate.py new file mode 100644 index 0000000..2621522 --- /dev/null +++ b/backend/tests/qa/test_reranker_gate.py @@ -0,0 +1,81 @@ +"""Characterization tests for RerankerGate (E2 extraction safety net). + +These did not exist before the extraction; they pin should_rerank()'s decision +logic so the relocation to app/qa/reranker_gate.py is provably behavior-preserving. +""" + +from app.qa.reranker_gate import RerankerGate, RerankerGateContext +# The re-export path must keep working (gate_metrics.py + run() rely on it). +from app.qa.runner import RerankerGate as RG_via_runner +from app.qa.runner import RerankerGateContext as RGC_via_runner + + +def _passing_ctx(): + return RerankerGateContext( + baseline_precision_trend=0.0, # < 0.02 plateau -> pass + rerank_ab_improvement_at_5=0.10, # >= 0.05 -> pass + baseline_evidence_recall=0.5, + rerank_evidence_recall=0.6, # no regression + rerank_latency_overhead_pct=10.0, # <= 20 -> pass + rerank_degeneracy_rate=5.0, # <= 10 -> pass + ) + + +def test_reexport_is_identical_object(): + assert RG_via_runner is RerankerGate + assert RGC_via_runner is RerankerGateContext + + +def test_force_allows(): + r = RerankerGate.should_rerank(force=True) + assert r.allowed and r.mode == "forced" and r.reason == "forced_by_flag" + + +def test_no_context_denies(): + r = RerankerGate.should_rerank(context=None) + assert not r.allowed and r.reason == "no_gate_context" + + +def test_all_gates_pass_allows(): + r = RerankerGate.should_rerank(context=_passing_ctx()) + assert r.allowed and r.mode == "gated" and r.reason == "all_gates_passed" + + +def test_baseline_still_improving_denies(): + ctx = _passing_ctx() + ctx.baseline_precision_trend = 0.05 # >= 0.02 plateau threshold + r = RerankerGate.should_rerank(context=ctx) + assert not r.allowed and "baseline_still_improving" in r.reason + + +def test_ab_improvement_too_low_denies(): + ctx = _passing_ctx() + ctx.rerank_ab_improvement_at_5 = 0.01 # < 0.05 + r = RerankerGate.should_rerank(context=ctx) + assert not r.allowed and "ab_improvement_too_low" in r.reason + + +def test_recall_regression_denies(): + ctx = _passing_ctx() + ctx.rerank_evidence_recall = 0.4 # < baseline 0.5 + r = RerankerGate.should_rerank(context=ctx) + assert not r.allowed and "recall_regression" in r.reason + + +def test_latency_too_high_denies(): + ctx = _passing_ctx() + ctx.rerank_latency_overhead_pct = 30.0 # > 20 + r = RerankerGate.should_rerank(context=ctx) + assert not r.allowed and "latency_too_high" in r.reason + + +def test_degeneracy_too_high_denies(): + ctx = _passing_ctx() + ctx.rerank_degeneracy_rate = 50.0 # > 10 + r = RerankerGate.should_rerank(context=ctx) + assert not r.allowed and "degeneracy_too_high" in r.reason + + +def test_fast_mode_config(): + cfg = RerankerGate.get_fast_mode_config() + assert cfg == {"max_candidates": 12, "timeout_s": 2} From c58686a056e0b01492d9f2a701be43e1c7d60777 Mon Sep 17 00:00:00 2001 From: Larry Stewart Date: Mon, 8 Jun 2026 11:29:02 -0400 Subject: [PATCH 2/3] refactor(qa): extract metadata query helpers (E2 2/3) Move the three pure db-only helpers (_get_doc_total_pages, _get_doc_collection_version, _get_nodes_metadata) from QARunner into app/qa/metadata_queries.py as functions taking db; QARunner keeps thin delegating wrappers (signatures/call sites unchanged). Pure relocation. Adds tests/qa/test_metadata_queries.py (incl. the None/empty short-circuits that avoid a DB hit). Full suite: 325 passed. Refs #21 --- backend/app/qa/metadata_queries.py | 40 +++++++++++++ backend/app/qa/runner.py | 68 +++-------------------- backend/tests/qa/test_metadata_queries.py | 51 +++++++++++++++++ 3 files changed, 100 insertions(+), 59 deletions(-) create mode 100644 backend/app/qa/metadata_queries.py create mode 100644 backend/tests/qa/test_metadata_queries.py diff --git a/backend/app/qa/metadata_queries.py b/backend/app/qa/metadata_queries.py new file mode 100644 index 0000000..e24a8de --- /dev/null +++ b/backend/app/qa/metadata_queries.py @@ -0,0 +1,40 @@ +"""Pure document/node metadata queries (E2 extraction from QARunner). + +These are db-only helpers with no other QARunner coupling; QARunner keeps thin +wrappers that delegate here, so call sites and behavior are unchanged. +""" + +from typing import Dict, List, Optional + + +def get_doc_total_pages(db, doc_id: str) -> int: + """Total pages for a document (0 if unknown).""" + from app.db.graph_models import DocumentGraph + doc = db.query(DocumentGraph).filter(DocumentGraph.doc_id == doc_id).first() + if doc and doc.meta: + return doc.meta.get("total_pages", 0) + return 0 + + +def get_doc_collection_version(db, doc_id: Optional[str]) -> Optional[str]: + """Milvus collection version a document was embedded into. + + Returns None when doc_id is None (search-all uses the default collection) + or when the version is unknown. + """ + if not doc_id: + return None + from app.db.graph_models import DocumentGraph + doc = db.query(DocumentGraph).filter(DocumentGraph.doc_id == doc_id).first() + if doc and doc.embedded_collection_version: + return doc.embedded_collection_version + return None + + +def get_nodes_metadata(db, node_ids: List[str]) -> Dict[str, Dict]: + """Return {node_id -> meta dict} for the given node ids.""" + from app.db.graph_models import Node as NodeModel + if not node_ids: + return {} + nodes = db.query(NodeModel).filter(NodeModel.node_id.in_(node_ids)).all() + return {n.node_id: n.meta or {} for n in nodes} diff --git a/backend/app/qa/runner.py b/backend/app/qa/runner.py index 2f3e87a..fa1bd26 100644 --- a/backend/app/qa/runner.py +++ b/backend/app/qa/runner.py @@ -39,6 +39,7 @@ verify_evidence_span, ) from app.storage.minio_client import get_storage_client +from app.qa import metadata_queries from .evidence_span import build_evidence_spans from .normalizer import normalize_query, NormalizedQuery from .section_booster import SectionBooster, SectionBoostResult @@ -1829,67 +1830,16 @@ def _call_llm_reranker( return rerank_map def _get_doc_total_pages(self, doc_id: str) -> int: - """Get total pages for a document. - - Args: - doc_id: Document ID - - Returns: - Total pages (0 if unknown) - """ - from app.db.graph_models import DocumentGraph - doc = self.db.query(DocumentGraph).filter( - DocumentGraph.doc_id == doc_id - ).first() - - if doc and doc.meta: - return doc.meta.get("total_pages", 0) - return 0 - + """Total pages for a document (delegates to metadata_queries).""" + return metadata_queries.get_doc_total_pages(self.db, doc_id) + def _get_doc_collection_version(self, doc_id: Optional[str]) -> Optional[str]: - """Get the Milvus collection version used when document was embedded. - - This enables querying the correct collection (v1 or v2) based on - where the document's vectors were actually stored. - - Args: - doc_id: Document ID (None means search all documents) - - Returns: - Collection version ("v1" or "v2") or None if unknown/all docs - """ - if not doc_id: - # Searching all documents - use default collection (v2) - return None - - from app.db.graph_models import DocumentGraph - doc = self.db.query(DocumentGraph).filter( - DocumentGraph.doc_id == doc_id - ).first() - - if doc and doc.embedded_collection_version: - return doc.embedded_collection_version - return None - + """Milvus collection version a doc was embedded into (delegates).""" + return metadata_queries.get_doc_collection_version(self.db, doc_id) + def _get_nodes_metadata(self, node_ids: List[str]) -> Dict[str, Dict]: - """Get metadata for nodes. - - Args: - node_ids: List of node IDs - - Returns: - Dict of node_id -> meta dict - """ - from app.db.graph_models import Node as NodeModel - - if not node_ids: - return {} - - nodes = self.db.query(NodeModel).filter( - NodeModel.node_id.in_(node_ids) - ).all() - - return {n.node_id: n.meta or {} for n in nodes} + """Metadata for nodes as {node_id -> meta} (delegates).""" + return metadata_queries.get_nodes_metadata(self.db, node_ids) # ========================================================================== # STRUCTURED-OBJECT SEED INJECTION (deterministic, no LLM) diff --git a/backend/tests/qa/test_metadata_queries.py b/backend/tests/qa/test_metadata_queries.py new file mode 100644 index 0000000..f20f4a9 --- /dev/null +++ b/backend/tests/qa/test_metadata_queries.py @@ -0,0 +1,51 @@ +"""Characterization tests for the extracted metadata query helpers (E2 2/3).""" + +from unittest.mock import MagicMock + +from app.qa.metadata_queries import ( + get_doc_collection_version, + get_doc_total_pages, + get_nodes_metadata, +) + + +def _db_first(obj): + db = MagicMock() + db.query.return_value.filter.return_value.first.return_value = obj + return db + + +def test_total_pages_from_meta(): + doc = MagicMock(meta={"total_pages": 7}) + assert get_doc_total_pages(_db_first(doc), "d") == 7 + + +def test_total_pages_defaults_zero(): + assert get_doc_total_pages(_db_first(None), "d") == 0 + assert get_doc_total_pages(_db_first(MagicMock(meta=None)), "d") == 0 + + +def test_collection_version_none_short_circuits_without_query(): + db = MagicMock() + assert get_doc_collection_version(db, None) is None + db.query.assert_not_called() # search-all must not hit the DB + + +def test_collection_version_returns_stored(): + doc = MagicMock(embedded_collection_version="v2") + assert get_doc_collection_version(_db_first(doc), "d") == "v2" + assert get_doc_collection_version(_db_first(None), "d") is None + + +def test_nodes_metadata_empty_short_circuits(): + db = MagicMock() + assert get_nodes_metadata(db, []) == {} + db.query.assert_not_called() + + +def test_nodes_metadata_maps_ids_to_meta(): + n1 = MagicMock(node_id="a", meta={"x": 1}) + n2 = MagicMock(node_id="b", meta=None) + db = MagicMock() + db.query.return_value.filter.return_value.all.return_value = [n1, n2] + assert get_nodes_metadata(db, ["a", "b"]) == {"a": {"x": 1}, "b": {}} From 3a37d6969b74b0fa0d49e161b712a3a9038350fd Mon Sep 17 00:00:00 2001 From: Larry Stewart Date: Mon, 8 Jun 2026 11:30:38 -0400 Subject: [PATCH 3/3] refactor(qa): extract structured-target detection (E2 3/3) Move the pure regex _detect_structured_targets + its 4 patterns from QARunner into app/qa/structured_targets.py; QARunner aliases the patterns (back-compat) and delegates the method. MAX_INJECTED_SEEDS stays on QARunner (used by inject_structured_seeds, not extracted). Pure relocation. Adds tests/qa/test_structured_targets.py (figure/table/ appendix/section detection + delegation parity). Full suite: 330 passed. Refs #21 --- backend/app/qa/runner.py | 57 ++++----------------- backend/app/qa/structured_targets.py | 47 +++++++++++++++++ backend/tests/qa/test_structured_targets.py | 32 ++++++++++++ 3 files changed, 89 insertions(+), 47 deletions(-) create mode 100644 backend/app/qa/structured_targets.py create mode 100644 backend/tests/qa/test_structured_targets.py diff --git a/backend/app/qa/runner.py b/backend/app/qa/runner.py index fa1bd26..005ac16 100644 --- a/backend/app/qa/runner.py +++ b/backend/app/qa/runner.py @@ -40,6 +40,7 @@ ) from app.storage.minio_client import get_storage_client from app.qa import metadata_queries +from app.qa import structured_targets from .evidence_span import build_evidence_spans from .normalizer import normalize_query, NormalizedQuery from .section_booster import SectionBooster, SectionBoostResult @@ -1848,54 +1849,16 @@ def _get_nodes_metadata(self, node_ids: List[str]) -> Dict[str, Dict]: # Max injected seeds to force into top-K MAX_INJECTED_SEEDS = 2 - # Regex patterns for structured object mentions - FIGURE_PATTERN = re.compile(r'(?:Figure|Fig\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE) - TABLE_PATTERN = re.compile(r'(?:Table|Tab\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE) - APPENDIX_PATTERN = re.compile(r'Appendix\s*([A-Za-z])', re.IGNORECASE) - SECTION_PATTERN = re.compile( - r'\b(Conclusion|Methodology|Introduction|Discussion|Evaluation|Results|Appendix)\b', - re.IGNORECASE - ) - + # Patterns + detection live in app/qa/structured_targets.py (E2 extraction); + # aliased here for back-compat with any external references. + FIGURE_PATTERN = structured_targets.FIGURE_PATTERN + TABLE_PATTERN = structured_targets.TABLE_PATTERN + APPENDIX_PATTERN = structured_targets.APPENDIX_PATTERN + SECTION_PATTERN = structured_targets.SECTION_PATTERN + def _detect_structured_targets(self, question: str) -> Dict[str, List[str]]: - """Detect Figure/Table/Appendix/Section mentions in question. - - Args: - question: User question text - - Returns: - Dict with keys 'figures', 'tables', 'appendices', 'sections' - containing normalized target strings - """ - targets = { - "figures": [], - "tables": [], - "appendices": [], - "sections": [] - } - - # Detect Figure X - for match in self.FIGURE_PATTERN.finditer(question): - num = match.group(1) - targets["figures"].append(f"Figure {num}") - - # Detect Table X - for match in self.TABLE_PATTERN.finditer(question): - num = match.group(1) - targets["tables"].append(f"Table {num}") - - # Detect Appendix X - for match in self.APPENDIX_PATTERN.finditer(question): - letter = match.group(1).upper() - targets["appendices"].append(f"Appendix {letter}") - - # Detect section mentions - for match in self.SECTION_PATTERN.finditer(question): - section = match.group(1).title() # "conclusion" -> "Conclusion" - if section not in targets["sections"]: - targets["sections"].append(section) - - return targets + """Detect Figure/Table/Appendix/Section mentions (delegates).""" + return structured_targets.detect_structured_targets(question) def _query_nodes_by_label( self, diff --git a/backend/app/qa/structured_targets.py b/backend/app/qa/structured_targets.py new file mode 100644 index 0000000..7dc388d --- /dev/null +++ b/backend/app/qa/structured_targets.py @@ -0,0 +1,47 @@ +"""Detect Figure/Table/Appendix/Section mentions in a question (E2 extraction). + +Pure, deterministic string -> dict. Relocated verbatim from QARunner; the class +keeps aliasing attributes + a delegating method so behavior is unchanged. +""" + +import re +from typing import Dict, List + +# Regex patterns for structured object mentions +FIGURE_PATTERN = re.compile(r'(?:Figure|Fig\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE) +TABLE_PATTERN = re.compile(r'(?:Table|Tab\.?)\s*(\d+(?:\.\d+)?)', re.IGNORECASE) +APPENDIX_PATTERN = re.compile(r'Appendix\s*([A-Za-z])', re.IGNORECASE) +SECTION_PATTERN = re.compile( + r'\b(Conclusion|Methodology|Introduction|Discussion|Evaluation|Results|Appendix)\b', + re.IGNORECASE, +) + + +def detect_structured_targets(question: str) -> Dict[str, List[str]]: + """Detect Figure/Table/Appendix/Section mentions in a question. + + Returns a dict with keys 'figures', 'tables', 'appendices', 'sections' + containing normalized target strings. + """ + targets = { + "figures": [], + "tables": [], + "appendices": [], + "sections": [], + } + + for match in FIGURE_PATTERN.finditer(question): + targets["figures"].append(f"Figure {match.group(1)}") + + for match in TABLE_PATTERN.finditer(question): + targets["tables"].append(f"Table {match.group(1)}") + + for match in APPENDIX_PATTERN.finditer(question): + targets["appendices"].append(f"Appendix {match.group(1).upper()}") + + for match in SECTION_PATTERN.finditer(question): + section = match.group(1).title() # "conclusion" -> "Conclusion" + if section not in targets["sections"]: + targets["sections"].append(section) + + return targets diff --git a/backend/tests/qa/test_structured_targets.py b/backend/tests/qa/test_structured_targets.py new file mode 100644 index 0000000..d490055 --- /dev/null +++ b/backend/tests/qa/test_structured_targets.py @@ -0,0 +1,32 @@ +"""Characterization tests for structured-target detection (E2 3/3).""" + +from app.qa.structured_targets import detect_structured_targets + + +def test_detects_figures_and_tables(): + out = detect_structured_targets("Compare Figure 1.5 with Table 4 and Fig. 2") + assert out["figures"] == ["Figure 1.5", "Figure 2"] + assert out["tables"] == ["Table 4"] + + +def test_detects_appendix_uppercased(): + out = detect_structured_targets("See Appendix b for details") + assert out["appendices"] == ["Appendix B"] + + +def test_detects_sections_titlecased_and_deduped(): + out = detect_structured_targets("the conclusion and the Conclusion and methodology") + assert out["sections"] == ["Conclusion", "Methodology"] + + +def test_no_match_returns_empty_lists(): + out = detect_structured_targets("what is the revenue?") + assert out == {"figures": [], "tables": [], "appendices": [], "sections": []} + + +def test_runner_method_delegates_identically(): + # The QARunner wrapper must produce the same result as the module function. + from app.qa.runner import QARunner + runner = QARunner.__new__(QARunner) + q = "Explain Figure 3 in the Results section" + assert runner._detect_structured_targets(q) == detect_structured_targets(q)