From a026cc1fbe9bb60c1fd3c6323629a38868bd95c8 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 1 May 2026 13:16:39 -0400 Subject: [PATCH 01/11] =?UTF-8?q?feat(mcp):=20hygiene=20issues=20=E2=80=94?= =?UTF-8?q?=20list,=20get,=20search,=20update=20+=20resource=20+=20prompt?= =?UTF-8?q?=20(TG-1029)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose profiling anomalies for read, search, and triage. Adds: * `list_hygiene_issues` — paginated list per profiling run / table group * `get_hygiene_issue` — full detail with type definition + column profile * `search_hygiene_issues` — cross-run search with project scoping + `since` * `update_hygiene_issue` — disposition update, gated by `disposition` permission * `testgen://hygiene-issue-types` — reference table resource * `hygiene_triage` — guided workflow prompt Likelihood split into `issue_likelihood` + `pii_risk` so PII discovery doesn't muddle regular-issue likelihood values; providing one auto-excludes the other category via the natural query semantics. User-facing terminology applied: `Quality Dimension`, `Muted` (for `Inactive`). Run timestamp sourced from `JobExecution.started_at` per the run/JE consolidation direction. No "anomaly" in user-facing surfaces — DB columns keep their names. --- testgen/common/models/hygiene_issue.py | 265 +++++++++++++++- testgen/mcp/prompts/workflows.py | 35 +++ testgen/mcp/server.py | 25 +- testgen/mcp/tools/common.py | 59 ++++ testgen/mcp/tools/hygiene_issues.py | 405 +++++++++++++++++++++++++ testgen/mcp/tools/reference.py | 22 ++ testgen/mcp/tools/test_definitions.py | 6 +- 7 files changed, 809 insertions(+), 8 deletions(-) create mode 100644 testgen/mcp/tools/hygiene_issues.py diff --git a/testgen/common/models/hygiene_issue.py b/testgen/common/models/hygiene_issue.py index caa4d460..a950c5a9 100644 --- a/testgen/common/models/hygiene_issue.py +++ b/testgen/common/models/hygiene_issue.py @@ -1,10 +1,11 @@ import re from collections.abc import Iterable from dataclasses import dataclass +from datetime import datetime from typing import Self from uuid import UUID, uuid4 -from sqlalchemy import Column, ForeignKey, String, and_, case, null, select +from sqlalchemy import Boolean, Column, ForeignKey, String, and_, case, null, select, update from sqlalchemy.dialects import postgresql from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import aliased, relationship @@ -12,6 +13,10 @@ from testgen.common.models import Base, get_current_session from testgen.common.models.entity import Entity +from testgen.common.models.job_execution import JobExecution +from testgen.common.models.profile_result import ProfileResult +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup PII_RISK_RE = re.compile(r"Risk: (MODERATE|HIGH),") @@ -36,14 +41,96 @@ def active(self): return self.total - self.inactive +@dataclass +class HygieneIssueListRow: + """Row shape for ``list_hygiene_issues``.""" + + id: UUID + project_code: str + issue_type_name: str + schema_name: str + table_name: str + column_name: str + dq_dimension: str | None + disposition: str + priority: str | None + detail: str + detail_redactable: bool | None + pii_flag: str | None + + +@dataclass +class HygieneIssueSearchRow: + """Row shape for ``search_hygiene_issues``. Adds run + table-group context vs the list row.""" + + id: UUID + project_code: str + issue_type_name: str + table_groups_name: str + job_execution_id: UUID | None + started_at: datetime | None + schema_name: str + table_name: str + column_name: str + dq_dimension: str | None + disposition: str + priority: str | None + detail: str + detail_redactable: bool | None + pii_flag: str | None + + +@dataclass +class HygieneIssueDetail: + """Full row + type definition + column-profile context for ``get_hygiene_issue``.""" + + id: UUID + project_code: str + issue_type_name: str + type_description: str | None + suggested_action: str | None + schema_name: str + table_name: str + column_name: str + db_data_type: str | None + dq_dimension: str | None + impact_dimension: str | None + disposition: str + priority: str | None + detail: str + detail_redactable: bool | None + pii_flag: str | None + job_execution_id: UUID | None + started_at: datetime | None + column_general_type: str | None + column_db_data_type: str | None + column_record_ct: int | None + column_null_value_ct: int | None + column_distinct_value_ct: int | None + + class HygieneIssueType(Base): __tablename__ = "profile_anomaly_types" id: str = Column(String, primary_key=True) likelihood: str = Column("issue_likelihood", String) name: str = Column("anomaly_name", String) + description: str = Column("anomaly_description", String) + suggested_action: str = Column(String) + dq_dimension: str = Column(String) + impact_dimension: str = Column(String) + data_object: str = Column(String) + detail_redactable: bool = Column(Boolean) + + # Unmapped: anomaly_type, anomaly_criteria, detail_expression, + # dq_score_prevalence_formula, dq_score_risk_factor. - # Note: not all table columns are implemented by this entity + @classmethod + def select_where(cls, *clauses, order_by=None) -> list[Self]: + query = select(cls).where(*clauses) + if order_by is not None: + query = query.order_by(*order_by) + return list(get_current_session().scalars(query)) class HygieneIssue(Entity): @@ -58,14 +145,18 @@ class HygieneIssue(Entity): type_id: str = Column("anomaly_id", String, ForeignKey("profile_anomaly_types.id"), nullable=False) type_ = relationship(HygieneIssueType) + column_id: UUID = Column(postgresql.UUID(as_uuid=True)) + schema_name: str = Column(String, nullable=False) table_name: str = Column(String, nullable=False) column_name: str = Column(String, nullable=False) + db_data_type: str = Column(String) detail: str = Column(String, nullable=False) disposition: str = Column(String) + impact_dimension: str = Column(String) - # Note: not all table columns are implemented by this entity + # Unmapped: column_type, dq_prevalence. @hybrid_property def priority(self): @@ -132,6 +223,174 @@ def _count_active(likelihood_values: tuple[str, ...]): row = get_current_session().execute(query).first() return IssueLikelihoodCounts(**{k: v for k, v in row._mapping.items() if v is not None}) + @classmethod + def _priority_order(cls): + return case( + (cls.priority == "Definite", 1), + (cls.priority == "Likely", 2), + (cls.priority == "Possible", 3), + (cls.priority == "High", 4), + (cls.priority == "Moderate", 5), + else_=6, + ) + + @classmethod + def list_for_run( + cls, + profile_run_id: UUID, + *clauses, + page: int = 1, + limit: int = 50, + ) -> tuple[list[HygieneIssueListRow], int]: + """Paginated hygiene issues for a single profiling run. + + Caller-supplied ``*clauses`` carry every WHERE filter (project scoping, disposition, + likelihood / pii_risk, table / column / dq_dimension / issue_type filters). + """ + query = ( + select( + cls.id.label("id"), + cls.project_code.label("project_code"), + HygieneIssueType.name.label("issue_type_name"), + cls.schema_name.label("schema_name"), + cls.table_name.label("table_name"), + cls.column_name.label("column_name"), + HygieneIssueType.dq_dimension.label("dq_dimension"), + func.coalesce(cls.disposition, "Confirmed").label("disposition"), + cls.priority.label("priority"), + cls.detail.label("detail"), + HygieneIssueType.detail_redactable.label("detail_redactable"), + ProfileResult.pii_flag.label("pii_flag"), + ) + .join(HygieneIssueType, HygieneIssueType.id == cls.type_id) + .outerjoin( + ProfileResult, + and_( + ProfileResult.profile_run_id == cls.profile_run_id, + ProfileResult.schema_name == cls.schema_name, + ProfileResult.table_name == cls.table_name, + ProfileResult.column_name == cls.column_name, + ), + ) + .where(cls.profile_run_id == profile_run_id, *clauses) + .order_by(cls._priority_order(), cls.table_name, cls.column_name, cls.id) + ) + return cls._paginate(query, page=page, limit=limit, data_class=HygieneIssueListRow) + + @classmethod + def search( + cls, + *clauses, + page: int = 1, + limit: int = 50, + ) -> tuple[list[HygieneIssueSearchRow], int]: + """Cross-run paginated search over hygiene issues. + + Always JOINs ``ProfilingRun`` + ``JobExecution`` (for ``started_at`` + ``job_execution_id``) + and ``TableGroup`` (for ``table_groups_name``). Caller-supplied ``*clauses`` carry every + WHERE filter (project scoping, ``JobExecution.started_at`` window, all user filters). + """ + query = ( + select( + cls.id.label("id"), + cls.project_code.label("project_code"), + HygieneIssueType.name.label("issue_type_name"), + TableGroup.table_groups_name.label("table_groups_name"), + ProfilingRun.job_execution_id.label("job_execution_id"), + JobExecution.started_at.label("started_at"), + cls.schema_name.label("schema_name"), + cls.table_name.label("table_name"), + cls.column_name.label("column_name"), + HygieneIssueType.dq_dimension.label("dq_dimension"), + func.coalesce(cls.disposition, "Confirmed").label("disposition"), + cls.priority.label("priority"), + cls.detail.label("detail"), + HygieneIssueType.detail_redactable.label("detail_redactable"), + ProfileResult.pii_flag.label("pii_flag"), + ) + .join(HygieneIssueType, HygieneIssueType.id == cls.type_id) + .join(ProfilingRun, ProfilingRun.id == cls.profile_run_id) + .outerjoin(JobExecution, JobExecution.id == ProfilingRun.job_execution_id) + .join(TableGroup, TableGroup.id == cls.table_groups_id) + .outerjoin( + ProfileResult, + and_( + ProfileResult.profile_run_id == cls.profile_run_id, + ProfileResult.schema_name == cls.schema_name, + ProfileResult.table_name == cls.table_name, + ProfileResult.column_name == cls.column_name, + ), + ) + .where(*clauses) + .order_by(JobExecution.started_at.desc(), cls._priority_order(), cls.id) + ) + return cls._paginate(query, page=page, limit=limit, data_class=HygieneIssueSearchRow) + + @classmethod + def get_with_context(cls, issue_id: UUID, *clauses) -> HygieneIssueDetail | None: + """Fetch one hygiene issue with type definition + column-profile context. + + Returns ``None`` when no row matches the id and ``*clauses`` together — the + caller decides whether that's "missing", "not accessible", or both collapsed + into one error. + + Joins ``ProfileResult`` outer-style: table-level issues may have no matching + column profile row, in which case the column_* fields stay ``None``. + """ + query = ( + select( + cls.id.label("id"), + cls.project_code.label("project_code"), + HygieneIssueType.name.label("issue_type_name"), + HygieneIssueType.description.label("type_description"), + HygieneIssueType.suggested_action.label("suggested_action"), + cls.schema_name.label("schema_name"), + cls.table_name.label("table_name"), + cls.column_name.label("column_name"), + cls.db_data_type.label("db_data_type"), + HygieneIssueType.dq_dimension.label("dq_dimension"), + cls.impact_dimension.label("impact_dimension"), + func.coalesce(cls.disposition, "Confirmed").label("disposition"), + cls.priority.label("priority"), + cls.detail.label("detail"), + HygieneIssueType.detail_redactable.label("detail_redactable"), + ProfileResult.pii_flag.label("pii_flag"), + ProfilingRun.job_execution_id.label("job_execution_id"), + JobExecution.started_at.label("started_at"), + ProfileResult.general_type.label("column_general_type"), + ProfileResult.db_data_type.label("column_db_data_type"), + ProfileResult.record_ct.label("column_record_ct"), + ProfileResult.null_value_ct.label("column_null_value_ct"), + ProfileResult.distinct_value_ct.label("column_distinct_value_ct"), + ) + .join(HygieneIssueType, HygieneIssueType.id == cls.type_id) + .join(ProfilingRun, ProfilingRun.id == cls.profile_run_id) + .outerjoin(JobExecution, JobExecution.id == ProfilingRun.job_execution_id) + .outerjoin( + ProfileResult, + and_( + ProfileResult.profile_run_id == cls.profile_run_id, + ProfileResult.schema_name == cls.schema_name, + ProfileResult.table_name == cls.table_name, + ProfileResult.column_name == cls.column_name, + ), + ) + .where(cls.id == issue_id, *clauses) + ) + row = get_current_session().execute(query).mappings().first() + return HygieneIssueDetail(**row) if row else None + + @classmethod + def update_disposition(cls, issue_id: UUID, disposition: str, *clauses) -> bool: + """Update disposition on a single hygiene issue, scoped by caller-supplied clauses. + + Returns ``True`` if a row was updated, ``False`` if no row matched the id and + ``*clauses`` together. + """ + stmt = update(cls).where(cls.id == issue_id, *clauses).values(disposition=disposition) + result = get_current_session().execute(stmt) + return result.rowcount > 0 + @classmethod def select_with_diff( cls, profiling_run_id: UUID, other_profiling_run_id: UUID | None, *where_clauses, limit: int | None = None diff --git a/testgen/mcp/prompts/workflows.py b/testgen/mcp/prompts/workflows.py index ff76b7b8..2317054b 100644 --- a/testgen/mcp/prompts/workflows.py +++ b/testgen/mcp/prompts/workflows.py @@ -77,6 +77,41 @@ def profiling_overview() -> str: """ +def hygiene_triage(table_group_id: str | None = None) -> str: + """Guided hygiene issue triage workflow — review hygiene issues and decide what to do. + + Args: + table_group_id: Optional UUID of a table group to focus on. + """ + intro = ( + f"Focus on table group `{table_group_id}`." + if table_group_id + else "Pick a table group with confirmed hygiene issues." + ) + tg = f"'{table_group_id}'" if table_group_id else "'...'" + + steps: list[str] = [] + if not table_group_id: + steps.append( + "Call `get_data_inventory()` to see projects and table groups, with profiling status per group." + ) + steps.append(f"Call `list_profiling_summaries(table_group_id={tg})` to see hygiene issue counts per run.") + steps.append(f"Call `list_hygiene_issues(table_group_id={tg}, disposition='Confirmed')` for the issues to review.") + steps.append( + "For each top issue (ordered by priority), call `get_hygiene_issue(issue_id='...')` for full context — " + "issue type description, suggested action, column profile." + ) + steps.append("For unfamiliar issue types, read `testgen://hygiene-issue-types` once for the reference table.") + steps.append( + "For each issue: explain what was found, then ask the user whether to dismiss the issue " + "(call `update_hygiene_issue(issue_id='...', disposition='Dismissed')`) or investigate further." + ) + steps.append("Summarize the triage results and any patterns noted across the issues.") + + numbered = "\n".join(f"{i + 1}. {step}" for i, step in enumerate(steps)) + return f"Please triage hygiene issues. {intro}\n\n{numbered}\n" + + def compare_runs(test_suite: str | None = None) -> str: """Compare the two most recent test runs to identify regressions and improvements. diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py index 19e3a539..ffe5d1c8 100644 --- a/testgen/mcp/server.py +++ b/testgen/mcp/server.py @@ -30,6 +30,9 @@ Test types have specific, non-obvious meanings (e.g., Alpha_Trunc). Do not guess what a test checks. ALWAYS look them up using either the `testgen://test-types` resource or the `get_test_type()` tool. +Hygiene issue types similarly have specific meanings. ALWAYS look them up using the +`testgen://hygiene-issue-types` resource. + INVESTIGATING FAILURES Use list_test_results to find failures, then get_source_data to see relevant data from the connected database. @@ -90,6 +93,7 @@ def build_mcp_app( from testgen.mcp.prompts.workflows import ( compare_runs, health_check, + hygiene_triage, investigate_failures, profiling_overview, table_health, @@ -102,8 +106,19 @@ def build_mcp_app( run_profiling, run_tests, ) + from testgen.mcp.tools.hygiene_issues import ( + get_hygiene_issue, + list_hygiene_issues, + search_hygiene_issues, + update_hygiene_issue, + ) from testgen.mcp.tools.profiling import get_table, list_column_profiles, list_profiling_summaries - from testgen.mcp.tools.reference import get_test_type, glossary_resource, test_types_resource + from testgen.mcp.tools.reference import ( + get_test_type, + glossary_resource, + hygiene_issue_types_resource, + test_types_resource, + ) from testgen.mcp.tools.source_data import get_source_data, get_source_data_query from testgen.mcp.tools.test_definitions import get_test, list_test_notes, list_test_types, list_tests from testgen.mcp.tools.test_results import ( @@ -166,9 +181,14 @@ def safe_prompt(fn): safe_tool(cancel_test_run) safe_tool(cancel_profiling_run) safe_tool(generate_tests) + safe_tool(list_hygiene_issues) + safe_tool(get_hygiene_issue) + safe_tool(search_hygiene_issues) + safe_tool(update_hygiene_issue) - # Resources (2) + # Resources safe_resource("testgen://test-types", test_types_resource) + safe_resource("testgen://hygiene-issue-types", hygiene_issue_types_resource) safe_resource("testgen://glossary", glossary_resource) # Prompts @@ -177,6 +197,7 @@ def safe_prompt(fn): safe_prompt(table_health) safe_prompt(compare_runs) safe_prompt(profiling_overview) + safe_prompt(hygiene_triage) app = mcp.streamable_http_app() return app, mcp.session_manager diff --git a/testgen/mcp/tools/common.py b/testgen/mcp/tools/common.py index 35a4f64f..d6c2d3a3 100644 --- a/testgen/mcp/tools/common.py +++ b/testgen/mcp/tools/common.py @@ -2,6 +2,7 @@ from uuid import UUID from testgen.common.date_service import parse_since +from testgen.common.models.hygiene_issue import HygieneIssueType from testgen.common.models.table_group import TableGroup from testgen.common.models.test_definition import TestType from testgen.common.models.test_result import TestResultStatus @@ -9,6 +10,13 @@ from testgen.mcp.exceptions import MCPResourceNotAccessible, MCPUserError from testgen.mcp.permissions import get_project_permissions +VALID_DQ_DIMENSIONS = {"Accuracy", "Completeness", "Consistency", "Recency", "Timeliness", "Uniqueness", "Validity"} +# DB stores "Inactive"; user-facing label is "Muted". +_DISPOSITION_USER_TO_DB = {"Confirmed": "Confirmed", "Dismissed": "Dismissed", "Muted": "Inactive"} +_DISPOSITION_DB_TO_USER = {v: k for k, v in _DISPOSITION_USER_TO_DB.items()} +_VALID_ISSUE_LIKELIHOODS = {"Definite", "Likely", "Possible"} +_VALID_PII_RISKS = {"High", "Moderate"} + def parse_uuid(value: str, label: str = "ID") -> UUID: try: @@ -42,6 +50,46 @@ def parse_since_arg(value: str, label: str = "since", *, today: date | None = No raise MCPUserError(f"Invalid `{label}`: {err}") from err +def parse_quality_dimension(value: str) -> str: + if value not in VALID_DQ_DIMENSIONS: + valid = ", ".join(sorted(VALID_DQ_DIMENSIONS)) + raise MCPUserError(f"Invalid quality_dimension `{value}`. Valid values: {valid}") + return value + + +def parse_disposition(value: str) -> str: + """Validate a user-facing disposition label and return the DB value. + + Accepts ``Confirmed``, ``Dismissed``, ``Muted`` and returns ``Confirmed``, + ``Dismissed``, ``Inactive`` respectively (the DB encodes the legacy ``Inactive``). + """ + if value not in _DISPOSITION_USER_TO_DB: + valid = ", ".join(sorted(_DISPOSITION_USER_TO_DB)) + raise MCPUserError(f"Invalid disposition `{value}`. Valid values: {valid}") + return _DISPOSITION_USER_TO_DB[value] + + +def format_disposition(value: str) -> str: + """Map a DB disposition value to its user-facing label (``Inactive`` → ``Muted``).""" + return _DISPOSITION_DB_TO_USER.get(value, value) + + +def parse_issue_likelihood_list(values: list[str]) -> list[str]: + invalid = [v for v in values if v not in _VALID_ISSUE_LIKELIHOODS] + if invalid: + valid = ", ".join(sorted(_VALID_ISSUE_LIKELIHOODS)) + raise MCPUserError(f"Invalid issue_likelihood values {invalid}. Valid values: {valid}") + return values + + +def parse_pii_risk_list(values: list[str]) -> list[str]: + invalid = [v for v in values if v not in _VALID_PII_RISKS] + if invalid: + valid = ", ".join(sorted(_VALID_PII_RISKS)) + raise MCPUserError(f"Invalid pii_risk values {invalid}. Valid values: {valid}") + return values + + def resolve_test_type(short_name: str) -> str: """Resolve a test type short name to its internal code.""" matches = TestType.select_where(TestType.test_name_short == short_name) @@ -52,6 +100,17 @@ def resolve_test_type(short_name: str) -> str: return matches[0].test_type +def resolve_issue_type(name: str) -> str: + """Resolve a hygiene issue type human label to its internal id (case-sensitive exact match).""" + matches = HygieneIssueType.select_where(HygieneIssueType.name == name) + if not matches: + raise MCPUserError( + f"Unknown hygiene issue type: `{name}`. " + "Use the testgen://hygiene-issue-types resource to see available types." + ) + return matches[0].id + + def format_page_info(total: int, page: int, limit: int) -> str: """Shared pagination summary line for MCP tool output.""" if total == 0: diff --git a/testgen/mcp/tools/hygiene_issues.py b/testgen/mcp/tools/hygiene_issues.py new file mode 100644 index 00000000..7101587a --- /dev/null +++ b/testgen/mcp/tools/hygiene_issues.py @@ -0,0 +1,405 @@ +from uuid import UUID + +from sqlalchemy import and_, or_ +from sqlalchemy.sql.elements import ColumnElement +from sqlalchemy.sql.functions import func + +from testgen.common.models import with_database_session +from testgen.common.models.hygiene_issue import HygieneIssue, HygieneIssueType +from testgen.common.models.job_execution import JobExecution +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.models.table_group import TableGroup +from testgen.common.pii_masking import PII_REDACTED +from testgen.mcp.exceptions import MCPResourceNotAccessible, MCPUserError +from testgen.mcp.permissions import get_project_permissions, mcp_permission +from testgen.mcp.tools.common import ( + format_disposition, + format_page_footer, + format_page_info, + parse_disposition, + parse_issue_likelihood_list, + parse_pii_risk_list, + parse_quality_dimension, + parse_since_arg, + parse_uuid, + resolve_issue_type, + resolve_table_group, + validate_limit, + validate_page, +) +from testgen.mcp.tools.markdown import MdDoc + + +def _redact_detail(row, view_pii_codes: set[str]) -> str: + """Return the row's detail, redacted if the type is redactable, the column is PII, + and the caller lacks `view_pii` on the row's project. Mirrors `mask_hygiene_detail`. + """ + if row.detail_redactable and row.pii_flag and row.project_code not in view_pii_codes: + return PII_REDACTED + return row.detail + + +def _build_likelihood_clause( + issue_likelihood: list[str] | None, + pii_risk: list[str] | None, +) -> ColumnElement[bool] | None: + """Construct the WHERE clause for the likelihood / pii_risk filter pair. + + `issue_likelihood` matches `HygieneIssueType.likelihood` directly, so PII rows + (likelihood = "Potential PII") are excluded automatically. `pii_risk` matches + only PII rows by also requiring `likelihood == "Potential PII"`. Providing one + of the two filters therefore auto-excludes the other category. + """ + likelihood_clause = HygieneIssueType.likelihood.in_(issue_likelihood) if issue_likelihood else None + pii_clause = ( + and_(HygieneIssueType.likelihood == "Potential PII", HygieneIssue.priority.in_(pii_risk)) + if pii_risk + else None + ) + if likelihood_clause is None: + return pii_clause + if pii_clause is None: + return likelihood_clause + return or_(likelihood_clause, pii_clause) + + +def _resolve_profile_run( + *, + job_execution_id: str | None, + table_group_id: str | None, +) -> tuple[UUID, str]: + """Resolve the scope to a (profile_run_id, run_id_label) tuple. + + Mutual-exclusion + at-least-one validation done by caller. Collapses missing + runs and inaccessible table groups into a single ``MCPResourceNotAccessible``. + """ + if table_group_id: + tg = resolve_table_group(table_group_id) + if tg.last_complete_profile_run_id is None: + raise MCPUserError(f"No completed profiling runs found for table group `{table_group_id}`.") + run = ProfilingRun.get(tg.last_complete_profile_run_id) + run_id_label = str(run.job_execution_id) if run and run.job_execution_id else f"latest run for table group `{table_group_id}`" + return tg.last_complete_profile_run_id, run_id_label + + job_uuid = parse_uuid(job_execution_id, "job_execution_id") + run = ProfilingRun.get_by_id_or_job(job_uuid) + perms = get_project_permissions() + tg = TableGroup.get(run.table_groups_id) if run else None + if run is None or tg is None or not perms.has_access(tg.project_code): + raise MCPResourceNotAccessible("Profiling run", job_execution_id) + return run.id, job_execution_id + + +@with_database_session +@mcp_permission("view") +def list_hygiene_issues( + *, + job_execution_id: str | None = None, + table_group_id: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + quality_dimension: str | None = None, + disposition: str | None = "Confirmed", + issue_likelihood: list[str] | None = None, + pii_risk: list[str] | None = None, + issue_type: str | None = None, + limit: int = 50, + page: int = 1, +) -> str: + """List hygiene issues for a profiling run. + + Provide either ``job_execution_id`` for a specific run, or ``table_group_id`` to list + the issues from its latest profiling run. + + Args: + job_execution_id: UUID of a profiling run, e.g. from ``list_profiling_summaries``. + table_group_id: UUID of a table group. Resolves to the latest completed profiling run. + Mutually exclusive with ``job_execution_id``. + table_name: Filter by table name (exact match). + column_name: Filter by column name (exact match). + quality_dimension: Filter by data quality dimension ('Accuracy', 'Completeness', + 'Consistency', 'Recency', 'Timeliness', 'Uniqueness', 'Validity'). + disposition: Filter by disposition. Defaults to 'Confirmed'. + Valid values: 'Confirmed', 'Dismissed', 'Muted'. + issue_likelihood: Filter by issue likelihood. Values: 'Definite', 'Likely', 'Possible'. + Providing this filter auto-excludes PII issues; combine with ``pii_risk`` to include both. + pii_risk: Filter by PII risk level. Values: 'High', 'Moderate'. + Providing this filter auto-excludes regular issues. + issue_type: Filter by hygiene issue type (e.g. 'Personally Identifiable Information'). + See ``testgen://hygiene-issue-types`` for the full list. + limit: Maximum number of issues per page (default 50, max 200). + page: Page number, starting from 1 (default 1). + """ + if job_execution_id and table_group_id: + raise MCPUserError("Pass either `job_execution_id` or `table_group_id`, not both.") + if not job_execution_id and not table_group_id: + raise MCPUserError("Provide either `job_execution_id` or `table_group_id`.") + validate_page(page) + validate_limit(limit, 200) + + perms = get_project_permissions() + profile_run_id, run_id_label = _resolve_profile_run( + job_execution_id=job_execution_id, + table_group_id=table_group_id, + ) + + issue_likelihood = parse_issue_likelihood_list(issue_likelihood) if issue_likelihood else None + pii_risk = parse_pii_risk_list(pii_risk) if pii_risk else None + + clauses = [HygieneIssue.project_code.in_(perms.allowed_codes)] + disposition_value = parse_disposition(disposition or "Confirmed") + clauses.append(func.coalesce(HygieneIssue.disposition, "Confirmed") == disposition_value) + if table_name: + clauses.append(HygieneIssue.table_name == table_name) + if column_name: + clauses.append(HygieneIssue.column_name == column_name) + if quality_dimension: + clauses.append(HygieneIssueType.dq_dimension == parse_quality_dimension(quality_dimension)) + if issue_type: + clauses.append(HygieneIssue.type_id == resolve_issue_type(issue_type)) + likelihood_clause = _build_likelihood_clause(issue_likelihood, pii_risk) + if likelihood_clause is not None: + clauses.append(likelihood_clause) + + rows, total = HygieneIssue.list_for_run(profile_run_id, *clauses, page=page, limit=limit) + + doc = MdDoc() + doc.heading(1, f"Hygiene Issues for run `{run_id_label}`") + if not rows: + doc.text("_No hygiene issues match the supplied filters._") + return doc.render() + if info := format_page_info(total, page, limit): + doc.text(info) + + view_pii_codes = set(perms.codes_allowed_to("view_pii")) + for r in rows: + location = f"`{r.column_name}` in `{r.table_name}`" if r.column_name else f"`{r.table_name}`" + doc.heading(2, f"[{r.priority or 'Unknown'}] {r.issue_type_name} on {location}") + doc.field("Issue ID", r.id, code=True) + doc.field("Issue Type", r.issue_type_name) + if r.dq_dimension: + doc.field("Quality Dimension", r.dq_dimension) + doc.field("Disposition", format_disposition(r.disposition)) + if r.priority: + doc.field("Priority", r.priority) + if r.detail: + doc.field("Detail", _redact_detail(r, view_pii_codes)) + + if footer := format_page_footer(total, page, limit): + doc.text(footer) + + return doc.render() + + +@with_database_session +@mcp_permission("disposition") +def update_hygiene_issue(*, issue_id: str, disposition: str) -> str: + """Update the disposition of a hygiene issue (confirm, dismiss, or mute). + + Args: + issue_id: UUID of the hygiene issue. + disposition: New disposition. Valid values: 'Confirmed', 'Dismissed', 'Muted'. + """ + issue_uuid = parse_uuid(issue_id, "issue_id") + db_disposition = parse_disposition(disposition) + perms = get_project_permissions() + + updated = HygieneIssue.update_disposition( + issue_uuid, + db_disposition, + HygieneIssue.project_code.in_(perms.allowed_codes), + ) + if not updated: + raise MCPResourceNotAccessible("Hygiene issue", issue_id) + + doc = MdDoc() + doc.text(f"Updated hygiene issue {MdDoc.code(issue_id)} disposition to **{disposition}**.") + return doc.render() + + +@with_database_session +@mcp_permission("view") +def get_hygiene_issue(*, issue_id: str) -> str: + """Get full details of a specific hygiene issue. + + Includes the issue type definition (description, suggested action), profiling run + metadata, and column-profile context (general type, null rate, distinct count). + + Args: + issue_id: UUID of the hygiene issue, e.g. from ``list_hygiene_issues`` or + ``search_hygiene_issues``. + """ + issue_uuid = parse_uuid(issue_id, "issue_id") + perms = get_project_permissions() + + detail = HygieneIssue.get_with_context( + issue_uuid, + HygieneIssue.project_code.in_(perms.allowed_codes), + ) + if detail is None: + raise MCPResourceNotAccessible("Hygiene issue", issue_id) + + location = ( + f"`{detail.column_name}` in `{detail.table_name}`" if detail.column_name else f"`{detail.table_name}`" + ) + + doc = MdDoc() + doc.heading(1, f"[{detail.priority or 'Unknown'}] {detail.issue_type_name} on {location}") + doc.field("Issue ID", detail.id, code=True) + doc.field("Issue Type", detail.issue_type_name) + doc.field("Schema", detail.schema_name) + doc.field("Table", detail.table_name) + if detail.column_name: + doc.field("Column", detail.column_name) + if detail.dq_dimension: + doc.field("Quality Dimension", detail.dq_dimension) + if detail.impact_dimension: + doc.field("Impact Dimension", detail.impact_dimension) + doc.field("Disposition", format_disposition(detail.disposition)) + if detail.priority: + doc.field("Priority", detail.priority) + if detail.detail: + view_pii_codes = set(perms.codes_allowed_to("view_pii")) + doc.field("Detail", _redact_detail(detail, view_pii_codes)) + + if detail.suggested_action: + doc.heading(2, "Suggested Action") + doc.text(detail.suggested_action) + + if detail.type_description: + doc.heading(2, "Issue Type Description") + doc.text(detail.type_description) + + has_column_profile = detail.column_general_type is not None or detail.column_record_ct is not None + if has_column_profile: + doc.heading(2, "Column Profile") + if detail.column_general_type: + doc.field("General Type", detail.column_general_type) + if detail.column_db_data_type: + doc.field("DB Data Type", detail.column_db_data_type) + if detail.column_record_ct is not None: + doc.field("Records", detail.column_record_ct) + if detail.column_null_value_ct is not None: + doc.field("Nulls", detail.column_null_value_ct) + if detail.column_record_ct: + doc.field("Null Rate", f"{detail.column_null_value_ct / detail.column_record_ct:.2%}") + if detail.column_distinct_value_ct is not None: + doc.field("Distinct values", detail.column_distinct_value_ct) + + doc.heading(2, "Profiling Run") + if detail.job_execution_id: + doc.field("Run", detail.job_execution_id, code=True) + doc.field("Run Date", detail.started_at) + + return doc.render() + + +@with_database_session +@mcp_permission("view") +def search_hygiene_issues( + *, + project_code: str | None = None, + table_group_id: str | None = None, + issue_type: str | None = None, + quality_dimension: str | None = None, + disposition: str | None = "Confirmed", + issue_likelihood: list[str] | None = None, + pii_risk: list[str] | None = None, + since: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + limit: int = 50, + page: int = 1, +) -> str: + """Search hygiene issues across profiling runs and table groups. + + To drill into a single run, use ``list_hygiene_issues``. + + Args: + project_code: Scope to a specific project. + table_group_id: UUID of a table group to scope to. + issue_type: Filter by hygiene issue type (e.g. 'Personally Identifiable Information'). + See ``testgen://hygiene-issue-types`` for the full list. + quality_dimension: Filter by data quality dimension ('Accuracy', 'Completeness', + 'Consistency', 'Recency', 'Timeliness', 'Uniqueness', 'Validity'). + disposition: Filter by disposition. Defaults to 'Confirmed'. + Valid values: 'Confirmed', 'Dismissed', 'Muted'. + issue_likelihood: Filter by issue likelihood. Values: 'Definite', 'Likely', 'Possible'. + Providing this filter auto-excludes PII issues; combine with ``pii_risk`` to include both. + pii_risk: Filter by PII risk level. Values: 'High', 'Moderate'. + Providing this filter auto-excludes regular issues. + since: Include issues from runs that started since this point — e.g. '7 days', + '2 weeks', '2026-04-01'. + table_name: Filter by table name (exact match). + column_name: Filter by column name (exact match). + limit: Maximum number of issues per page (default 50, max 200). + page: Page number, starting from 1 (default 1). + """ + validate_page(page) + validate_limit(limit, 200) + + perms = get_project_permissions() + if project_code: + perms.verify_access(project_code, not_found=MCPResourceNotAccessible("Project", project_code)) + project_codes = [project_code] + else: + project_codes = perms.allowed_codes + + table_group_uuid = parse_uuid(table_group_id, "table_group_id") if table_group_id else None + since_date = parse_since_arg(since) if since else None + type_id = resolve_issue_type(issue_type) if issue_type else None + issue_likelihood = parse_issue_likelihood_list(issue_likelihood) if issue_likelihood else None + pii_risk = parse_pii_risk_list(pii_risk) if pii_risk else None + + clauses = [ + HygieneIssue.project_code.in_(project_codes), + func.coalesce(HygieneIssue.disposition, "Confirmed") == parse_disposition(disposition or "Confirmed"), + ] + if table_group_uuid is not None: + clauses.append(HygieneIssue.table_groups_id == table_group_uuid) + if quality_dimension: + clauses.append(HygieneIssueType.dq_dimension == parse_quality_dimension(quality_dimension)) + if type_id: + clauses.append(HygieneIssue.type_id == type_id) + if table_name: + clauses.append(HygieneIssue.table_name == table_name) + if column_name: + clauses.append(HygieneIssue.column_name == column_name) + if since_date is not None: + clauses.append(JobExecution.started_at >= since_date) + likelihood_clause = _build_likelihood_clause(issue_likelihood, pii_risk) + if likelihood_clause is not None: + clauses.append(likelihood_clause) + + rows, total = HygieneIssue.search(*clauses, page=page, limit=limit) + + doc = MdDoc() + doc.heading(1, "Hygiene Issue Search") + if not rows: + doc.text("_No hygiene issues match the supplied filters._") + return doc.render() + if info := format_page_info(total, page, limit): + doc.text(info) + + view_pii_codes = set(perms.codes_allowed_to("view_pii")) + for r in rows: + location = f"`{r.column_name}` in `{r.table_name}`" if r.column_name else f"`{r.table_name}`" + doc.heading(2, f"[{r.priority or 'Unknown'}] {r.issue_type_name} on {location}") + doc.field("Issue ID", r.id, code=True) + doc.field("Issue Type", r.issue_type_name) + doc.field("Table Group", r.table_groups_name) + if r.job_execution_id: + doc.field("Profiling Run", r.job_execution_id, code=True) + doc.field("Run Date", r.started_at) + if r.dq_dimension: + doc.field("Quality Dimension", r.dq_dimension) + if r.priority: + doc.field("Priority", r.priority) + doc.field("Disposition", format_disposition(r.disposition)) + if r.detail: + doc.field("Detail", _redact_detail(r, view_pii_codes)) + + if footer := format_page_footer(total, page, limit): + doc.text(footer) + + return doc.render() diff --git a/testgen/mcp/tools/reference.py b/testgen/mcp/tools/reference.py index 114cfe45..23217cdd 100644 --- a/testgen/mcp/tools/reference.py +++ b/testgen/mcp/tools/reference.py @@ -1,4 +1,5 @@ from testgen.common.models import with_database_session +from testgen.common.models.hygiene_issue import HygieneIssueType from testgen.common.models.test_definition import TestType from testgen.mcp.tools.markdown import MdDoc @@ -80,6 +81,27 @@ def test_types_resource() -> str: return doc.render() +@with_database_session +def hygiene_issue_types_resource() -> str: + """Reference table of all hygiene issue types with their data quality dimensions, descriptions, and suggested actions.""" + issue_types = HygieneIssueType.select_where(order_by=(HygieneIssueType.name,)) + + if not issue_types: + return "No hygiene issue types found." + + doc = MdDoc() + doc.heading(1, "TestGen Hygiene Issue Types Reference") + doc.table( + headers=["Issue Type", "Quality Dimension", "Impact Dimension", "Description", "Suggested Action"], + rows=[ + [it.name, it.dq_dimension, it.impact_dimension, it.description, it.suggested_action] + for it in issue_types + ], + ) + + return doc.render() + + def glossary_resource() -> str: """Glossary of TestGen concepts, entity hierarchy, result statuses, and quality dimensions.""" return """\ diff --git a/testgen/mcp/tools/test_definitions.py b/testgen/mcp/tools/test_definitions.py index 7863c331..3c520fe1 100644 --- a/testgen/mcp/tools/test_definitions.py +++ b/testgen/mcp/tools/test_definitions.py @@ -4,6 +4,7 @@ from testgen.mcp.exceptions import MCPUserError from testgen.mcp.permissions import get_project_permissions, mcp_permission from testgen.mcp.tools.common import ( + VALID_DQ_DIMENSIONS, format_page_footer, format_page_info, parse_uuid, @@ -15,7 +16,6 @@ _VALID_SCOPES = {"column", "table", "referential", "custom"} _VALID_IMPACT_DIMENSIONS = {"Reliability", "Conformance", "Regularity", "Usability"} -_VALID_DQ_DIMENSIONS = {"Accuracy", "Completeness", "Consistency", "Recency", "Timeliness", "Uniqueness", "Validity"} @with_database_session @@ -303,8 +303,8 @@ def list_test_types( if impact_dimension and impact_dimension not in _VALID_IMPACT_DIMENSIONS: valid = ", ".join(sorted(_VALID_IMPACT_DIMENSIONS)) raise MCPUserError(f"Invalid impact_dimension `{impact_dimension}`. Valid values: {valid}") - if quality_dimension and quality_dimension not in _VALID_DQ_DIMENSIONS: - valid = ", ".join(sorted(_VALID_DQ_DIMENSIONS)) + if quality_dimension and quality_dimension not in VALID_DQ_DIMENSIONS: + valid = ", ".join(sorted(VALID_DQ_DIMENSIONS)) raise MCPUserError(f"Invalid quality_dimension `{quality_dimension}`. Valid values: {valid}") clauses = [TestType.active == "Y"] From a4c557eafc1487031a07358553c761971413a80b Mon Sep 17 00:00:00 2001 From: Luis Date: Tue, 5 May 2026 13:48:23 -0400 Subject: [PATCH 02/11] fix(ui): visual glitches in connections and test results --- .../frontend/js/pages/connections.js | 26 ++++++++++++++++--- testgen/ui/static/js/components/table.js | 5 ++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/testgen/ui/components/frontend/js/pages/connections.js b/testgen/ui/components/frontend/js/pages/connections.js index 09cb2950..26b63d6a 100644 --- a/testgen/ui/components/frontend/js/pages/connections.js +++ b/testgen/ui/components/frontend/js/pages/connections.js @@ -46,6 +46,21 @@ const Connections = (props) => { const updatedConnection = van.state(connection); const formState = van.state({dirty: false, valid: false}); + // Build the wizard once on first appearance and keep the same instance + // across reruns so internal state (currentStepIndex, form values) is not + // reset. Re-creating it would send the user back to step 0 every time + // Streamlit reruns (e.g. after PreviewTableGroupClicked). + let wizardComponent; + const buildWizard = () => TableGroupWizard({ emit, + project_code: van.derive(() => getValue(props.setup_wizard)?.project_code), + table_group: van.derive(() => getValue(props.setup_wizard)?.table_group), + table_group_preview: van.derive(() => getValue(props.setup_wizard)?.table_group_preview), + steps: van.derive(() => getValue(props.setup_wizard)?.steps), + dialog: van.derive(() => getValue(props.setup_wizard)?.dialog), + results: van.derive(() => getValue(props.setup_wizard)?.results), + standard_cron_sample: van.derive(() => getValue(props.setup_wizard)?.standard_cron_sample), + monitor_cron_sample: van.derive(() => getValue(props.setup_wizard)?.monitor_cron_sample), + }); return div( { id: wrapperId, 'data-testid': 'connections', class: 'flex-column fx-gap-4' }, @@ -110,9 +125,14 @@ const Connections = (props) => { }, ), () => { - const wizardData = getValue(props.setup_wizard); - if (!wizardData) return div(); - return TableGroupWizard(wizardData, emit); + // Once built, always return the same instance so the wizard stays + // mounted (its Dialog handles its own open/close via display:none). + // Returning a fresh node on rerun would unmount and break VanJS + // bindings inside the wizard. + if (!getValue(props.setup_wizard) && !wizardComponent) { + return ''; + } + return wizardComponent ??= buildWizard(); }, ); } diff --git a/testgen/ui/static/js/components/table.js b/testgen/ui/static/js/components/table.js index ac53acc1..8dbe9712 100644 --- a/testgen/ui/static/js/components/table.js +++ b/testgen/ui/static/js/components/table.js @@ -104,6 +104,11 @@ const Table = (options, rows) => { }); }); van.derive(() => { + // Depend on `rows` so this derive re-runs whenever the rows array changes. + // Without this, states added to `selectedRows` beyond its original length + // (e.g., when a filter expansion grows the rows list) have no listener + // registered here, and clicks on those new rows silently drop. + getValue(rows); const selectedRows_ = []; for (let i = 0; i < selectedRows.length; i++) { if (selectedRows[i].val) { From a11176426bbfc850be82b34f8c62299d32c52a6c Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 5 May 2026 00:13:00 -0400 Subject: [PATCH 03/11] fix(bigquery): error on freshness monitor when tolerance is null --- .../dbsetup_test_types/test_types_Freshness_Trend.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml index 1ad7fae4..ba60e7c5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml @@ -66,7 +66,7 @@ test_types: fingerprint AS result_measure, CASE -- Training mode: tolerances not yet calculated - WHEN {LOWER_TOLERANCE} IS NULL AND {UPPER_TOLERANCE} IS NULL THEN -1 + WHEN CAST({LOWER_TOLERANCE} AS FLOAT64) IS NULL AND CAST({UPPER_TOLERANCE} AS FLOAT64) IS NULL THEN -1 -- No change and excluded day: suppress WHEN fingerprint = '{BASELINE_VALUE}' AND {IS_EXCLUDED_DAY} = 1 THEN 1 -- No change, beyond time range (business time): LATE @@ -75,14 +75,14 @@ test_types: -- Table changed outside time range (business time): UNEXPECTED WHEN fingerprint <> '{BASELINE_VALUE}' AND NOT (interval_minutes - {EXCLUDED_MINUTES}) - BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN 0 + BETWEEN CAST({LOWER_TOLERANCE} AS FLOAT64) AND CAST({UPPER_TOLERANCE} AS FLOAT64) THEN 0 ELSE 1 END AS result_code, 'Table update detected: ' || CASE WHEN fingerprint <> '{BASELINE_VALUE}' THEN 'Yes' ELSE 'No' END || CASE - WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN '. On time.' - WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) < {LOWER_TOLERANCE} THEN '. Earlier than expected.' - WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) > {UPPER_TOLERANCE} THEN '. Later than expected.' + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) BETWEEN CAST({LOWER_TOLERANCE} AS FLOAT64) AND CAST({UPPER_TOLERANCE} AS FLOAT64) THEN '. On time.' + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) < CAST({LOWER_TOLERANCE} AS FLOAT64) THEN '. Earlier than expected.' + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) > CAST({UPPER_TOLERANCE} AS FLOAT64) THEN '. Later than expected.' WHEN fingerprint = '{BASELINE_VALUE}' AND {IS_EXCLUDED_DAY} = 0 AND (interval_minutes - {EXCLUDED_MINUTES}) > {THRESHOLD_VALUE} THEN '. Late.' ELSE '' END AS result_message, From 61f8d0eb74491ba4f5fa3510064176e1d92998d3 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 5 May 2026 00:28:04 -0400 Subject: [PATCH 04/11] fix(logs): suppress streamlit warnings --- testgen/__main__.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/testgen/__main__.py b/testgen/__main__.py index deffaeec..a2ac98da 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -1,3 +1,40 @@ +# Silence streamlit's "missing ScriptRunContext" / "No runtime found" / +# "Session state does not function" warnings, which fire whenever streamlit- +# decorated code runs outside an active script run (our CLI, scheduler, server, +# and any import that touches @st.cache_data). Must run before the first +# streamlit-using import, so it sits at the top of the module. +# +# We replace ``set_log_level`` itself, after seeding it to "error". Streamlit's +# own ``_update_logger`` callback fires on config parse and would otherwise +# downgrade us back to "info"; the cap floors any later call at ERROR. +def _silence_streamlit_logs() -> None: + import logging as _logging + + try: + from streamlit import logger as _st_logger + except ImportError: + return + + _original = _st_logger.set_log_level + _original("error") + + def _capped(level): + if isinstance(level, str): + try: + level_num = getattr(_logging, level.upper()) + except AttributeError: + _original(level) + return + else: + level_num = level + _original(max(level_num, _logging.ERROR)) + + _st_logger.set_log_level = _capped + + +_silence_streamlit_logs() + + import base64 import importlib import logging @@ -895,6 +932,7 @@ def init_ui(): "run", app_file, "--browser.gatherUsageStats=false", + f"--logger.level={'debug' if settings.IS_DEBUG else 'error'}", "--client.showErrorDetails=none", "--client.toolbarMode=minimal", "--server.enableStaticServing=true", From e91277e903c8f3dc7c067f868dc227560fa9beed Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 5 May 2026 13:09:29 -0400 Subject: [PATCH 05/11] fix(standalone): command argument precedence - add log path to config.env --- testgen/__main__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/testgen/__main__.py b/testgen/__main__.py index a2ac98da..1841636a 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -603,7 +603,7 @@ def generate_secret(length: int = 12) -> str: # Persist caller-supplied runtime overrides (ports, TLS) so they apply to # subsequent `testgen run-app` invocations. - persisted_env_vars = ("TG_UI_PORT", "TG_API_PORT", "SSL_CERT_FILE", "SSL_KEY_FILE") + persisted_env_vars = ("TG_UI_PORT", "TG_API_PORT", "TESTGEN_LOG_FILE_PATH", "SSL_CERT_FILE", "SSL_KEY_FILE") persisted_lines = [f"{name}={os.environ[name]}" for name in persisted_env_vars if os.environ.get(name)] if persisted_lines: config_lines.extend(["", "# Runtime overrides from installer", *persisted_lines]) @@ -611,6 +611,13 @@ def generate_secret(length: int = 12) -> str: config_path.write_text("\n".join(config_lines) + "\n") click.echo(f"Config written to {config_path}") + # `getenv` resolves env vars before config.env, so a pre-existing + # TESTGEN_USERNAME / TESTGEN_PASSWORD in the shell would override the + # CLI-supplied values and get seeded into the DB. Force the CLI args + # to win for the rest of this process. + os.environ["TESTGEN_USERNAME"] = username + os.environ["TESTGEN_PASSWORD"] = password + # Reload settings — the module was already evaluated at import time # before the config file existed. Reloading re-reads the new file # and re-evaluates all module-level variables. From e28d5a85e03124b1e198f86993523620ee50f3be Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 5 May 2026 17:17:06 -0400 Subject: [PATCH 06/11] fix(standalone): honor config.env for ports, hosts, and base URLs Several settings called os.getenv directly instead of the local getenv helper that also reads from ~/.testgen/config.env. As a result, values written by `testgen standalone-setup` (e.g. TG_UI_PORT) were silently ignored by `testgen run-app`. Also derive UI_BASE_URL from UI_PORT instead of Streamlit's STREAMLIT_SERVER_PORT, matching the API side. Co-Authored-By: Claude Opus 4.6 --- testgen/settings.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/testgen/settings.py b/testgen/settings.py index 0c69f600..94c4e002 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -474,7 +474,7 @@ def _ssl_files_present() -> bool: return bool(SSL_CERT_FILE) and os.path.isfile(SSL_CERT_FILE) and bool(SSL_KEY_FILE) and os.path.isfile(SSL_KEY_FILE) -_ui_tls_env = os.getenv("TG_UI_TLS_ENABLED", "").lower() +_ui_tls_env = getenv("TG_UI_TLS_ENABLED", "").lower() UI_TLS_ENABLED: bool = _ui_tls_env in ("yes", "true") if _ui_tls_env else _ssl_files_present() """ Enable TLS for the Streamlit UI server. @@ -484,7 +484,7 @@ def _ssl_files_present() -> bool: defaults to: auto-detect """ -_api_tls_env = os.getenv("TG_API_TLS_ENABLED", "").lower() +_api_tls_env = getenv("TG_API_TLS_ENABLED", "").lower() API_TLS_ENABLED: bool = _api_tls_env in ("yes", "true") if _api_tls_env else _ssl_files_present() """ Enable TLS for the API/MCP server (uvicorn). @@ -568,7 +568,7 @@ def _ssl_files_present() -> bool: defaults to: `yes` """ -UI_PORT: int = int(os.getenv("TG_UI_PORT", "8501")) +UI_PORT: int = int(getenv("TG_UI_PORT", "8501")) """ Port for the UI server. @@ -576,7 +576,7 @@ def _ssl_files_present() -> bool: defaults to: `8501` """ -API_PORT: int = int(os.getenv("TG_API_PORT", "8530")) +API_PORT: int = int(getenv("TG_API_PORT", "8530")) """ Port for the API server. @@ -584,7 +584,7 @@ def _ssl_files_present() -> bool: defaults to: `8530` """ -API_HOST: str = os.getenv("TG_API_HOST", "0.0.0.0") # noqa: S104 +API_HOST: str = getenv("TG_API_HOST", "0.0.0.0") # noqa: S104 """ Host for the API server. @@ -597,7 +597,7 @@ def _default_base_url() -> str: return f"{scheme}://localhost:{API_PORT}" -BASE_URL: str = os.getenv("TG_BASE_URL", "") or _default_base_url() +BASE_URL: str = getenv("TG_BASE_URL", "") or _default_base_url() """ Externally-reachable base URL for the API/MCP/OAuth server. @@ -608,14 +608,13 @@ def _default_base_url() -> str: def _default_ui_base_url() -> str: scheme = "https" if UI_TLS_ENABLED else "http" - port = os.getenv("STREAMLIT_SERVER_PORT", "8501") - return f"{scheme}://localhost:{port}" + return f"{scheme}://localhost:{UI_PORT}" -UI_BASE_URL: str = os.getenv("TG_UI_BASE_URL", "") or _default_ui_base_url() +UI_BASE_URL: str = getenv("TG_UI_BASE_URL", "") or _default_ui_base_url() """ Externally-reachable base URL for the Streamlit UI (used in email links, PDFs). from env variable: `TG_UI_BASE_URL` -defaults to: computed from UI_TLS_ENABLED and STREAMLIT_SERVER_PORT +defaults to: computed from UI_TLS_ENABLED and UI_PORT """ From 3c6d005ce6b86d6916695ac6fa4f6a98b71d09ff Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Mon, 4 May 2026 20:02:17 -0400 Subject: [PATCH 07/11] fix(mcp): hygiene issues review feedback (TG-1029) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round of review fixes applied on top of the initial implementation: * Drop the legacy `profile_run_id` from the MCP layer — model methods take a `job_execution_id` and JOIN `ProfilingRun` to filter. New `ProfilingRun.get_latest_complete_je_id_for_table_group` avoids reading the `table_groups.last_complete_profile_run_id` cache (which still points at the internal run PK; schema-level cleanup tracked separately). * Trust the run/JE backfill migration — drop `if x is None` guards and the descriptive-string fallback in `_resolve_profile_run`. * StrEnums for fixed string sets: `Disposition`, `IssueLikelihood`, `PiiRisk` (in `common/models/hygiene_issue.py`) and `QualityDimension` (new shared `common/enums.py`). Parsers return enum types; coalesce defaults reference enum members; sentinel sets dropped. * Tighten labels: heading uses "for profiling run"; `get_hygiene_issue` no longer triples "Run / Profiling Run / Run Date". * Empty states render with title + italic marker. * `dq_prevalence` removed from output and dataclasses (no precedent label, unhelpful score-engine internal). --- testgen/common/enums.py | 29 + testgen/common/models/hygiene_issue.py | 45 +- testgen/common/models/profiling_run.py | 17 + testgen/mcp/prompts/workflows.py | 5 +- testgen/mcp/tools/common.py | 96 ++- testgen/mcp/tools/hygiene_issues.py | 68 +- testgen/mcp/tools/reference.py | 37 +- testgen/mcp/tools/test_definitions.py | 25 +- tests/unit/mcp/test_model_hygiene_issue.py | 362 ++++++++ tests/unit/mcp/test_model_profiling_run.py | 65 ++ tests/unit/mcp/test_prompts_workflows.py | 55 ++ tests/unit/mcp/test_tools_common.py | 193 ++++- tests/unit/mcp/test_tools_hygiene_issues.py | 907 ++++++++++++++++++++ tests/unit/mcp/test_tools_reference.py | 83 +- 14 files changed, 1893 insertions(+), 94 deletions(-) create mode 100644 testgen/common/enums.py create mode 100644 tests/unit/mcp/test_model_hygiene_issue.py create mode 100644 tests/unit/mcp/test_model_profiling_run.py create mode 100644 tests/unit/mcp/test_prompts_workflows.py create mode 100644 tests/unit/mcp/test_tools_hygiene_issues.py diff --git a/testgen/common/enums.py b/testgen/common/enums.py new file mode 100644 index 00000000..94d08a37 --- /dev/null +++ b/testgen/common/enums.py @@ -0,0 +1,29 @@ +"""Shared enums used across multiple models, services, and surfaces. + +Add an enum here when its values are referenced by more than one model file or by +both the model layer and an outer surface (MCP, API, UI). Single-model enums live +in their model file. +""" +from enum import StrEnum + + +class QualityDimension(StrEnum): + """Stored ``dq_dimension`` values shared by ``profile_anomaly_types`` and ``test_types``. + Surfaced to users as "Quality Dimension".""" + ACCURACY = "Accuracy" + COMPLETENESS = "Completeness" + CONSISTENCY = "Consistency" + RECENCY = "Recency" + TIMELINESS = "Timeliness" + UNIQUENESS = "Uniqueness" + VALIDITY = "Validity" + + +class ImpactDimension(StrEnum): + """Stored ``impact_dimension`` values shared by ``profile_anomaly_types`` / + ``profile_anomaly_results`` and ``test_types``. The primary dimension breakdown + used by scorecards.""" + RELIABILITY = "Reliability" + CONFORMANCE = "Conformance" + REGULARITY = "Regularity" + USABILITY = "Usability" diff --git a/testgen/common/models/hygiene_issue.py b/testgen/common/models/hygiene_issue.py index a950c5a9..497c8180 100644 --- a/testgen/common/models/hygiene_issue.py +++ b/testgen/common/models/hygiene_issue.py @@ -2,6 +2,7 @@ from collections.abc import Iterable from dataclasses import dataclass from datetime import datetime +from enum import StrEnum from typing import Self from uuid import UUID, uuid4 @@ -21,6 +22,28 @@ PII_RISK_RE = re.compile(r"Risk: (MODERATE|HIGH),") +class Disposition(StrEnum): + """Stored disposition values for ``profile_anomaly_results.disposition`` and + ``test_results.disposition``. The user-facing label for ``INACTIVE`` is "Muted".""" + CONFIRMED = "Confirmed" + DISMISSED = "Dismissed" + INACTIVE = "Inactive" + + +class IssueLikelihood(StrEnum): + """Stored ``profile_anomaly_types.issue_likelihood`` values.""" + DEFINITE = "Definite" + LIKELY = "Likely" + POSSIBLE = "Possible" + POTENTIAL_PII = "Potential PII" + + +class PiiRisk(StrEnum): + """Risk level extracted from PII issue ``detail`` strings via ``priority`` hybrid.""" + HIGH = "High" + MODERATE = "Moderate" + + @dataclass class IssueLikelihoodCounts: """Counts of hygiene issues by likelihood category, with dismissed/inactive separated.""" @@ -51,6 +74,7 @@ class HygieneIssueListRow: schema_name: str table_name: str column_name: str + impact_dimension: str | None dq_dimension: str | None disposition: str priority: str | None @@ -72,6 +96,7 @@ class HygieneIssueSearchRow: schema_name: str table_name: str column_name: str + impact_dimension: str | None dq_dimension: str | None disposition: str priority: str | None @@ -92,7 +117,6 @@ class HygieneIssueDetail: schema_name: str table_name: str column_name: str - db_data_type: str | None dq_dimension: str | None impact_dimension: str | None disposition: str @@ -150,13 +174,12 @@ class HygieneIssue(Entity): schema_name: str = Column(String, nullable=False) table_name: str = Column(String, nullable=False) column_name: str = Column(String, nullable=False) - db_data_type: str = Column(String) detail: str = Column(String, nullable=False) disposition: str = Column(String) impact_dimension: str = Column(String) - # Unmapped: column_type, dq_prevalence. + # Unmapped: column_type, db_data_type, dq_prevalence. @hybrid_property def priority(self): @@ -237,12 +260,12 @@ def _priority_order(cls): @classmethod def list_for_run( cls, - profile_run_id: UUID, + job_execution_id: UUID, *clauses, page: int = 1, limit: int = 50, ) -> tuple[list[HygieneIssueListRow], int]: - """Paginated hygiene issues for a single profiling run. + """Paginated hygiene issues for a single profiling run, scoped by its job_execution_id. Caller-supplied ``*clauses`` carry every WHERE filter (project scoping, disposition, likelihood / pii_risk, table / column / dq_dimension / issue_type filters). @@ -255,14 +278,16 @@ def list_for_run( cls.schema_name.label("schema_name"), cls.table_name.label("table_name"), cls.column_name.label("column_name"), + cls.impact_dimension.label("impact_dimension"), HygieneIssueType.dq_dimension.label("dq_dimension"), - func.coalesce(cls.disposition, "Confirmed").label("disposition"), + func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"), cls.priority.label("priority"), cls.detail.label("detail"), HygieneIssueType.detail_redactable.label("detail_redactable"), ProfileResult.pii_flag.label("pii_flag"), ) .join(HygieneIssueType, HygieneIssueType.id == cls.type_id) + .join(ProfilingRun, ProfilingRun.id == cls.profile_run_id) .outerjoin( ProfileResult, and_( @@ -272,7 +297,7 @@ def list_for_run( ProfileResult.column_name == cls.column_name, ), ) - .where(cls.profile_run_id == profile_run_id, *clauses) + .where(ProfilingRun.job_execution_id == job_execution_id, *clauses) .order_by(cls._priority_order(), cls.table_name, cls.column_name, cls.id) ) return cls._paginate(query, page=page, limit=limit, data_class=HygieneIssueListRow) @@ -301,8 +326,9 @@ def search( cls.schema_name.label("schema_name"), cls.table_name.label("table_name"), cls.column_name.label("column_name"), + cls.impact_dimension.label("impact_dimension"), HygieneIssueType.dq_dimension.label("dq_dimension"), - func.coalesce(cls.disposition, "Confirmed").label("disposition"), + func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"), cls.priority.label("priority"), cls.detail.label("detail"), HygieneIssueType.detail_redactable.label("detail_redactable"), @@ -347,10 +373,9 @@ def get_with_context(cls, issue_id: UUID, *clauses) -> HygieneIssueDetail | None cls.schema_name.label("schema_name"), cls.table_name.label("table_name"), cls.column_name.label("column_name"), - cls.db_data_type.label("db_data_type"), HygieneIssueType.dq_dimension.label("dq_dimension"), cls.impact_dimension.label("impact_dimension"), - func.coalesce(cls.disposition, "Confirmed").label("disposition"), + func.coalesce(cls.disposition, Disposition.CONFIRMED).label("disposition"), cls.priority.label("priority"), cls.detail.label("detail"), HygieneIssueType.detail_redactable.label("detail_redactable"), diff --git a/testgen/common/models/profiling_run.py b/testgen/common/models/profiling_run.py index 18485527..95538203 100644 --- a/testgen/common/models/profiling_run.py +++ b/testgen/common/models/profiling_run.py @@ -162,6 +162,23 @@ def get_latest_run(cls, project_code: str) -> LatestProfilingRun | None: return LatestProfilingRun(str(result["id"]), result["run_time"]) return None + @classmethod + def get_latest_complete_je_id_for_table_group(cls, table_groups_id: UUID) -> UUID | None: + """Return the ``job_execution_id`` of the latest completed profiling run for a table group. + + Computed live from ``profiling_runs`` joined to ``job_executions`` — does not read the + legacy ``table_groups.last_complete_profile_run_id`` cache, which points at the internal + run PK rather than the JE id. + """ + query = ( + select(cls.job_execution_id) + .join(JobExecution, cls.job_execution_id == JobExecution.id) + .where(cls.table_groups_id == table_groups_id, JobExecution.status == JobStatus.COMPLETED) + .order_by(desc(JobExecution.started_at)) + .limit(1) + ) + return get_current_session().scalar(query) + @classmethod @st.cache_data(show_spinner=False, hash_funcs=ENTITY_HASH_FUNCS) def select_minimal_where( diff --git a/testgen/mcp/prompts/workflows.py b/testgen/mcp/prompts/workflows.py index 2317054b..4201bf75 100644 --- a/testgen/mcp/prompts/workflows.py +++ b/testgen/mcp/prompts/workflows.py @@ -95,7 +95,6 @@ def hygiene_triage(table_group_id: str | None = None) -> str: steps.append( "Call `get_data_inventory()` to see projects and table groups, with profiling status per group." ) - steps.append(f"Call `list_profiling_summaries(table_group_id={tg})` to see hygiene issue counts per run.") steps.append(f"Call `list_hygiene_issues(table_group_id={tg}, disposition='Confirmed')` for the issues to review.") steps.append( "For each top issue (ordered by priority), call `get_hygiene_issue(issue_id='...')` for full context — " @@ -103,8 +102,8 @@ def hygiene_triage(table_group_id: str | None = None) -> str: ) steps.append("For unfamiliar issue types, read `testgen://hygiene-issue-types` once for the reference table.") steps.append( - "For each issue: explain what was found, then ask the user whether to dismiss the issue " - "(call `update_hygiene_issue(issue_id='...', disposition='Dismissed')`) or investigate further." + "For each issue: explain what was found, then help the user decide a disposition — **Confirmed**, **Dismissed**, or **Muted**. " + "Apply via `update_hygiene_issue(issue_id='...', disposition='...')`, or leave it open for further investigation." ) steps.append("Summarize the triage results and any patterns noted across the issues.") diff --git a/testgen/mcp/tools/common.py b/testgen/mcp/tools/common.py index d6c2d3a3..9c0c3096 100644 --- a/testgen/mcp/tools/common.py +++ b/testgen/mcp/tools/common.py @@ -2,7 +2,8 @@ from uuid import UUID from testgen.common.date_service import parse_since -from testgen.common.models.hygiene_issue import HygieneIssueType +from testgen.common.enums import ImpactDimension, QualityDimension +from testgen.common.models.hygiene_issue import Disposition, HygieneIssueType, IssueLikelihood, PiiRisk from testgen.common.models.table_group import TableGroup from testgen.common.models.test_definition import TestType from testgen.common.models.test_result import TestResultStatus @@ -10,12 +11,15 @@ from testgen.mcp.exceptions import MCPResourceNotAccessible, MCPUserError from testgen.mcp.permissions import get_project_permissions -VALID_DQ_DIMENSIONS = {"Accuracy", "Completeness", "Consistency", "Recency", "Timeliness", "Uniqueness", "Validity"} -# DB stores "Inactive"; user-facing label is "Muted". -_DISPOSITION_USER_TO_DB = {"Confirmed": "Confirmed", "Dismissed": "Dismissed", "Muted": "Inactive"} -_DISPOSITION_DB_TO_USER = {v: k for k, v in _DISPOSITION_USER_TO_DB.items()} -_VALID_ISSUE_LIKELIHOODS = {"Definite", "Likely", "Possible"} -_VALID_PII_RISKS = {"High", "Moderate"} +# User-facing label for ``Disposition.INACTIVE`` is "Muted" — accept that label on input. +_DISPOSITION_USER_TO_DB: dict[str, Disposition] = { + "Confirmed": Disposition.CONFIRMED, + "Dismissed": Disposition.DISMISSED, + "Muted": Disposition.INACTIVE, +} +_DISPOSITION_DB_TO_USER: dict[Disposition, str] = {v: k for k, v in _DISPOSITION_USER_TO_DB.items()} +# Filter accepts only the regular likelihoods — PII rows are filtered separately via ``pii_risk``. +_FILTERABLE_LIKELIHOODS = frozenset({IssueLikelihood.DEFINITE, IssueLikelihood.LIKELY, IssueLikelihood.POSSIBLE}) def parse_uuid(value: str, label: str = "ID") -> UUID: @@ -50,44 +54,74 @@ def parse_since_arg(value: str, label: str = "since", *, today: date | None = No raise MCPUserError(f"Invalid `{label}`: {err}") from err -def parse_quality_dimension(value: str) -> str: - if value not in VALID_DQ_DIMENSIONS: - valid = ", ".join(sorted(VALID_DQ_DIMENSIONS)) - raise MCPUserError(f"Invalid quality_dimension `{value}`. Valid values: {valid}") - return value +def parse_impact_dimension(value: str) -> ImpactDimension: + try: + return ImpactDimension(value) + except ValueError as err: + valid = ", ".join(d.value for d in ImpactDimension) + raise MCPUserError(f"Invalid impact_dimension `{value}`. Valid values: {valid}") from err + +def parse_quality_dimension(value: str) -> QualityDimension: + try: + return QualityDimension(value) + except ValueError as err: + valid = ", ".join(d.value for d in QualityDimension) + raise MCPUserError(f"Invalid quality_dimension `{value}`. Valid values: {valid}") from err -def parse_disposition(value: str) -> str: - """Validate a user-facing disposition label and return the DB value. - Accepts ``Confirmed``, ``Dismissed``, ``Muted`` and returns ``Confirmed``, - ``Dismissed``, ``Inactive`` respectively (the DB encodes the legacy ``Inactive``). +def parse_disposition(value: str) -> Disposition: + """Validate a user-facing disposition label and return the stored ``Disposition``. + + Accepts ``Confirmed``, ``Dismissed``, ``Muted`` (user-facing labels). The DB encodes + ``INACTIVE`` for "Muted" — see ``Disposition``. """ - if value not in _DISPOSITION_USER_TO_DB: + db_value = _DISPOSITION_USER_TO_DB.get(value) + if db_value is None: valid = ", ".join(sorted(_DISPOSITION_USER_TO_DB)) raise MCPUserError(f"Invalid disposition `{value}`. Valid values: {valid}") - return _DISPOSITION_USER_TO_DB[value] - + return db_value -def format_disposition(value: str) -> str: - """Map a DB disposition value to its user-facing label (``Inactive`` → ``Muted``).""" - return _DISPOSITION_DB_TO_USER.get(value, value) - -def parse_issue_likelihood_list(values: list[str]) -> list[str]: - invalid = [v for v in values if v not in _VALID_ISSUE_LIKELIHOODS] +def format_disposition(value: Disposition | str) -> str: + """Map a stored disposition to its user-facing label (``INACTIVE`` → "Muted").""" + try: + return _DISPOSITION_DB_TO_USER[Disposition(value)] + except ValueError: + return str(value) + + +def parse_issue_likelihood_list(values: list[str]) -> list[IssueLikelihood]: + parsed: list[IssueLikelihood] = [] + invalid: list[str] = [] + for value in values: + try: + likelihood = IssueLikelihood(value) + except ValueError: + invalid.append(value) + continue + if likelihood not in _FILTERABLE_LIKELIHOODS: + invalid.append(value) + continue + parsed.append(likelihood) if invalid: - valid = ", ".join(sorted(_VALID_ISSUE_LIKELIHOODS)) + valid = ", ".join(sorted(v.value for v in _FILTERABLE_LIKELIHOODS)) raise MCPUserError(f"Invalid issue_likelihood values {invalid}. Valid values: {valid}") - return values + return parsed -def parse_pii_risk_list(values: list[str]) -> list[str]: - invalid = [v for v in values if v not in _VALID_PII_RISKS] +def parse_pii_risk_list(values: list[str]) -> list[PiiRisk]: + parsed: list[PiiRisk] = [] + invalid: list[str] = [] + for value in values: + try: + parsed.append(PiiRisk(value)) + except ValueError: + invalid.append(value) if invalid: - valid = ", ".join(sorted(_VALID_PII_RISKS)) + valid = ", ".join(r.value for r in PiiRisk) raise MCPUserError(f"Invalid pii_risk values {invalid}. Valid values: {valid}") - return values + return parsed def resolve_test_type(short_name: str) -> str: diff --git a/testgen/mcp/tools/hygiene_issues.py b/testgen/mcp/tools/hygiene_issues.py index 7101587a..fedabb61 100644 --- a/testgen/mcp/tools/hygiene_issues.py +++ b/testgen/mcp/tools/hygiene_issues.py @@ -5,7 +5,7 @@ from sqlalchemy.sql.functions import func from testgen.common.models import with_database_session -from testgen.common.models.hygiene_issue import HygieneIssue, HygieneIssueType +from testgen.common.models.hygiene_issue import Disposition, HygieneIssue, HygieneIssueType, IssueLikelihood, PiiRisk from testgen.common.models.job_execution import JobExecution from testgen.common.models.profiling_run import ProfilingRun from testgen.common.models.table_group import TableGroup @@ -17,6 +17,7 @@ format_page_footer, format_page_info, parse_disposition, + parse_impact_dimension, parse_issue_likelihood_list, parse_pii_risk_list, parse_quality_dimension, @@ -40,8 +41,8 @@ def _redact_detail(row, view_pii_codes: set[str]) -> str: def _build_likelihood_clause( - issue_likelihood: list[str] | None, - pii_risk: list[str] | None, + issue_likelihood: list[IssueLikelihood] | None, + pii_risk: list[PiiRisk] | None, ) -> ColumnElement[bool] | None: """Construct the WHERE clause for the likelihood / pii_risk filter pair. @@ -52,7 +53,7 @@ def _build_likelihood_clause( """ likelihood_clause = HygieneIssueType.likelihood.in_(issue_likelihood) if issue_likelihood else None pii_clause = ( - and_(HygieneIssueType.likelihood == "Potential PII", HygieneIssue.priority.in_(pii_risk)) + and_(HygieneIssueType.likelihood == IssueLikelihood.POTENTIAL_PII, HygieneIssue.priority.in_(pii_risk)) if pii_risk else None ) @@ -63,23 +64,22 @@ def _build_likelihood_clause( return or_(likelihood_clause, pii_clause) -def _resolve_profile_run( +def _resolve_profile_run_je_id( *, job_execution_id: str | None, table_group_id: str | None, -) -> tuple[UUID, str]: - """Resolve the scope to a (profile_run_id, run_id_label) tuple. +) -> UUID: + """Resolve the scope to the ``job_execution_id`` of a single profiling run. - Mutual-exclusion + at-least-one validation done by caller. Collapses missing - runs and inaccessible table groups into a single ``MCPResourceNotAccessible``. + Mutual-exclusion + at-least-one validation done by caller. Collapses missing runs + and inaccessible table groups into ``MCPResourceNotAccessible``. """ if table_group_id: tg = resolve_table_group(table_group_id) - if tg.last_complete_profile_run_id is None: + je_uuid = ProfilingRun.get_latest_complete_je_id_for_table_group(tg.id) + if je_uuid is None: raise MCPUserError(f"No completed profiling runs found for table group `{table_group_id}`.") - run = ProfilingRun.get(tg.last_complete_profile_run_id) - run_id_label = str(run.job_execution_id) if run and run.job_execution_id else f"latest run for table group `{table_group_id}`" - return tg.last_complete_profile_run_id, run_id_label + return je_uuid job_uuid = parse_uuid(job_execution_id, "job_execution_id") run = ProfilingRun.get_by_id_or_job(job_uuid) @@ -87,7 +87,7 @@ def _resolve_profile_run( tg = TableGroup.get(run.table_groups_id) if run else None if run is None or tg is None or not perms.has_access(tg.project_code): raise MCPResourceNotAccessible("Profiling run", job_execution_id) - return run.id, job_execution_id + return run.job_execution_id @with_database_session @@ -98,6 +98,7 @@ def list_hygiene_issues( table_group_id: str | None = None, table_name: str | None = None, column_name: str | None = None, + impact_dimension: str | None = None, quality_dimension: str | None = None, disposition: str | None = "Confirmed", issue_likelihood: list[str] | None = None, @@ -117,6 +118,8 @@ def list_hygiene_issues( Mutually exclusive with ``job_execution_id``. table_name: Filter by table name (exact match). column_name: Filter by column name (exact match). + impact_dimension: Filter by impact dimension ('Reliability', 'Conformance', + 'Regularity', 'Usability'). quality_dimension: Filter by data quality dimension ('Accuracy', 'Completeness', 'Consistency', 'Recency', 'Timeliness', 'Uniqueness', 'Validity'). disposition: Filter by disposition. Defaults to 'Confirmed'. @@ -125,7 +128,7 @@ def list_hygiene_issues( Providing this filter auto-excludes PII issues; combine with ``pii_risk`` to include both. pii_risk: Filter by PII risk level. Values: 'High', 'Moderate'. Providing this filter auto-excludes regular issues. - issue_type: Filter by hygiene issue type (e.g. 'Personally Identifiable Information'). + issue_type: Filter by hygiene issue type (e.g. 'Similar Values Match When Standardized'). See ``testgen://hygiene-issue-types`` for the full list. limit: Maximum number of issues per page (default 50, max 200). page: Page number, starting from 1 (default 1). @@ -138,7 +141,7 @@ def list_hygiene_issues( validate_limit(limit, 200) perms = get_project_permissions() - profile_run_id, run_id_label = _resolve_profile_run( + run_je_id = _resolve_profile_run_je_id( job_execution_id=job_execution_id, table_group_id=table_group_id, ) @@ -148,11 +151,13 @@ def list_hygiene_issues( clauses = [HygieneIssue.project_code.in_(perms.allowed_codes)] disposition_value = parse_disposition(disposition or "Confirmed") - clauses.append(func.coalesce(HygieneIssue.disposition, "Confirmed") == disposition_value) + clauses.append(func.coalesce(HygieneIssue.disposition, Disposition.CONFIRMED) == disposition_value) if table_name: clauses.append(HygieneIssue.table_name == table_name) if column_name: clauses.append(HygieneIssue.column_name == column_name) + if impact_dimension: + clauses.append(HygieneIssue.impact_dimension == parse_impact_dimension(impact_dimension)) if quality_dimension: clauses.append(HygieneIssueType.dq_dimension == parse_quality_dimension(quality_dimension)) if issue_type: @@ -161,10 +166,10 @@ def list_hygiene_issues( if likelihood_clause is not None: clauses.append(likelihood_clause) - rows, total = HygieneIssue.list_for_run(profile_run_id, *clauses, page=page, limit=limit) + rows, total = HygieneIssue.list_for_run(run_je_id, *clauses, page=page, limit=limit) doc = MdDoc() - doc.heading(1, f"Hygiene Issues for run `{run_id_label}`") + doc.heading(1, f"Hygiene Issues for profiling run `{run_je_id}`") if not rows: doc.text("_No hygiene issues match the supplied filters._") return doc.render() @@ -177,6 +182,8 @@ def list_hygiene_issues( doc.heading(2, f"[{r.priority or 'Unknown'}] {r.issue_type_name} on {location}") doc.field("Issue ID", r.id, code=True) doc.field("Issue Type", r.issue_type_name) + if r.impact_dimension: + doc.field("Impact Dimension", r.impact_dimension) if r.dq_dimension: doc.field("Quality Dimension", r.dq_dimension) doc.field("Disposition", format_disposition(r.disposition)) @@ -251,10 +258,10 @@ def get_hygiene_issue(*, issue_id: str) -> str: doc.field("Table", detail.table_name) if detail.column_name: doc.field("Column", detail.column_name) - if detail.dq_dimension: - doc.field("Quality Dimension", detail.dq_dimension) if detail.impact_dimension: doc.field("Impact Dimension", detail.impact_dimension) + if detail.dq_dimension: + doc.field("Quality Dimension", detail.dq_dimension) doc.field("Disposition", format_disposition(detail.disposition)) if detail.priority: doc.field("Priority", detail.priority) @@ -287,9 +294,8 @@ def get_hygiene_issue(*, issue_id: str) -> str: doc.field("Distinct values", detail.column_distinct_value_ct) doc.heading(2, "Profiling Run") - if detail.job_execution_id: - doc.field("Run", detail.job_execution_id, code=True) - doc.field("Run Date", detail.started_at) + doc.field("ID", detail.job_execution_id, code=True) + doc.field("Date", detail.started_at) return doc.render() @@ -301,6 +307,7 @@ def search_hygiene_issues( project_code: str | None = None, table_group_id: str | None = None, issue_type: str | None = None, + impact_dimension: str | None = None, quality_dimension: str | None = None, disposition: str | None = "Confirmed", issue_likelihood: list[str] | None = None, @@ -318,8 +325,10 @@ def search_hygiene_issues( Args: project_code: Scope to a specific project. table_group_id: UUID of a table group to scope to. - issue_type: Filter by hygiene issue type (e.g. 'Personally Identifiable Information'). + issue_type: Filter by hygiene issue type (e.g. 'Similar Values Match When Standardized'). See ``testgen://hygiene-issue-types`` for the full list. + impact_dimension: Filter by impact dimension ('Reliability', 'Conformance', + 'Regularity', 'Usability'). quality_dimension: Filter by data quality dimension ('Accuracy', 'Completeness', 'Consistency', 'Recency', 'Timeliness', 'Uniqueness', 'Validity'). disposition: Filter by disposition. Defaults to 'Confirmed'. @@ -353,10 +362,12 @@ def search_hygiene_issues( clauses = [ HygieneIssue.project_code.in_(project_codes), - func.coalesce(HygieneIssue.disposition, "Confirmed") == parse_disposition(disposition or "Confirmed"), + func.coalesce(HygieneIssue.disposition, Disposition.CONFIRMED) == parse_disposition(disposition or "Confirmed"), ] if table_group_uuid is not None: clauses.append(HygieneIssue.table_groups_id == table_group_uuid) + if impact_dimension: + clauses.append(HygieneIssue.impact_dimension == parse_impact_dimension(impact_dimension)) if quality_dimension: clauses.append(HygieneIssueType.dq_dimension == parse_quality_dimension(quality_dimension)) if type_id: @@ -388,9 +399,10 @@ def search_hygiene_issues( doc.field("Issue ID", r.id, code=True) doc.field("Issue Type", r.issue_type_name) doc.field("Table Group", r.table_groups_name) - if r.job_execution_id: - doc.field("Profiling Run", r.job_execution_id, code=True) + doc.field("Profiling Run", r.job_execution_id, code=True) doc.field("Run Date", r.started_at) + if r.impact_dimension: + doc.field("Impact Dimension", r.impact_dimension) if r.dq_dimension: doc.field("Quality Dimension", r.dq_dimension) if r.priority: diff --git a/testgen/mcp/tools/reference.py b/testgen/mcp/tools/reference.py index 23217cdd..ca38634f 100644 --- a/testgen/mcp/tools/reference.py +++ b/testgen/mcp/tools/reference.py @@ -92,9 +92,9 @@ def hygiene_issue_types_resource() -> str: doc = MdDoc() doc.heading(1, "TestGen Hygiene Issue Types Reference") doc.table( - headers=["Issue Type", "Quality Dimension", "Impact Dimension", "Description", "Suggested Action"], + headers=["Issue Type", "Impact Dimension", "Quality Dimension", "Likelihood", "Description", "Suggested Action"], rows=[ - [it.name, it.dq_dimension, it.impact_dimension, it.description, it.suggested_action] + [it.name, it.impact_dimension, it.dq_dimension, it.likelihood, it.description, it.suggested_action] for it in issue_types ], ) @@ -112,6 +112,8 @@ def glossary_resource() -> str: - **Project** — Top-level organizational unit. - **Connection** — Database connection configuration (host, credentials). - **Table Group** — A set of tables within a schema that are profiled and tested together. +- **Profiling Run** — A scan of a table group that produces column-level statistics and detects hygiene issues. +- **Hygiene Issue** — A potential data-quality concern detected by a profiling run (e.g. PII columns, non-standard blanks, mixed types). - **Test Suite** — A collection of test definitions scoped to a table group. - **Test Definition** — A configured test with parameters, thresholds, and target table/column. - **Test Run** — An execution of a test suite producing test results. @@ -125,22 +127,41 @@ def glossary_resource() -> str: - **Error** — Test could not execute (e.g., missing table or permission issue). - **Log** — Informational result recorded for reference. +## Hygiene Issue Likelihood + +How likely the issue is to indicate a real data quality problem. +- **Definite** — Strong signal; almost always a real issue worth fixing. +- **Likely** — Probable issue; review recommended. +- **Possible** — Weaker signal; confirm against the data. + +PII issues use their own classification: hygiene issues that flag potential personally identifiable information are categorized by **PII Risk** (**High** or **Moderate**) instead of the likelihoods above. + ## Disposition -Disposition is a user-assigned review status for test results: -- **Confirmed** (default) — Result is valid and counts toward scoring. -- **Dismissed** — Result reviewed and dismissed (excluded from scoring). -- **Muted** — Test was deactivated after this result (excluded from scoring). +Disposition is a user-assigned review status for both test results and hygiene issues: +- **Confirmed** (default) — Valid finding; counts toward scoring. +- **Dismissed** — Reviewed and dismissed; excluded from scoring. +- **Muted** — Acknowledged but suppressed; excluded from scoring. (For test results, this means the test was deactivated after the result.) -## Data Quality Dimensions +## Quality Dimensions +What aspect of data quality the test or hygiene issue measures. - **Accuracy** — Data values are correct and reflect real-world truth. - **Completeness** — Required data is present (no unexpected NULLs or blanks). - **Consistency** — Data agrees across columns, tables, or systems. -- **Timeliness** — Data is current and updated within expected windows. +- **Recency** — Data values themselves reflect recent points in time (e.g. dates in the data fall within expected windows). +- **Timeliness** — Data is updated on the expected schedule (no stale tables, expected refresh cadence). - **Uniqueness** — No unintended duplicates exist. - **Validity** — Data conforms to expected formats, ranges, or patterns. +## Impact Dimensions + +What's at stake when the data has issues — the primary breakdown used by scorecards. +- **Reliability** — Data is available and correct when needed. +- **Conformance** — Data meets contracts, formats, and reference standards. +- **Regularity** — Data arrives on schedule and stays structurally consistent. +- **Usability** — Data is shaped so consumers can work with it efficiently. + ## Test Scopes - **column** — Tests a single column (e.g., null rate, pattern match). diff --git a/testgen/mcp/tools/test_definitions.py b/testgen/mcp/tools/test_definitions.py index 3c520fe1..4365ee09 100644 --- a/testgen/mcp/tools/test_definitions.py +++ b/testgen/mcp/tools/test_definitions.py @@ -1,12 +1,14 @@ +from testgen.common.enums import ImpactDimension, QualityDimension from testgen.common.models import with_database_session from testgen.common.models.test_definition import TestDefinition, TestDefinitionNote, TestDefinitionSummary, TestType from testgen.common.models.test_result import TestResult from testgen.mcp.exceptions import MCPUserError from testgen.mcp.permissions import get_project_permissions, mcp_permission from testgen.mcp.tools.common import ( - VALID_DQ_DIMENSIONS, format_page_footer, format_page_info, + parse_impact_dimension, + parse_quality_dimension, parse_uuid, resolve_test_type, validate_limit, @@ -15,7 +17,6 @@ from testgen.mcp.tools.markdown import MdDoc _VALID_SCOPES = {"column", "table", "referential", "custom"} -_VALID_IMPACT_DIMENSIONS = {"Reliability", "Conformance", "Regularity", "Usability"} @with_database_session @@ -300,20 +301,20 @@ def list_test_types( if scope and scope not in _VALID_SCOPES: valid = ", ".join(sorted(_VALID_SCOPES)) raise MCPUserError(f"Invalid scope `{scope}`. Valid values: {valid}") - if impact_dimension and impact_dimension not in _VALID_IMPACT_DIMENSIONS: - valid = ", ".join(sorted(_VALID_IMPACT_DIMENSIONS)) - raise MCPUserError(f"Invalid impact_dimension `{impact_dimension}`. Valid values: {valid}") - if quality_dimension and quality_dimension not in VALID_DQ_DIMENSIONS: - valid = ", ".join(sorted(VALID_DQ_DIMENSIONS)) - raise MCPUserError(f"Invalid quality_dimension `{quality_dimension}`. Valid values: {valid}") + impact_dimension_enum: ImpactDimension | None = ( + parse_impact_dimension(impact_dimension) if impact_dimension else None + ) + quality_dimension_enum: QualityDimension | None = ( + parse_quality_dimension(quality_dimension) if quality_dimension else None + ) clauses = [TestType.active == "Y"] if scope: clauses.append(TestType.test_scope == scope) - if impact_dimension: - clauses.append(TestType.impact_dimension == impact_dimension) - if quality_dimension: - clauses.append(TestType.dq_dimension == quality_dimension) + if impact_dimension_enum is not None: + clauses.append(TestType.impact_dimension == impact_dimension_enum) + if quality_dimension_enum is not None: + clauses.append(TestType.dq_dimension == quality_dimension_enum) test_types = TestType.select_where(*clauses) diff --git a/tests/unit/mcp/test_model_hygiene_issue.py b/tests/unit/mcp/test_model_hygiene_issue.py new file mode 100644 index 00000000..bf90e2b6 --- /dev/null +++ b/tests/unit/mcp/test_model_hygiene_issue.py @@ -0,0 +1,362 @@ +from datetime import datetime +from unittest.mock import MagicMock, patch +from uuid import uuid4 + +import pytest +from sqlalchemy.dialects import postgresql + +from testgen.common.models.hygiene_issue import ( + HygieneIssue, + HygieneIssueDetail, + HygieneIssueListRow, + HygieneIssueSearchRow, + HygieneIssueType, +) + + +@pytest.fixture +def session_mock(): + with ( + patch("testgen.common.models.hygiene_issue.get_current_session") as hi_mock, + patch("testgen.common.models.entity.get_current_session") as entity_mock, + ): + entity_mock.return_value = hi_mock.return_value + yield hi_mock.return_value + + +def _compiled_sql(captured_query, literal_binds: bool = False) -> str: + compile_kwargs = {"literal_binds": True} if literal_binds else {} + return str(captured_query.compile(dialect=postgresql.dialect(), compile_kwargs=compile_kwargs)) + + +def _all_compiled_sql(session_mock) -> str: + """Concatenate all queries seen by session.scalar + session.execute. ``_paginate`` issues + both a count and a fetch; the WHERE/JOIN clauses live in both.""" + queries = [call[0][0] for call in session_mock.scalar.call_args_list] + queries += [call[0][0] for call in session_mock.execute.call_args_list] + return "\n".join(_compiled_sql(q) for q in queries) + + +def _list_row_mapping(**overrides): + base = { + "id": uuid4(), + "project_code": "demo", + "issue_type_name": "Non-Standard Blank Values", + "schema_name": "demo", + "table_name": "orders", + "column_name": "frame_size", + "impact_dimension": "Usability", + "dq_dimension": "Completeness", + "disposition": "Confirmed", + "priority": "Definite", + "detail": "…", + "detail_redactable": False, + "pii_flag": None, + } + base.update(overrides) + return base + + +def _search_row_mapping(**overrides): + base = _list_row_mapping() + base.update({ + "table_groups_name": "default", + "job_execution_id": uuid4(), + "started_at": datetime(2026, 5, 1), + }) + base.update(overrides) + return base + + +def _detail_row_mapping(**overrides): + base = _list_row_mapping() + base.update({ + "type_description": "type description", + "suggested_action": "suggested action", + "job_execution_id": uuid4(), + "started_at": datetime(2026, 5, 1), + "column_general_type": "A", + "column_db_data_type": "varchar(50)", + "column_record_ct": 100, + "column_null_value_ct": 5, + "column_distinct_value_ct": 50, + }) + base.update(overrides) + return base + + +def _stub_paginate(session_mock, *, total=0, rows=()): + session_mock.scalar.return_value = total + session_mock.execute.return_value.mappings.return_value.all.return_value = list(rows) + + +# --------------------------------------------------------------------------- +# HygieneIssue.list_for_run +# --------------------------------------------------------------------------- + + +def test_list_for_run_filters_by_je_id_not_run_pk(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.list_for_run(uuid4()) + + sql = _all_compiled_sql(session_mock) + assert "profiling_runs.job_execution_id =" in sql + # The legacy run PK must NOT be the filter: + assert "profile_anomaly_results.profile_run_id =" not in sql + + +def test_list_for_run_joins_profile_anomaly_types_inner(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.list_for_run(uuid4()) + + sql = _all_compiled_sql(session_mock) + assert "JOIN profile_anomaly_types" in sql + + +def test_list_for_run_joins_profile_results_outer(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.list_for_run(uuid4()) + + sql = _all_compiled_sql(session_mock) + assert "LEFT OUTER JOIN profile_results" in sql + # Composite join condition guards against profile-result drift: + assert "profile_results.schema_name = profile_anomaly_results.schema_name" in sql + assert "profile_results.table_name = profile_anomaly_results.table_name" in sql + assert "profile_results.column_name = profile_anomaly_results.column_name" in sql + + +def test_list_for_run_orders_by_priority_then_table_then_column_then_id(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.list_for_run(uuid4()) + + sql = _all_compiled_sql(session_mock) + # Priority CASE comes first; cls.id last for stable pagination. + assert "CASE" in sql + assert "ORDER BY" in sql + assert "profile_anomaly_results.table_name" in sql + assert "profile_anomaly_results.column_name" in sql + assert "profile_anomaly_results.id" in sql + + +def test_list_for_run_caller_clauses_appended_to_where(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.list_for_run( + uuid4(), + HygieneIssue.table_name == "orders", + HygieneIssue.project_code.in_(["demo"]), + ) + + sql = _all_compiled_sql(session_mock) + assert "profile_anomaly_results.table_name =" in sql + assert "profile_anomaly_results.project_code IN" in sql + + +def test_list_for_run_coalesces_disposition(session_mock): + """The SELECT list coalesces disposition so NULL rows render as the default.""" + _stub_paginate(session_mock) + + HygieneIssue.list_for_run(uuid4()) + + sql = _all_compiled_sql(session_mock) + assert "coalesce(profile_anomaly_results.disposition" in sql + + +def test_list_for_run_returns_paginated_tuple(session_mock): + rows = [_list_row_mapping(), _list_row_mapping()] + _stub_paginate(session_mock, total=42, rows=rows) + + result_rows, total = HygieneIssue.list_for_run(uuid4()) + + assert total == 42 + assert len(result_rows) == 2 + assert all(isinstance(r, HygieneIssueListRow) for r in result_rows) + + +# --------------------------------------------------------------------------- +# HygieneIssue.search +# --------------------------------------------------------------------------- + + +def test_search_joins_table_group_inner(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.search() + + sql = _all_compiled_sql(session_mock) + assert "JOIN table_groups" in sql + + +def test_search_joins_job_execution_outer(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.search() + + sql = _all_compiled_sql(session_mock) + assert "LEFT OUTER JOIN job_executions" in sql + + +def test_search_joins_profile_results_outer(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.search() + + sql = _all_compiled_sql(session_mock) + assert "LEFT OUTER JOIN profile_results" in sql + + +def test_search_orders_by_started_at_desc_then_priority_then_id(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.search() + + sql = _all_compiled_sql(session_mock) + assert "ORDER BY" in sql + assert "job_executions.started_at DESC" in sql + assert "profile_anomaly_results.id" in sql + + +def test_search_caller_clauses_propagate(session_mock): + _stub_paginate(session_mock) + + HygieneIssue.search( + HygieneIssue.project_code.in_(["demo"]), + HygieneIssue.table_name == "orders", + ) + + sql = _all_compiled_sql(session_mock) + assert "profile_anomaly_results.project_code IN" in sql + assert "profile_anomaly_results.table_name =" in sql + + +def test_search_returns_paginated_tuple(session_mock): + rows = [_search_row_mapping()] + _stub_paginate(session_mock, total=7, rows=rows) + + result_rows, total = HygieneIssue.search() + + assert total == 7 + assert len(result_rows) == 1 + assert isinstance(result_rows[0], HygieneIssueSearchRow) + + +# --------------------------------------------------------------------------- +# HygieneIssue.get_with_context +# --------------------------------------------------------------------------- + + +def test_get_with_context_filters_by_id_and_extra_clauses(session_mock): + session_mock.execute.return_value.mappings.return_value.first.return_value = None + + HygieneIssue.get_with_context(uuid4(), HygieneIssue.project_code.in_(["demo"])) + + sql = _compiled_sql(session_mock.execute.call_args[0][0]) + assert "profile_anomaly_results.id =" in sql + assert "profile_anomaly_results.project_code IN" in sql + + +def test_get_with_context_outer_joins_profile_results(session_mock): + session_mock.execute.return_value.mappings.return_value.first.return_value = None + + HygieneIssue.get_with_context(uuid4()) + + sql = _compiled_sql(session_mock.execute.call_args[0][0]) + assert "LEFT OUTER JOIN profile_results" in sql + + +def test_get_with_context_outer_joins_job_execution(session_mock): + session_mock.execute.return_value.mappings.return_value.first.return_value = None + + HygieneIssue.get_with_context(uuid4()) + + sql = _compiled_sql(session_mock.execute.call_args[0][0]) + assert "LEFT OUTER JOIN job_executions" in sql + + +def test_get_with_context_returns_none_when_no_row(session_mock): + session_mock.execute.return_value.mappings.return_value.first.return_value = None + + result = HygieneIssue.get_with_context(uuid4()) + + assert result is None + + +def test_get_with_context_returns_dataclass_when_row_present(session_mock): + session_mock.execute.return_value.mappings.return_value.first.return_value = _detail_row_mapping() + + result = HygieneIssue.get_with_context(uuid4()) + + assert isinstance(result, HygieneIssueDetail) + assert result.issue_type_name == "Non-Standard Blank Values" + + +# --------------------------------------------------------------------------- +# HygieneIssue.update_disposition +# --------------------------------------------------------------------------- + + +def test_update_disposition_compiles_to_update_with_id_and_clauses(session_mock): + session_mock.execute.return_value.rowcount = 1 + + HygieneIssue.update_disposition( + uuid4(), + "Dismissed", + HygieneIssue.project_code.in_(["demo"]), + ) + + sql = _compiled_sql(session_mock.execute.call_args[0][0]) + assert "UPDATE profile_anomaly_results" in sql + assert "SET disposition" in sql + assert "profile_anomaly_results.id =" in sql + # Project-scope clause must be in the WHERE — write-side authorization. + assert "profile_anomaly_results.project_code IN" in sql + + +def test_update_disposition_returns_true_when_rowcount_positive(session_mock): + session_mock.execute.return_value.rowcount = 1 + + assert HygieneIssue.update_disposition(uuid4(), "Dismissed") is True + + +def test_update_disposition_returns_false_when_rowcount_zero(session_mock): + session_mock.execute.return_value.rowcount = 0 + + assert HygieneIssue.update_disposition(uuid4(), "Dismissed") is False + + +# --------------------------------------------------------------------------- +# HygieneIssueType.select_where +# --------------------------------------------------------------------------- + + +def test_hygiene_issue_type_select_where_no_clauses_returns_all(session_mock): + session_mock.scalars.return_value = [MagicMock(), MagicMock()] + + result = HygieneIssueType.select_where() + + assert len(result) == 2 + sql = _compiled_sql(session_mock.scalars.call_args[0][0]) + assert "SELECT" in sql.upper() + assert "profile_anomaly_types" in sql + + +def test_hygiene_issue_type_select_where_with_order_by(session_mock): + session_mock.scalars.return_value = [] + + HygieneIssueType.select_where(order_by=(HygieneIssueType.name,)) + + sql = _compiled_sql(session_mock.scalars.call_args[0][0]) + assert "ORDER BY profile_anomaly_types.anomaly_name" in sql + + +def test_hygiene_issue_type_select_where_filter_clause(session_mock): + session_mock.scalars.return_value = [] + + HygieneIssueType.select_where(HygieneIssueType.name == "Some Type") + + sql = _compiled_sql(session_mock.scalars.call_args[0][0]) + assert "profile_anomaly_types.anomaly_name =" in sql diff --git a/tests/unit/mcp/test_model_profiling_run.py b/tests/unit/mcp/test_model_profiling_run.py new file mode 100644 index 00000000..94d88b1b --- /dev/null +++ b/tests/unit/mcp/test_model_profiling_run.py @@ -0,0 +1,65 @@ +from unittest.mock import patch +from uuid import uuid4 + +import pytest +from sqlalchemy.dialects import postgresql + +from testgen.common.models.profiling_run import ProfilingRun + + +@pytest.fixture +def session_mock(): + with patch("testgen.common.models.profiling_run.get_current_session") as mock: + yield mock.return_value + + +def _compiled_sql(captured_query) -> str: + return str(captured_query.compile(dialect=postgresql.dialect())) + + +def test_get_latest_complete_je_id_returns_je_id_when_present(session_mock): + je_id = uuid4() + session_mock.scalar.return_value = je_id + + result = ProfilingRun.get_latest_complete_je_id_for_table_group(uuid4()) + + assert result == je_id + + +def test_get_latest_complete_je_id_returns_none_when_no_runs(session_mock): + session_mock.scalar.return_value = None + + result = ProfilingRun.get_latest_complete_je_id_for_table_group(uuid4()) + + assert result is None + + +def test_get_latest_complete_je_id_filters_to_completed(session_mock): + session_mock.scalar.return_value = None + + ProfilingRun.get_latest_complete_je_id_for_table_group(uuid4()) + + sql = _compiled_sql(session_mock.scalar.call_args[0][0]) + assert "job_executions.status =" in sql + + +def test_get_latest_complete_je_id_orders_desc_limit_1(session_mock): + session_mock.scalar.return_value = None + + ProfilingRun.get_latest_complete_je_id_for_table_group(uuid4()) + + sql = _compiled_sql(session_mock.scalar.call_args[0][0]) + assert "ORDER BY job_executions.started_at DESC" in sql + assert "LIMIT" in sql + + +def test_get_latest_complete_je_id_selects_je_id_not_run_pk(session_mock): + """Pin the docstring contract — does NOT read ``table_groups.last_complete_profile_run_id``, + selects the JE id directly from ``profiling_runs``.""" + session_mock.scalar.return_value = None + + ProfilingRun.get_latest_complete_je_id_for_table_group(uuid4()) + + sql = _compiled_sql(session_mock.scalar.call_args[0][0]) + assert "SELECT profiling_runs.job_execution_id" in sql + assert "table_groups.last_complete_profile_run_id" not in sql diff --git a/tests/unit/mcp/test_prompts_workflows.py b/tests/unit/mcp/test_prompts_workflows.py new file mode 100644 index 00000000..ac52520a --- /dev/null +++ b/tests/unit/mcp/test_prompts_workflows.py @@ -0,0 +1,55 @@ +from testgen.mcp.prompts.workflows import hygiene_triage + + +def test_hygiene_triage_with_table_group_id_interpolates_id(): + tg_id = "abc-123-def" + result = hygiene_triage(table_group_id=tg_id) + + assert tg_id in result + assert f"table_group_id='{tg_id}'" in result + # Without the inventory step: + assert "get_data_inventory" not in result + assert "Pick a table group" not in result + assert "Focus on table group" in result + + +def test_hygiene_triage_without_table_group_id_uses_placeholder(): + result = hygiene_triage() + + assert "Pick a table group" in result + assert "get_data_inventory" in result + # With the discovery step we expect the placeholder form for the call examples: + assert "table_group_id='...'" in result + + +def test_hygiene_triage_steps_numbered_consecutively_with_id(): + result = hygiene_triage(table_group_id="abc") + + # With table_group_id supplied, the inventory step is omitted → 5 steps numbered 1-5. + for n in (1, 2, 3, 4, 5): + assert f"{n}. " in result + assert "6. " not in result + + +def test_hygiene_triage_steps_numbered_consecutively_without_id(): + result = hygiene_triage() + + # Without table_group_id, the inventory step is included → 6 steps numbered 1-6. + for n in (1, 2, 3, 4, 5, 6): + assert f"{n}. " in result + assert "7. " not in result + + +def test_hygiene_triage_offers_all_three_dispositions(): + """The prompt should not push 'Dismissed' as the action — review feedback.""" + result = hygiene_triage() + + assert "Confirmed" in result + assert "Dismissed" in result + assert "Muted" in result + assert "update_hygiene_issue" in result + + +def test_hygiene_triage_references_resource_for_unfamiliar_types(): + result = hygiene_triage() + assert "testgen://hygiene-issue-types" in result diff --git a/tests/unit/mcp/test_tools_common.py b/tests/unit/mcp/test_tools_common.py index 8aaf8865..0e5e1b03 100644 --- a/tests/unit/mcp/test_tools_common.py +++ b/tests/unit/mcp/test_tools_common.py @@ -1,10 +1,25 @@ +from unittest.mock import MagicMock, patch from uuid import UUID import pytest +from testgen.common.enums import ImpactDimension, QualityDimension +from testgen.common.models.hygiene_issue import Disposition, IssueLikelihood, PiiRisk from testgen.common.models.test_result import TestResultStatus from testgen.mcp.exceptions import MCPUserError -from testgen.mcp.tools.common import parse_result_status, parse_uuid, validate_limit, validate_page +from testgen.mcp.tools.common import ( + format_disposition, + parse_disposition, + parse_impact_dimension, + parse_issue_likelihood_list, + parse_pii_risk_list, + parse_quality_dimension, + parse_result_status, + parse_uuid, + resolve_issue_type, + validate_limit, + validate_page, +) # --- parse_uuid --- @@ -88,3 +103,179 @@ def test_validate_limit_rejects_out_of_range(bad): def test_validate_limit_message_includes_max(): with pytest.raises(MCPUserError, match="between 1 and 200"): validate_limit(0, 200) + + +# --- parse_disposition / format_disposition --- + + +@pytest.mark.parametrize( + "user_label,expected", + [ + ("Confirmed", Disposition.CONFIRMED), + ("Dismissed", Disposition.DISMISSED), + ("Muted", Disposition.INACTIVE), + ], +) +def test_parse_disposition_user_labels_to_db_value(user_label, expected): + assert parse_disposition(user_label) is expected + + +def test_parse_disposition_rejects_db_value_inactive(): + """``Inactive`` is the DB value, not user-facing — accepting it would create two + spellings for the same disposition.""" + with pytest.raises(MCPUserError, match="Invalid disposition"): + parse_disposition("Inactive") + + +def test_parse_disposition_rejects_unknown_lists_valid_values(): + with pytest.raises(MCPUserError, match="Valid values:") as exc_info: + parse_disposition("Bogus") + msg = str(exc_info.value) + assert "Confirmed" in msg + assert "Dismissed" in msg + assert "Muted" in msg + + +def test_parse_disposition_case_sensitive(): + with pytest.raises(MCPUserError): + parse_disposition("confirmed") + + +@pytest.mark.parametrize( + "db_value,expected", + [ + (Disposition.CONFIRMED, "Confirmed"), + (Disposition.DISMISSED, "Dismissed"), + (Disposition.INACTIVE, "Muted"), + ], +) +def test_format_disposition_db_to_user_label(db_value, expected): + assert format_disposition(db_value) == expected + + +def test_format_disposition_accepts_string_form(): + """Coalesce on the column produces a plain string at runtime — both forms must work.""" + assert format_disposition("Inactive") == "Muted" + assert format_disposition("Confirmed") == "Confirmed" + + +def test_format_disposition_unknown_falls_through_to_string(): + assert format_disposition("WhoKnows") == "WhoKnows" + + +# --- parse_impact_dimension --- + + +@pytest.mark.parametrize("value", [d.value for d in ImpactDimension]) +def test_parse_impact_dimension_valid(value): + assert parse_impact_dimension(value) == ImpactDimension(value) + + +def test_parse_impact_dimension_invalid_lists_valid_values(): + with pytest.raises(MCPUserError, match="Invalid impact_dimension") as exc_info: + parse_impact_dimension("BadDim") + msg = str(exc_info.value) + for d in ImpactDimension: + assert d.value in msg + + +# --- parse_quality_dimension --- + + +@pytest.mark.parametrize("value", [d.value for d in QualityDimension]) +def test_parse_quality_dimension_valid(value): + assert parse_quality_dimension(value) == QualityDimension(value) + + +def test_parse_quality_dimension_includes_recency(): + """Recency was added during the TG-1029 enum migration; pin it as a valid value.""" + assert parse_quality_dimension("Recency") == QualityDimension.RECENCY + + +def test_parse_quality_dimension_invalid_lists_valid_values(): + with pytest.raises(MCPUserError, match="Invalid quality_dimension") as exc_info: + parse_quality_dimension("BadDim") + msg = str(exc_info.value) + for d in QualityDimension: + assert d.value in msg + + +# --- parse_issue_likelihood_list --- + + +def test_parse_issue_likelihood_list_accepts_three_filterable_values(): + result = parse_issue_likelihood_list(["Definite", "Likely", "Possible"]) + assert result == [IssueLikelihood.DEFINITE, IssueLikelihood.LIKELY, IssueLikelihood.POSSIBLE] + + +def test_parse_issue_likelihood_list_rejects_potential_pii(): + """``Potential PII`` is a valid IssueLikelihood enum value but NOT a valid filter input — + PII issues are filtered separately via ``pii_risk``. Locking this prevents a future + 'fix' that allows the full enum and breaks the auto-exclude API contract.""" + with pytest.raises(MCPUserError, match="Invalid issue_likelihood"): + parse_issue_likelihood_list(["Potential PII"]) + + +def test_parse_issue_likelihood_list_invalid_lists_valid_values_excluding_pii(): + with pytest.raises(MCPUserError, match="Valid values:") as exc_info: + parse_issue_likelihood_list(["Bogus"]) + msg = str(exc_info.value) + assert "Definite" in msg + assert "Likely" in msg + assert "Possible" in msg + assert "Potential PII" not in msg + + +def test_parse_issue_likelihood_list_collects_all_invalid(): + with pytest.raises(MCPUserError) as exc_info: + parse_issue_likelihood_list(["Definite", "Bogus", "Other"]) + msg = str(exc_info.value) + assert "Bogus" in msg + assert "Other" in msg + + +def test_parse_issue_likelihood_list_empty_returns_empty(): + assert parse_issue_likelihood_list([]) == [] + + +# --- parse_pii_risk_list --- + + +def test_parse_pii_risk_list_accepts_high_moderate(): + assert parse_pii_risk_list(["High", "Moderate"]) == [PiiRisk.HIGH, PiiRisk.MODERATE] + + +def test_parse_pii_risk_list_rejects_low(): + with pytest.raises(MCPUserError, match="Invalid pii_risk"): + parse_pii_risk_list(["Low"]) + + +def test_parse_pii_risk_list_collects_all_invalid(): + with pytest.raises(MCPUserError) as exc_info: + parse_pii_risk_list(["High", "Bogus", "Wrong"]) + msg = str(exc_info.value) + assert "Bogus" in msg + assert "Wrong" in msg + + +# --- resolve_issue_type --- + + +def test_resolve_issue_type_found_returns_id(): + fake = MagicMock() + fake.id = "1015" + with patch( + "testgen.mcp.tools.common.HygieneIssueType.select_where", return_value=[fake] + ) as select_where: + result = resolve_issue_type("Personally Identifiable Information") + assert result == "1015" + assert select_where.call_count == 1 + + +def test_resolve_issue_type_not_found_raises_with_resource_hint(): + with patch( + "testgen.mcp.tools.common.HygieneIssueType.select_where", return_value=[] + ): + with pytest.raises(MCPUserError, match="Unknown hygiene issue type") as exc_info: + resolve_issue_type("Made-Up Type") + assert "testgen://hygiene-issue-types" in str(exc_info.value) diff --git a/tests/unit/mcp/test_tools_hygiene_issues.py b/tests/unit/mcp/test_tools_hygiene_issues.py new file mode 100644 index 00000000..0a81ce3f --- /dev/null +++ b/tests/unit/mcp/test_tools_hygiene_issues.py @@ -0,0 +1,907 @@ +from datetime import datetime +from unittest.mock import MagicMock, patch +from uuid import uuid4 + +import pytest +from sqlalchemy.dialects import postgresql + +from testgen.common.models.hygiene_issue import Disposition, HygieneIssue, IssueLikelihood +from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.pii_masking import PII_REDACTED +from testgen.mcp.exceptions import MCPResourceNotAccessible, MCPUserError +from testgen.mcp.permissions import ProjectPermissions, _mcp_project_permissions + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _list_row(**overrides): + base = { + "id": uuid4(), + "project_code": "demo", + "issue_type_name": "Non-Standard Blank Values", + "schema_name": "demo", + "table_name": "orders", + "column_name": "frame_size", + "impact_dimension": "Usability", + "dq_dimension": "Completeness", + "disposition": Disposition.CONFIRMED, + "priority": "Definite", + "detail": "Dummy: 5", + "detail_redactable": False, + "pii_flag": None, + } + base.update(overrides) + return MagicMock(**base) + + +def _search_row(**overrides): + base = { + "id": uuid4(), + "project_code": "demo", + "issue_type_name": "Non-Standard Blank Values", + "table_groups_name": "default", + "job_execution_id": uuid4(), + "started_at": datetime(2026, 5, 1), + "schema_name": "demo", + "table_name": "orders", + "column_name": "frame_size", + "impact_dimension": "Usability", + "dq_dimension": "Completeness", + "disposition": Disposition.CONFIRMED, + "priority": "Definite", + "detail": "Dummy: 5", + "detail_redactable": False, + "pii_flag": None, + } + base.update(overrides) + return MagicMock(**base) + + +def _detail_row(**overrides): + base = { + "id": uuid4(), + "project_code": "demo", + "issue_type_name": "Non-Standard Blank Values", + "type_description": "Description body.", + "suggested_action": "Suggested action body.", + "schema_name": "demo", + "table_name": "orders", + "column_name": "frame_size", + "dq_dimension": "Completeness", + "impact_dimension": "Usability", + "disposition": Disposition.CONFIRMED, + "priority": "Definite", + "detail": "Dummy: 5", + "detail_redactable": False, + "pii_flag": None, + "job_execution_id": uuid4(), + "started_at": datetime(2026, 5, 1), + "column_general_type": "A", + "column_db_data_type": "varchar(50)", + "column_record_ct": 100, + "column_null_value_ct": 5, + "column_distinct_value_ct": 50, + } + base.update(overrides) + return MagicMock(**base) + + +def _mock_table_group(project_code="demo"): + tg = MagicMock() + tg.id = uuid4() + tg.project_code = project_code + return tg + + +def _mock_run(project_code="demo"): + run = MagicMock() + run.id = uuid4() + run.job_execution_id = uuid4() + run.table_groups_id = uuid4() + run.project_code = project_code + return run + + +def _compiled_clauses(call_args) -> str: + """Compile every positional arg of a model-method call to Postgres SQL.""" + pieces = [] + for arg in call_args.args: + try: + pieces.append(str(arg.compile(dialect=postgresql.dialect()))) + except (AttributeError, TypeError): + pieces.append(str(arg)) + return "\n".join(pieces) + + +def _set_full_perms(permission="view"): + """Manually set a permission context. Used for helpers that read the contextvar + directly without going through @mcp_permission (e.g. _resolve_profile_run_je_id).""" + perms = MagicMock(spec=ProjectPermissions) + perms.memberships = {"demo": "role_a"} + perms.permission = permission + perms.username = "test_user" + perms.allowed_codes = ["demo"] + perms.codes_allowed_to.return_value = ["demo"] + perms.has_access.side_effect = lambda code: code in ["demo"] + return _mcp_project_permissions.set(perms) + + +# --------------------------------------------------------------------------- +# _redact_detail +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "redactable,pii_flag,project_in_view_pii,should_redact", + [ + (True, "B/NAME/Individual", False, True), + (True, "B/NAME/Individual", True, False), + (True, None, False, False), + (False, "B/NAME/Individual", False, False), + (False, None, True, False), + (None, None, False, False), + ], +) +def test_redact_detail_matrix(redactable, pii_flag, project_in_view_pii, should_redact): + from testgen.mcp.tools.hygiene_issues import _redact_detail + + row = _list_row( + detail_redactable=redactable, + pii_flag=pii_flag, + detail="ssn=123-45-6789", + project_code="demo", + ) + view_pii_codes = {"demo"} if project_in_view_pii else set() + result = _redact_detail(row, view_pii_codes) + + assert (result == PII_REDACTED) is should_redact + if not should_redact: + assert result == "ssn=123-45-6789" + + +def test_redact_detail_view_pii_per_project(): + from testgen.mcp.tools.hygiene_issues import _redact_detail + + row = _list_row(detail_redactable=True, pii_flag="B/NAME", detail="raw", project_code="proj_b") + assert _redact_detail(row, {"proj_a"}) == PII_REDACTED + + +# --------------------------------------------------------------------------- +# _build_likelihood_clause +# --------------------------------------------------------------------------- + + +def _compile_clause(clause) -> str: + return str(clause.compile(dialect=postgresql.dialect())) + + +def test_build_likelihood_clause_neither_returns_none(): + from testgen.mcp.tools.hygiene_issues import _build_likelihood_clause + + assert _build_likelihood_clause(None, None) is None + + +def test_build_likelihood_clause_likelihood_only_excludes_pii(): + from testgen.mcp.tools.hygiene_issues import _build_likelihood_clause + + sql = _compile_clause(_build_likelihood_clause([IssueLikelihood.DEFINITE], None)) + assert "profile_anomaly_types.issue_likelihood IN" in sql + assert "Potential PII" not in sql + + +def test_build_likelihood_clause_pii_only_uses_priority_hybrid(): + from testgen.mcp.tools.hygiene_issues import _build_likelihood_clause + + sql = _compile_clause(_build_likelihood_clause(None, ["High"])) + assert "profile_anomaly_types.issue_likelihood =" in sql + # priority hybrid expands to a CASE expression in SQL: + assert "CASE" in sql + + +def test_build_likelihood_clause_both_returns_or_combination(): + from testgen.mcp.tools.hygiene_issues import _build_likelihood_clause + + sql = _compile_clause(_build_likelihood_clause([IssueLikelihood.DEFINITE], ["High"])) + assert " OR " in sql + assert "issue_likelihood IN" in sql + assert "issue_likelihood =" in sql + + +# --------------------------------------------------------------------------- +# _resolve_profile_run_je_id +# --------------------------------------------------------------------------- + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +def test_resolve_je_id_table_group_branch(mock_latest, mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import _resolve_profile_run_je_id + + tg = _mock_table_group() + mock_resolve_tg.return_value = tg + expected_je = uuid4() + mock_latest.return_value = expected_je + + token = _set_full_perms() + try: + result = _resolve_profile_run_je_id(job_execution_id=None, table_group_id=str(uuid4())) + finally: + _mcp_project_permissions.reset(token) + + assert result == expected_je + mock_latest.assert_called_once_with(tg.id) + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +def test_resolve_je_id_table_group_no_completed_runs(mock_latest, mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import _resolve_profile_run_je_id + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = None + + token = _set_full_perms() + try: + with pytest.raises(MCPUserError, match="No completed profiling runs"): + _resolve_profile_run_je_id(job_execution_id=None, table_group_id=str(uuid4())) + finally: + _mcp_project_permissions.reset(token) + + +@patch("testgen.mcp.tools.hygiene_issues.TableGroup") +@patch.object(ProfilingRun, "get_by_id_or_job") +def test_resolve_je_id_je_branch_unknown_run(mock_get, mock_tg_cls, db_session_mock): + from testgen.mcp.tools.hygiene_issues import _resolve_profile_run_je_id + + mock_get.return_value = None + + token = _set_full_perms() + try: + with pytest.raises(MCPResourceNotAccessible): + _resolve_profile_run_je_id(job_execution_id=str(uuid4()), table_group_id=None) + finally: + _mcp_project_permissions.reset(token) + + +@patch("testgen.mcp.tools.hygiene_issues.TableGroup") +@patch.object(ProfilingRun, "get_by_id_or_job") +def test_resolve_je_id_je_branch_inaccessible_tg(mock_get, mock_tg_cls, db_session_mock): + from testgen.mcp.tools.hygiene_issues import _resolve_profile_run_je_id + + mock_get.return_value = _mock_run() + mock_tg_cls.get.return_value = _mock_table_group(project_code="forbidden") + + token = _set_full_perms() # only has access to "demo", not "forbidden" + try: + with pytest.raises(MCPResourceNotAccessible): + _resolve_profile_run_je_id(job_execution_id=str(uuid4()), table_group_id=None) + finally: + _mcp_project_permissions.reset(token) + + +# --------------------------------------------------------------------------- +# list_hygiene_issues +# --------------------------------------------------------------------------- + + +def test_list_hygiene_issues_both_args_rejected(db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + with pytest.raises(MCPUserError, match="Pass either"): + list_hygiene_issues(job_execution_id=str(uuid4()), table_group_id=str(uuid4())) + + +def test_list_hygiene_issues_neither_arg_rejected(db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + with pytest.raises(MCPUserError, match="Provide either"): + list_hygiene_issues() + + +def test_list_hygiene_issues_invalid_je_uuid(db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + with pytest.raises(MCPUserError, match="not a valid UUID"): + list_hygiene_issues(job_execution_id="not-a-uuid") + + +@patch("testgen.mcp.tools.hygiene_issues.TableGroup") +@patch.object(ProfilingRun, "get_by_id_or_job") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_resolves_via_je_id(mock_list, mock_get, mock_tg_cls, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + run = _mock_run() + mock_get.return_value = run + mock_tg_cls.get.return_value = _mock_table_group() + mock_list.return_value = ([], 0) + + list_hygiene_issues(job_execution_id=str(uuid4())) + + assert mock_list.call_args.args[0] == run.job_execution_id + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_resolves_via_table_group( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + expected_je = uuid4() + mock_latest.return_value = expected_je + mock_list.return_value = ([], 0) + + list_hygiene_issues(table_group_id=str(uuid4())) + + assert mock_list.call_args.args[0] == expected_je + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_happy_path_renders_fields( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + issue_id = uuid4() + mock_list.return_value = ([_list_row(id=issue_id, detail="Dummy Values: 5")], 1) + + result = list_hygiene_issues(table_group_id=str(uuid4())) + + assert "[Definite]" in result + assert "Non-Standard Blank Values" in result + assert "`frame_size` in `orders`" in result + assert str(issue_id) in result + assert "Impact Dimension" in result + assert "Quality Dimension" in result + assert "Disposition" in result + assert "Confirmed" in result + assert "Dummy Values: 5" in result + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_table_level_heading( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + mock_list.return_value = ([_list_row(column_name=None)], 1) + + result = list_hygiene_issues(table_group_id=str(uuid4())) + assert "on `orders`" in result + assert "` in `" not in result + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_priority_unknown_fallback( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + mock_list.return_value = ([_list_row(priority=None)], 1) + + assert "[Unknown]" in list_hygiene_issues(table_group_id=str(uuid4())) + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_empty( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + mock_list.return_value = ([], 0) + + result = list_hygiene_issues(table_group_id=str(uuid4())) + assert "_No hygiene issues match the supplied filters._" in result + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_pagination_footer( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + mock_list.return_value = ([_list_row(), _list_row()], 100) + + result = list_hygiene_issues(table_group_id=str(uuid4()), limit=2, page=1) + assert "Showing 1–2 of 100" in result # noqa: RUF001 — page-info formatter uses EN DASH + assert "page=2" in result + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_disposition_default_coalesces_to_confirmed( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + mock_list.return_value = ([], 0) + + list_hygiene_issues(table_group_id=str(uuid4())) + + sql = _compiled_clauses(mock_list.call_args) + assert "coalesce(profile_anomaly_results.disposition" in sql + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +def test_list_hygiene_issues_invalid_disposition(mock_latest, mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + + with pytest.raises(MCPUserError, match="Invalid disposition"): + list_hygiene_issues(table_group_id=str(uuid4()), disposition="bogus") + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +def test_list_hygiene_issues_invalid_quality_dimension(mock_latest, mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + + with pytest.raises(MCPUserError, match="Invalid quality_dimension"): + list_hygiene_issues(table_group_id=str(uuid4()), quality_dimension="X") + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +def test_list_hygiene_issues_invalid_impact_dimension(mock_latest, mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + + with pytest.raises(MCPUserError, match="Invalid impact_dimension"): + list_hygiene_issues(table_group_id=str(uuid4()), impact_dimension="X") + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +def test_list_hygiene_issues_invalid_likelihood(mock_latest, mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + + with pytest.raises(MCPUserError, match="Invalid issue_likelihood"): + list_hygiene_issues(table_group_id=str(uuid4()), issue_likelihood=["Bogus"]) + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +def test_list_hygiene_issues_invalid_pii_risk(mock_latest, mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + + with pytest.raises(MCPUserError, match="Invalid pii_risk"): + list_hygiene_issues(table_group_id=str(uuid4()), pii_risk=["Low"]) + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_passes_project_codes_clause( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + mock_list.return_value = ([], 0) + + list_hygiene_issues(table_group_id=str(uuid4())) + + sql = _compiled_clauses(mock_list.call_args) + assert "profile_anomaly_results.project_code IN" in sql + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +def test_list_hygiene_issues_invalid_page(mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + + with pytest.raises(MCPUserError, match="Invalid page"): + list_hygiene_issues(table_group_id=str(uuid4()), page=0) + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +def test_list_hygiene_issues_limit_above_max(mock_resolve_tg, db_session_mock): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + + with pytest.raises(MCPUserError, match="between 1 and 200"): + list_hygiene_issues(table_group_id=str(uuid4()), limit=201) + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_pii_redacted_when_no_view_pii( + mock_list, mock_latest, mock_resolve_tg, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + row = _list_row(detail_redactable=True, pii_flag="B/NAME/Individual", detail="ssn=123") + mock_list.return_value = ([row], 1) + + # Default mcp_user has role_a; TEST_PERM_MATRIX has no "view_pii" entry → redaction applies. + result = list_hygiene_issues(table_group_id=str(uuid4())) + assert PII_REDACTED in result + assert "ssn=123" not in result + + +@patch("testgen.mcp.permissions._compute_project_permissions") +@patch("testgen.mcp.tools.hygiene_issues.resolve_table_group") +@patch.object(ProfilingRun, "get_latest_complete_je_id_for_table_group") +@patch.object(HygieneIssue, "list_for_run") +def test_list_hygiene_issues_pii_visible_when_view_pii_granted( + mock_list, mock_latest, mock_resolve_tg, mock_compute, db_session_mock, +): + from testgen.mcp.tools.hygiene_issues import list_hygiene_issues + + perms = MagicMock(spec=ProjectPermissions) + perms.allowed_codes = ["demo"] + perms.codes_allowed_to.return_value = ["demo"] + perms.has_access.side_effect = lambda code: code == "demo" + mock_compute.return_value = perms + + mock_resolve_tg.return_value = _mock_table_group() + mock_latest.return_value = uuid4() + row = _list_row(detail_redactable=True, pii_flag="B/NAME/Individual", detail="ssn=123") + mock_list.return_value = ([row], 1) + + result = list_hygiene_issues(table_group_id=str(uuid4())) + assert "ssn=123" in result + assert PII_REDACTED not in result + + +# --------------------------------------------------------------------------- +# get_hygiene_issue +# --------------------------------------------------------------------------- + + +def test_get_hygiene_issue_invalid_uuid(db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + with pytest.raises(MCPUserError, match="not a valid UUID"): + get_hygiene_issue(issue_id="bogus") + + +@patch.object(HygieneIssue, "get_with_context") +def test_get_hygiene_issue_not_found_collapses_to_not_accessible(mock_get, db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + mock_get.return_value = None + + with pytest.raises(MCPResourceNotAccessible): + get_hygiene_issue(issue_id=str(uuid4())) + + +@patch.object(HygieneIssue, "get_with_context") +def test_get_hygiene_issue_passes_project_codes_clause(mock_get, db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + mock_get.return_value = _detail_row() + + get_hygiene_issue(issue_id=str(uuid4())) + + sql = _compiled_clauses(mock_get.call_args) + assert "profile_anomaly_results.project_code IN" in sql + + +@patch.object(HygieneIssue, "get_with_context") +def test_get_hygiene_issue_renders_full_detail(mock_get, db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + mock_get.return_value = _detail_row() + + result = get_hygiene_issue(issue_id=str(uuid4())) + + assert "[Definite]" in result + assert "Issue ID" in result + assert "Schema" in result + assert "Table" in result + assert "Column" in result + assert "Impact Dimension" in result + assert "Quality Dimension" in result + assert "Disposition" in result + assert "## Suggested Action" in result + assert "Suggested action body." in result + assert "## Issue Type Description" in result + assert "Description body." in result + assert "## Column Profile" in result + assert "## Profiling Run" in result + + +@patch.object(HygieneIssue, "get_with_context") +def test_get_hygiene_issue_table_level_omits_column(mock_get, db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + mock_get.return_value = _detail_row(column_name=None) + result = get_hygiene_issue(issue_id=str(uuid4())) + + assert "on `orders`" in result + assert "` in `" not in result + + +@patch.object(HygieneIssue, "get_with_context") +def test_get_hygiene_issue_omits_column_profile_when_no_data(mock_get, db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + mock_get.return_value = _detail_row( + column_general_type=None, column_db_data_type=None, + column_record_ct=None, column_null_value_ct=None, column_distinct_value_ct=None, + ) + result = get_hygiene_issue(issue_id=str(uuid4())) + assert "## Column Profile" not in result + + +@patch.object(HygieneIssue, "get_with_context") +def test_get_hygiene_issue_renders_null_rate(mock_get, db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + mock_get.return_value = _detail_row(column_record_ct=1000, column_null_value_ct=50) + result = get_hygiene_issue(issue_id=str(uuid4())) + assert "Null Rate" in result + assert "5.00%" in result + + +@patch.object(HygieneIssue, "get_with_context") +def test_get_hygiene_issue_skips_null_rate_when_record_ct_zero(mock_get, db_session_mock): + from testgen.mcp.tools.hygiene_issues import get_hygiene_issue + + mock_get.return_value = _detail_row(column_record_ct=0, column_null_value_ct=0) + result = get_hygiene_issue(issue_id=str(uuid4())) + assert "Null Rate" not in result + + +# --------------------------------------------------------------------------- +# update_hygiene_issue +# --------------------------------------------------------------------------- + + +@pytest.fixture +def disposition_perms(): + """Grant 'disposition' permission on demo (the conftest's matrix omits it).""" + perms = MagicMock(spec=ProjectPermissions) + perms.memberships = {"demo": "role_a"} + perms.permission = "disposition" + perms.username = "test_user" + perms.allowed_codes = ["demo"] + perms.codes_allowed_to.return_value = ["demo"] + perms.has_access.side_effect = lambda code: code in ["demo"] + + with patch("testgen.mcp.permissions._compute_project_permissions", return_value=perms): + yield perms + + +def test_update_hygiene_issue_invalid_uuid(db_session_mock, disposition_perms): + from testgen.mcp.tools.hygiene_issues import update_hygiene_issue + + with pytest.raises(MCPUserError, match="not a valid UUID"): + update_hygiene_issue(issue_id="bogus", disposition="Confirmed") + + +def test_update_hygiene_issue_invalid_disposition(db_session_mock, disposition_perms): + from testgen.mcp.tools.hygiene_issues import update_hygiene_issue + + with pytest.raises(MCPUserError, match="Invalid disposition"): + update_hygiene_issue(issue_id=str(uuid4()), disposition="Bogus") + + +@patch.object(HygieneIssue, "update_disposition") +def test_update_hygiene_issue_muted_maps_to_inactive(mock_update, db_session_mock, disposition_perms): + from testgen.mcp.tools.hygiene_issues import update_hygiene_issue + + mock_update.return_value = True + update_hygiene_issue(issue_id=str(uuid4()), disposition="Muted") + + args = mock_update.call_args.args + assert args[1] == Disposition.INACTIVE + + +@patch.object(HygieneIssue, "update_disposition") +def test_update_hygiene_issue_returns_success_markdown(mock_update, db_session_mock, disposition_perms): + from testgen.mcp.tools.hygiene_issues import update_hygiene_issue + + mock_update.return_value = True + issue_id = str(uuid4()) + result = update_hygiene_issue(issue_id=issue_id, disposition="Dismissed") + + assert "Updated hygiene issue" in result + assert issue_id in result + assert "Dismissed" in result + + +@patch.object(HygieneIssue, "update_disposition") +def test_update_hygiene_issue_not_updated_collapses_to_not_accessible( + mock_update, db_session_mock, disposition_perms, +): + from testgen.mcp.tools.hygiene_issues import update_hygiene_issue + + mock_update.return_value = False + with pytest.raises(MCPResourceNotAccessible): + update_hygiene_issue(issue_id=str(uuid4()), disposition="Confirmed") + + +@patch.object(HygieneIssue, "update_disposition") +def test_update_hygiene_issue_passes_project_scope_clause( + mock_update, db_session_mock, disposition_perms, +): + from testgen.mcp.tools.hygiene_issues import update_hygiene_issue + + mock_update.return_value = True + update_hygiene_issue(issue_id=str(uuid4()), disposition="Confirmed") + + # Trailing args after (issue_uuid, db_disposition) are the *clauses + clauses = mock_update.call_args.args[2:] + sql = "\n".join(str(c.compile(dialect=postgresql.dialect())) for c in clauses) + assert "profile_anomaly_results.project_code IN" in sql + + +def test_update_hygiene_issue_uses_disposition_permission(): + """Pin the permission name. Silent downgrade to 'view' would be a write-side leak.""" + import testgen.mcp.tools.hygiene_issues as mod + + closure = {c.cell_contents for c in mod.update_hygiene_issue.__wrapped__.__closure__} + assert "disposition" in closure + + +# --------------------------------------------------------------------------- +# search_hygiene_issues +# --------------------------------------------------------------------------- + + +@patch.object(HygieneIssue, "search") +def test_search_no_args_uses_allowed_codes(mock_search, db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + mock_search.return_value = ([], 0) + search_hygiene_issues() + + sql = _compiled_clauses(mock_search.call_args) + assert "profile_anomaly_results.project_code IN" in sql + + +@patch("testgen.mcp.permissions._compute_project_permissions") +def test_search_inaccessible_project_rejected(mock_compute, db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + perms = MagicMock(spec=ProjectPermissions) + perms.memberships = {"demo": "role_a"} + perms.permission = "view" + perms.username = "test_user" + perms.allowed_codes = ["demo"] + perms.has_access.side_effect = lambda c: c == "demo" + perms.verify_access.side_effect = lambda _code, not_found: (_ for _ in ()).throw(not_found) + mock_compute.return_value = perms + + with pytest.raises(MCPResourceNotAccessible): + search_hygiene_issues(project_code="forbidden") + + +def test_search_invalid_tg_uuid(db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + with pytest.raises(MCPUserError, match="not a valid UUID"): + search_hygiene_issues(table_group_id="bogus") + + +@patch.object(HygieneIssue, "search") +def test_search_table_group_filter_in_clause(mock_search, db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + mock_search.return_value = ([], 0) + search_hygiene_issues(table_group_id=str(uuid4())) + + sql = _compiled_clauses(mock_search.call_args) + assert "profile_anomaly_results.table_groups_id =" in sql + + +@patch.object(HygieneIssue, "search") +def test_search_since_translates_to_started_at_clause(mock_search, db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + mock_search.return_value = ([], 0) + search_hygiene_issues(since="7 days") + + sql = _compiled_clauses(mock_search.call_args) + assert "job_executions.started_at >=" in sql + assert "profiling_runs.profiling_starttime" not in sql + + +def test_search_invalid_since_rejected(db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + with pytest.raises(MCPUserError): + search_hygiene_issues(since="not a date") + + +@patch.object(HygieneIssue, "search") +def test_search_happy_path_renders_search_row_fields(mock_search, db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + je_id = uuid4() + mock_search.return_value = ([_search_row(table_groups_name="dim-tables", job_execution_id=je_id)], 1) + + result = search_hygiene_issues() + + assert "Hygiene Issue Search" in result + assert "Table Group" in result + assert "dim-tables" in result + assert "Profiling Run" in result + assert str(je_id) in result + assert "Run Date" in result + + +@patch.object(HygieneIssue, "search") +def test_search_empty_returns_no_match_message(mock_search, db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + mock_search.return_value = ([], 0) + result = search_hygiene_issues() + assert "_No hygiene issues match the supplied filters._" in result + + +def test_search_invalid_page(db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + with pytest.raises(MCPUserError, match="Invalid page"): + search_hygiene_issues(page=0) + + +def test_search_invalid_limit(db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + with pytest.raises(MCPUserError, match="between 1 and 200"): + search_hygiene_issues(limit=201) + + +@patch("testgen.mcp.tools.hygiene_issues.resolve_issue_type") +@patch.object(HygieneIssue, "search") +def test_search_issue_type_filter_resolved(mock_search, mock_resolve, db_session_mock): + from testgen.mcp.tools.hygiene_issues import search_hygiene_issues + + mock_resolve.return_value = "1015" + mock_search.return_value = ([], 0) + search_hygiene_issues(issue_type="Personally Identifiable Information") + + sql = _compiled_clauses(mock_search.call_args) + assert "profile_anomaly_results.anomaly_id =" in sql diff --git a/tests/unit/mcp/test_tools_reference.py b/tests/unit/mcp/test_tools_reference.py index 519e6e61..55f6b508 100644 --- a/tests/unit/mcp/test_tools_reference.py +++ b/tests/unit/mcp/test_tools_reference.py @@ -91,7 +91,88 @@ def test_glossary_resource(): assert "Entity Hierarchy" in result assert "Result Statuses" in result - assert "Data Quality Dimensions" in result + assert "Quality Dimensions" in result assert "Test Scopes" in result assert "Disposition" in result assert "Monitor Types" not in result + + # Hygiene-issue additions (TG-1029): + assert "Profiling Run" in result + assert "Hygiene Issue" in result + assert "## Hygiene Issue Likelihood" in result + assert "Definite" in result + assert "Likely" in result + assert "Possible" in result + assert "PII Risk" in result + # All three disposition values defined under one section: + assert "Confirmed" in result + assert "Dismissed" in result + assert "Muted" in result + # Recency was added during the migration: + assert "Recency" in result + # Impact Dimensions section + all four values: + assert "## Impact Dimensions" in result + assert "Reliability" in result + assert "Conformance" in result + assert "Regularity" in result + assert "Usability" in result + + +@patch("testgen.mcp.tools.reference.HygieneIssueType") +def test_hygiene_issue_types_resource_basic(mock_type_cls, db_session_mock): + t1 = MagicMock() + t1.name = "Personally Identifiable Information" + t1.impact_dimension = "Conformance" + t1.dq_dimension = "Validity" + t1.likelihood = "Potential PII" + t1.description = "PII description." + t1.suggested_action = "Handle PII carefully." + t2 = MagicMock() + t2.name = "Non-Standard Blank Values" + t2.impact_dimension = "Usability" + t2.dq_dimension = "Completeness" + t2.likelihood = "Definite" + t2.description = "Blanks description." + t2.suggested_action = "Cleanse blanks." + mock_type_cls.select_where.return_value = [t1, t2] + + from testgen.mcp.tools.reference import hygiene_issue_types_resource + + result = hygiene_issue_types_resource() + + # Header order: Issue Type | Impact Dimension | Quality Dimension | Likelihood | Description | Suggested Action + header_line = next(line for line in result.split("\n") if line.startswith("| Issue Type")) + assert header_line == "| Issue Type | Impact Dimension | Quality Dimension | Likelihood | Description | Suggested Action |" + # All values surface: + assert "Personally Identifiable Information" in result + assert "Non-Standard Blank Values" in result + assert "Potential PII" in result + assert "Definite" in result + assert "Handle PII carefully." in result + + +@patch("testgen.mcp.tools.reference.HygieneIssueType") +def test_hygiene_issue_types_resource_orders_by_name(mock_type_cls, db_session_mock): + from testgen.common.models.hygiene_issue import HygieneIssueType + from testgen.mcp.tools.reference import hygiene_issue_types_resource + + mock_type_cls.select_where.return_value = [] + mock_type_cls.name = HygieneIssueType.name + + hygiene_issue_types_resource() + + # ``select_where`` was called with order_by tuple containing the name column. + mock_type_cls.select_where.assert_called_once() + kwargs = mock_type_cls.select_where.call_args.kwargs + assert "order_by" in kwargs + assert kwargs["order_by"][0] is HygieneIssueType.name + + +@patch("testgen.mcp.tools.reference.HygieneIssueType") +def test_hygiene_issue_types_resource_empty(mock_type_cls, db_session_mock): + mock_type_cls.select_where.return_value = [] + + from testgen.mcp.tools.reference import hygiene_issue_types_resource + + result = hygiene_issue_types_resource() + assert "No hygiene issue types found" in result From fde7321abc5ac5c7d9488859f3bbc85aff5df430 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 6 May 2026 01:41:07 -0400 Subject: [PATCH 08/11] fix(standalone): graceful shutdown of embedded postgres on signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `testgen run-app` was leaving orphan postgres processes after Ctrl+C in standalone (pip) mode, breaking `tg delete` (data dir locked). pixeltable-pgserver tracks handles via a disk-backed PID registry at `/.handle_pids.json` and only stops postgres when the calling PID is the last one registered. Two paths were leaking PIDs into the registry: 1. Each `run-app all` child re-entered `cli()` and called `start_server()` itself, registering its own PID alongside the parent's. 2. On Windows, `_forward_signal_to_child` used `terminate()` (TerminateProcess), so children's atexit never ran — their PIDs stayed in the registry forever, and the parent's cleanup silently no-op'd. Fix: - `cli()` now calls `ensure_standalone_setup(uri)` instead of `start_server()` when `_TG_STANDALONE_URI` is inherited, so children attach to the parent's pgserver without registering a PID. - `run_app("all")` injects the parent's pgserver URI into `child_env`. - `run_ui()` Streamlit env-var assembly falls back to the inherited env var when this process doesn't own pgserver. - New `_subprocess_spawn_kwargs()` helper: POSIX `start_new_session`, Windows `CREATE_NEW_PROCESS_GROUP`. Applied at both `Popen` sites. - Windows branch of `_forward_signal_to_child` now sends `CTRL_BREAK_EVENT` instead of `terminate()`, so children's atexit and SIGBREAK handlers run. - New `_install_shutdown_handler()` registers SIGINT/SIGTERM/(SIGBREAK) in one call. Used in `run_ui`, `run_app("all")`, and `Scheduler.run`. - `run_app("all")` children-watcher loop now iterates a snapshot of `children` so simultaneous child exits don't leave a sibling un-reaped. Co-Authored-By: Claude Opus 4.7 (1M context) --- testgen/__main__.py | 78 ++++++++++++++++++++++++------ testgen/scheduler/cli_scheduler.py | 4 ++ 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/testgen/__main__.py b/testgen/__main__.py index 1841636a..3b8d7f12 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -97,6 +97,7 @@ def _capped(level): from testgen.common.notifications.base import smtp_configured from testgen.common.standalone_postgres import ( STANDALONE_URI_ENV_VAR, + ensure_standalone_setup, get_server_uri, is_standalone_mode, ) @@ -116,12 +117,41 @@ def _capped(level): CHILDREN_POLL_INTERVAL = 10 +def _subprocess_spawn_kwargs() -> dict: + """Spawn flags that let the parent send a *catchable* shutdown signal to a child. + + On POSIX, ``start_new_session`` puts the child in its own session so the + console SIGINT only hits the parent. On Windows, ``CREATE_NEW_PROCESS_GROUP`` + is required so the parent can deliver ``CTRL_BREAK_EVENT`` to the child — + without it, the only options are ``terminate()`` (uncatchable) or relying on + the console-wide Ctrl+C broadcast (racy). + """ + if sys.platform == "win32": + return {"creationflags": subprocess.CREATE_NEW_PROCESS_GROUP} + return {"start_new_session": True} + + +def _install_shutdown_handler(handler) -> None: + """Register *handler* for SIGINT, SIGTERM, and (on Windows) SIGBREAK. + + Windows children spawned with ``CREATE_NEW_PROCESS_GROUP`` receive + ``CTRL_BREAK_EVENT`` (delivered as SIGBREAK) instead of console Ctrl+C — + treat it the same as SIGTERM so atexit hooks run and pgserver's PID list + stays clean. + """ + signal.signal(signal.SIGINT, handler) + signal.signal(signal.SIGTERM, handler) + if sigbreak := getattr(signal, "SIGBREAK", None): + signal.signal(sigbreak, handler) + + def _forward_signal_to_child(child: subprocess.Popen, signum: int) -> None: - # On POSIX, forward the signal verbatim. On Windows, subprocess.send_signal - # rejects everything except SIGTERM / CTRL_C_EVENT / CTRL_BREAK_EVENT, so - # fall back to terminate() — equivalent to TerminateProcess(). + # POSIX: forward the signal verbatim. Windows: send CTRL_BREAK_EVENT so the + # child's atexit hooks run (it deregisters from pgserver's PID list, lets + # streamlit/uvicorn shut down their event loops, etc.). The child must have + # been spawned with CREATE_NEW_PROCESS_GROUP — see _subprocess_spawn_kwargs. if sys.platform == "win32": - child.terminate() + child.send_signal(signal.CTRL_BREAK_EVENT) else: child.send_signal(signum) @@ -169,7 +199,14 @@ def format_epilog(self, _ctx: Context, formatter: click.HelpFormatter) -> None: @click.pass_context def cli(ctx: Context, verbose: bool): if is_standalone_mode(): - start_standalone_postgres() + # Children spawned by `run-app all` (and the Streamlit subprocess) inherit the + # parent's pgserver URI. They must NOT call get_server() themselves — each call + # registers the calling PID in pgserver's on-disk handle list, and any PID left + # over after a hard kill prevents the parent from actually stopping postgres. + if inherited_uri := os.environ.get(STANDALONE_URI_ENV_VAR): + ensure_standalone_setup(inherited_uri) + else: + start_standalone_postgres() if verbose: configure_logging(level=logging.DEBUG) @@ -924,14 +961,16 @@ def init_ui(): app_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui/app.py") # In standalone mode, pass the pgserver URI to the Streamlit subprocess - # so it can connect without acquiring the pgserver file lock. + # so it can connect without acquiring the pgserver file lock. When this + # process is itself a child of `run-app all`, the URI is already in the + # inherited environment — fall back to that so Streamlit still gets it. child_env = {**os.environ, "TG_JOB_SOURCE": "UI"} if is_standalone_mode(): - server_uri = get_server_uri() + server_uri = get_server_uri() or os.environ.get(STANDALONE_URI_ENV_VAR) if server_uri: - child_env = {**os.environ, "TG_JOB_SOURCE": "UI", STANDALONE_URI_ENV_VAR: server_uri} + child_env[STANDALONE_URI_ENV_VAR] = server_uri - process= subprocess.Popen( + process = subprocess.Popen( [ sys.executable, "-m", @@ -950,12 +989,12 @@ def init_ui(): f"{'--debug' if settings.IS_DEBUG else ''}", ], env=child_env, + **_subprocess_spawn_kwargs(), ) def term_ui(signum, _): LOG.info(f"Sending termination signal {signum} to Testgen UI") _forward_signal_to_child(process, signum) - signal.signal(signal.SIGINT, term_ui) - signal.signal(signal.SIGTERM, term_ui) + _install_shutdown_handler(term_ui) status_code = process.wait() LOG.log(logging.ERROR if status_code != 0 else logging.INFO, f"Testgen UI exited with status code {status_code}") @@ -980,8 +1019,15 @@ def run_app(module): run_server() case "all": + child_env = os.environ.copy() + if is_standalone_mode() and (server_uri := get_server_uri()): + child_env[STANDALONE_URI_ENV_VAR] = server_uri children = [ - subprocess.Popen([sys.executable, "-m", "testgen", "run-app", m], start_new_session=True) + subprocess.Popen( + [sys.executable, "-m", "testgen", "run-app", m], + env=child_env, + **_subprocess_spawn_kwargs(), + ) for m in APP_MODULES ] @@ -989,8 +1035,7 @@ def term_children(signum, _): for child in children: _forward_signal_to_child(child, signum) - signal.signal(signal.SIGINT, term_children) - signal.signal(signal.SIGTERM, term_children) + _install_shutdown_handler(term_children) terminating = False while children: @@ -999,7 +1044,10 @@ def term_children(signum, _): except subprocess.TimeoutExpired: pass - for child in children: + # Iterate a snapshot — `children.remove(child)` inside a live iteration + # would skip the next element and could leave a sibling un-reaped during + # the simultaneous-exit case we're explicitly hardening for here. + for child in list(children): if child.poll() is not None: children.remove(child) if not terminating: diff --git a/testgen/scheduler/cli_scheduler.py b/testgen/scheduler/cli_scheduler.py index bd2719ec..ddc3c9ec 100644 --- a/testgen/scheduler/cli_scheduler.py +++ b/testgen/scheduler/cli_scheduler.py @@ -195,6 +195,10 @@ def sig_handler(signum, _): signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) + # Windows: parent forwards CTRL_BREAK_EVENT (delivered as SIGBREAK) when the + # child is spawned in its own process group, so handle it the same way. + if sigbreak := getattr(signal, "SIGBREAK", None): + signal.signal(sigbreak, sig_handler) try: self.start(datetime.now(UTC)) From 043da3c8c658915b206a2726a484d1a55f95a407 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 6 May 2026 02:26:54 -0400 Subject: [PATCH 09/11] docs: simplify README install section to point at docs site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dk-installer now offers Docker and pip install modes, both fully documented at docs.datakitchen.io. Replace the long install instructions in the README — which had drifted out of sync with the installer and the docs — with a brief pointer to the install pages. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 195 ++---------------------------------------------------- 1 file changed, 4 insertions(+), 191 deletions(-) diff --git a/README.md b/README.md index 5383516d..b3be60ad 100644 --- a/README.md +++ b/README.md @@ -29,199 +29,12 @@ A single place to manage Data Quality across data sets, locations, and te DataKitchen Open Source Data Quality TestGen Features - Single Place

-## Installation with dk-installer (recommended) +## Installation -The [dk-installer](https://github.com/DataKitchen/data-observability-installer/?tab=readme-ov-file#install-the-testgen-application) program installs DataOps Data Quality TestGen as a [Docker Compose](https://docs.docker.com/compose/) application. This is the recommended mode of installation as Docker encapsulates and isolates the application from other software on your machine and does not require you to manage Python dependencies. +The [dk-installer](https://github.com/DataKitchen/data-observability-installer/?tab=readme-ov-file#install-the-testgen-application) program installs TestGen in either Docker or pip mode. For complete instructions, see the documentation: -### Install the prerequisite software - -| Software | Tested Versions | Command to check version | -|-------------------------|-------------------------|-------------------------------| -| [Python](https://www.python.org/downloads/)
- Most Linux and macOS systems have Python pre-installed.
- On Windows machines, you will need to download and install it.
Why Python? To run the installer. | 3.9, 3.10, 3.11, 3.12 | `python3 --version` | -| [Docker](https://docs.docker.com/get-docker/)
[Docker Compose](https://docs.docker.com/compose/install/)
Why Docker? Docker lets you try TestGen without affecting your local software environment. All the dependencies TestGen needs are isolated in its own container, so installation is easy and insulated. | 26.1, 27.5, 28.0
2.32, 2.33, 2.34 | `docker -v`
`docker compose version` | - -### Download the installer - -On Unix-based operating systems, use the following command to download it to the current directory. We recommend creating a new, empty directory. - -```shell -curl -o dk-installer.py 'https://raw.githubusercontent.com/DataKitchen/data-observability-installer/main/dk-installer.py' -``` - -* Alternatively, you can manually download the [`dk-installer.py`](https://github.com/DataKitchen/data-observability-installer/blob/main/dk-installer.py) file from the [data-observability-installer](https://github.com/DataKitchen/data-observability-installer) repository. -* All commands listed below should be run from the folder containing this file. -* For usage help and command options, run `python3 dk-installer.py --help` or `python3 dk-installer.py --help`. - -### Install the TestGen application - -The installation downloads the latest Docker images for TestGen and deploys a new Docker Compose application. The process may take 5~10 minutes depending on your machine and network connection. - -```shell -python3 dk-installer.py tg install -``` - -The `--port` option may be used to set a custom localhost port for the application (default: 8501). - -To enable SSL for HTTPS support, use the `--ssl-cert-file` and `--ssl-key-file` options to specify local file paths to your SSL certificate and key files. - -Once the installation completes, verify that you can login to the UI with the URL and credentials provided in the output. - -### Optional: Run the TestGen demo setup - -The [Data Observability quickstart](https://docs.datakitchen.io/tutorials/quickstart-demo/) walks you through DataOps Data Quality TestGen capabilities to demonstrate how it covers critical use cases for data and analytic teams. - -```shell -python3 dk-installer.py tg run-demo -``` - -In the TestGen UI, you will see that new data profiling and test results have been generated. - -## Installation with pip - -As an alternative to the Docker Compose [installation with dk-installer (recommended)](#installation-with-dk-installer-recommended), DataOps Data Quality TestGen can also be installed as a Python package via [pip](https://pip.pypa.io/en/stable/). This mode of installation uses the [dataops-testgen](https://pypi.org/project/dataops-testgen/) package published to PyPI, and it requires a PostgreSQL instance to be provisioned for the application database. - -### Install the prerequisite software - -| Software | Tested Versions | Command to check version | -|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------|------------------------------| -| [Python](https://www.python.org/downloads/)
- Most Linux and macOS systems have Python pre-installed.
- On Windows machines, you will need to download and install it. | 3.11, 3.12, 3.13 | `python3 --version` | -| [PostgreSQL](https://www.postgresql.org/download/) | 14.1, 15.8, 16.4 | `psql --version`| - -### Install the TestGen package - -We recommend using a Python virtual environment to avoid any dependency conflicts with other applications installed on your machine. The [venv](https://docs.python.org/3/library/venv.html#creating-virtual-environments) module, which is part of the Python standard library, or other third-party tools, like [virtualenv](https://virtualenv.pypa.io/en/latest/) or [conda](https://docs.conda.io/en/latest/), can be used. - -Create and activate a virtual environment with a TestGen-compatible version of Python (`>=3.11`). The steps may vary based on your operating system and Python installation - the [Python packaging user guide](https://packaging.python.org/en/latest/tutorials/installing-packages/) is a useful reference. - -_On Linux/Mac_ -```shell -python3 -m venv venv -source venv/bin/activate -``` - -_On Windows_ -```powershell -py -3.13 -m venv venv -venv\Scripts\activate -``` - -Within the virtual environment, install the TestGen package with pip. -```shell -pip install dataops-testgen -``` - -Verify that the [_testgen_ command line](https://docs.datakitchen.io/testgen/cli-reference/) works. -```shell -testgen --help -``` - -### Set up the application database in PostgresSQL - -Create a `local.env` file with the following environment variables, replacing the `` placeholders with appropriate values. Refer to the [TestGen Configuration](docs/configuration.md) document for more details, defaults, and other supported configuration. -```shell -# Connection parameters for the PostgreSQL server -export TG_METADATA_DB_HOST= -export TG_METADATA_DB_PORT= - -# Connection credentials for the PostgreSQL server -# This role must have privileges to create roles, users, database and schema so that the application database can be initialized -export TG_METADATA_DB_USER= -export TG_METADATA_DB_PASSWORD= - -# Set a password and arbitrary string (the "salt") to be used for encrypting secrets in the application database -export TG_DECRYPT_PASSWORD= -export TG_DECRYPT_SALT= - -# Set credentials for the default admin user to be created for TestGen -export TESTGEN_USERNAME= -export TESTGEN_PASSWORD= - -# Set an arbitrary base64-encoded string to be used for signing authentication tokens -export TG_JWT_HASHING_KEY= - -# Set an accessible path for storing application logs -export TESTGEN_LOG_FILE_PATH= -``` - -Source the file to apply the environment variables. For the Windows equivalent, refer to [this guide](https://bennett4.medium.com/windows-alternative-to-source-env-for-setting-environment-variables-606be2a6d3e1). -```shell -source local.env -``` - -Make sure the PostgreSQL database server is up and running. Initialize the application database for TestGen. -```shell -testgen setup-system-db --yes -``` - -### Run the application modules - -Run the following command to start TestGen. It will open the browser at [http://localhost:8501](http://localhost:8501). - -```shell -testgen run-app -``` - -Verify that you can login to the UI with the `TESTGEN_USERNAME` and `TESTGEN_PASSWORD` values that you configured in the environment variables. - -### Optional: Run the TestGen demo setup - -The [Data Observability quickstart](https://docs.datakitchen.io/tutorials/quickstart-demo/) walks you through DataOps Data Quality TestGen capabilities to demonstrate how it covers critical use cases for data and analytic teams. - -```shell -testgen quick-start -``` - -In the TestGen UI, you will see that new data profiling and test results have been generated. - -## Useful Commands - -The [dk-installer](https://github.com/DataKitchen/data-observability-installer/?tab=readme-ov-file#install-the-testgen-application) and [docker compose CLI](https://docs.docker.com/compose/reference/) can be used to operate the TestGen application installed using dk-installer. All commands must be run in the same folder that contains the `dk-installer.py` and `docker-compose.yml` files used by the installation. - -### Remove demo data - -After completing the quickstart, you can remove the demo data from the application with the following command. - -```shell -python3 dk-installer.py tg delete-demo -``` - -### Upgrade to latest version - -New releases of TestGen are announced on the `#releases` channel on [Data Observability Slack](https://data-observability-slack.datakitchen.io/join), and release notes can be found on the [DataKitchen documentation portal](https://docs.datakitchen.io/testgen/release-notes/). Use the following command to upgrade to the latest released version. - - ```shell - python3 dk-installer.py tg upgrade - ``` - -### Uninstall the application - -The following command uninstalls the Docker Compose application and removes all data, containers, and images related to TestGen from your machine. - -```shell -python3 dk-installer.py tg delete -``` - -### Access the _testgen_ CLI - -The [_testgen_ command line](https://docs.datakitchen.io/testgen/cli-reference/) can be accessed within the running container. - -```shell -docker compose exec engine bash -``` - -Use `exit` to return to the regular terminal. - -### Stop the application - -```shell -docker compose down -``` - -### Restart the application - -```shell -docker compose up -d -``` +* [Install on Mac/Linux](https://docs.datakitchen.io/testgen/get-started/install-on-mac-linux/) +* [Install on Windows](https://docs.datakitchen.io/testgen/get-started/install-on-windows/) ## What Next? From 4e7cc45c259949139f17097f8c5a5881767f3a64 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 14 May 2026 22:47:18 -0400 Subject: [PATCH 10/11] fix(standalone): resolve embedded host/port at connection-build time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Windows pgserver picks a fresh ephemeral TCP port on every startup, so the demo-DB connection row written during quick-start became stale as soon as run-app started a new pgserver session — "Test Connection" and any target-DB query failed with "connection refused". Store a sentinel in project_host instead of the live host/port; resolve_connection_params rewrites it to the live values when standalone mode is active. Single chokepoint covers UI test-connection, profiling, test execution, and quick-start increments. Co-Authored-By: Claude Opus 4.7 (1M context) --- testgen/commands/run_launch_db_config.py | 9 +++++---- testgen/commands/run_quick_start.py | 9 +++++---- testgen/common/database/flavor/flavor_service.py | 11 +++++++++-- testgen/common/standalone_postgres.py | 7 +++++++ 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index 11b2257f..f7b69bcb 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -9,7 +9,7 @@ from testgen.common.models import with_database_session from testgen.common.read_file import get_template_files from testgen.common.read_yaml_metadata_records import import_metadata_records_from_yaml -from testgen.common.standalone_postgres import get_target_host_port, is_standalone_mode +from testgen.common.standalone_postgres import EMBEDDED_HOST_SENTINEL, is_standalone_mode LOG = logging.getLogger("testgen") @@ -28,9 +28,10 @@ def _get_params_mapping() -> dict: project_user = settings.PROJECT_DATABASE_USER project_password = settings.PROJECT_DATABASE_PASSWORD if is_standalone_mode(): - project_host, server_port = get_target_host_port() - if server_port: - project_port = server_port + # Live host/port are resolved at connection-build time via the sentinel + # so the row survives pgserver picking a fresh ephemeral port on Windows. + project_host = EMBEDDED_HOST_SENTINEL + project_port = "" project_user = "postgres" project_password = "" diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py index e7a9a84d..5f7389bc 100644 --- a/testgen/commands/run_quick_start.py +++ b/testgen/commands/run_quick_start.py @@ -27,7 +27,7 @@ from testgen.common.models.table_group import TableGroup from testgen.common.notifications.base import smtp_configured from testgen.common.read_file import read_template_sql_file -from testgen.common.standalone_postgres import get_target_host_port, is_standalone_mode +from testgen.common.standalone_postgres import EMBEDDED_HOST_SENTINEL, is_standalone_mode LOG = logging.getLogger("testgen") random.seed(42) @@ -148,9 +148,10 @@ def _get_settings_params_mapping() -> dict: admin_user = settings.DATABASE_ADMIN_USER admin_password = settings.DATABASE_ADMIN_PASSWORD if is_standalone_mode(): - host, server_port = get_target_host_port() - if server_port: - port = server_port + # Live host/port are resolved at connection-build time via the sentinel + # so the row survives pgserver picking a fresh ephemeral port on Windows. + host = EMBEDDED_HOST_SENTINEL + port = "" admin_user = "postgres" admin_password = "" diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index a56ac8ba..dafc7d2a 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -61,13 +61,20 @@ def _decrypt_if_needed(value: Any) -> str | None: def resolve_connection_params(connection_params: ConnectionParams) -> ResolvedConnectionParams: sql_flavor = connection_params.get("sql_flavor") or "" + host = connection_params.get("project_host") or "" + port = connection_params.get("project_port") or "" + # Lazy import to keep the flavor layer free of standalone concerns at module load. + from testgen.common.standalone_postgres import EMBEDDED_HOST_SENTINEL, get_target_host_port, is_standalone_mode + if host == EMBEDDED_HOST_SENTINEL and is_standalone_mode(): + host, live_port = get_target_host_port() + port = live_port or "" return ResolvedConnectionParams( url=connection_params.get("url") or "", connect_by_url=connection_params.get("connect_by_url", False), username=connection_params.get("project_user") or "", password=_decrypt_if_needed(connection_params.get("project_pw_encrypted")), - host=connection_params.get("project_host") or "", - port=connection_params.get("project_port") or "", + host=host, + port=port, dbname=connection_params.get("project_db") or "", dbschema=connection_params.get("table_group_schema"), sql_flavor=sql_flavor, diff --git a/testgen/common/standalone_postgres.py b/testgen/common/standalone_postgres.py index d272eb94..be27e774 100644 --- a/testgen/common/standalone_postgres.py +++ b/testgen/common/standalone_postgres.py @@ -23,6 +23,13 @@ HOME_DIR_ENV_VAR = "TG_TESTGEN_HOME" STANDALONE_URI_ENV_VAR = "_TG_STANDALONE_URI" +# Stored as ``project_host`` in the demo-DB connection row so that the actual +# host/port — which can change across sessions on Windows (pgserver picks a +# fresh ephemeral TCP port each startup) — are resolved at connection-build +# time by ``resolve_connection_params``. Angle brackets are illegal in real +# hostnames, so this can't collide with a user-defined connection. +EMBEDDED_HOST_SENTINEL = "" + def get_home_dir() -> Path: env_dir = os.getenv(HOME_DIR_ENV_VAR) From c002b5346f31897423b42201d6859dbf82e3ecbb Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 14 May 2026 23:22:56 -0400 Subject: [PATCH 11/11] release: 5.32.2 -> 5.33.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3cae9aed..13924a31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "dataops-testgen" -version = "5.32.2" +version = "5.33.2" description = "DataKitchen's Data Quality DataOps TestGen" authors = [ { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" },