diff --git a/.github/actions/codeclone/README.md b/.github/actions/codeclone/README.md index 88dbc69..aa05a22 100644 --- a/.github/actions/codeclone/README.md +++ b/.github/actions/codeclone/README.md @@ -73,7 +73,7 @@ jobs: | Input | Default | Purpose | |-------------------------|---------------------------------|-------------------------------------------------------------------------------------------------------------------| -| `python-version` | `3.13` | Python version used to run the action | +| `python-version` | `3.14` | Python version used to run the action | | `package-version` | `""` | CodeClone version from PyPI for remote installs; ignored when the action runs from the checked-out CodeClone repo | | `path` | `.` | Project root to analyze | | `json-path` | `.cache/codeclone/report.json` | JSON report output path | diff --git a/.github/actions/codeclone/action.yml b/.github/actions/codeclone/action.yml index 7cc9975..2d0d1f1 100644 --- a/.github/actions/codeclone/action.yml +++ b/.github/actions/codeclone/action.yml @@ -13,7 +13,7 @@ inputs: python-version: description: "Python version" required: false - default: "3.13" + default: "3.14" package-version: description: "CodeClone version from PyPI for remote installs (ignored when the action runs from the checked-out CodeClone repo)" diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index f6f9ab7..259556f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -87,7 +87,7 @@ jobs: if: env.BENCH_ENABLED == '1' && runner.os == 'macOS' uses: actions/setup-python@v6.2.0 with: - python-version: "3.13" + python-version: "3.14" allow-prereleases: true - name: Set up uv (macOS local benchmark) @@ -98,7 +98,7 @@ jobs: - name: Install dependencies (macOS local benchmark) if: env.BENCH_ENABLED == '1' && runner.os == 'macOS' - run: uv sync --all-extras --dev + run: uv sync --extra dev - name: Set benchmark output path if: env.BENCH_ENABLED == '1' diff --git a/.github/workflows/codeclone.yml b/.github/workflows/codeclone.yml index d0566e4..23ce340 100644 --- a/.github/workflows/codeclone.yml +++ b/.github/workflows/codeclone.yml @@ -26,7 +26,7 @@ jobs: - name: Run CodeClone uses: ./.github/actions/codeclone with: - python-version: "3.13" + python-version: "3.14" fail-on-new: "true" fail-health: "60" sarif: "true" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index bcec725..85c616a 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -24,7 +24,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v6.2.0 with: - python-version: "3.13" + python-version: "3.14" allow-prereleases: true - name: Set up uv @@ -33,7 +33,7 @@ jobs: enable-cache: true - name: Install project dependencies - run: uv sync --dev + run: uv sync --extra dev - name: Configure GitHub Pages uses: actions/configure-pages@v5 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..73e561c --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,116 @@ +name: publish +run-name: >- + publish • ${{ github.event_name }} • + ${{ github.event.release.tag_name || inputs.repository || github.ref_name }} + +on: + release: + types: [published] + workflow_dispatch: + inputs: + repository: + description: Target package index + required: true + default: testpypi + type: choice + options: + - testpypi + - pypi + +permissions: + contents: read + +concurrency: + group: publish-${{ github.event.release.tag_name || github.ref }} + 
cancel-in-progress: false + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6.0.2 + + - name: Set up Python + uses: actions/setup-python@v6.2.0 + with: + python-version: "3.14" + allow-prereleases: true + + - name: Set up uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Verify release tag matches project version + if: ${{ github.event_name == 'release' }} + shell: bash + run: | + set -euo pipefail + project_version="$(python - <<'PY' + import pathlib, tomllib + payload = tomllib.loads(pathlib.Path("pyproject.toml").read_text(encoding="utf-8")) + print(payload["project"]["version"]) + PY + )" + release_tag="${{ github.event.release.tag_name }}" + normalized_tag="${release_tag#v}" + if [ "$normalized_tag" != "$project_version" ]; then + echo "release tag $release_tag does not match project version $project_version" >&2 + exit 1 + fi + + - name: Build distributions + run: uv run --with build python -m build --sdist --wheel + + - name: Validate distributions + run: uv run --with twine twine check dist/* + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + if-no-files-found: error + + publish-testpypi: + if: ${{ github.event_name == 'workflow_dispatch' && inputs.repository == 'testpypi' }} + needs: build + runs-on: ubuntu-latest + environment: testpypi + permissions: + contents: read + id-token: write + steps: + - name: Download distributions + uses: actions/download-artifact@v5 + with: + name: python-package-distributions + path: dist/ + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + + publish-pypi: + if: >- + ${{ + github.event_name == 'release' || + (github.event_name == 'workflow_dispatch' && inputs.repository == 'pypi') + }} + needs: build + runs-on: ubuntu-latest + environment: pypi + permissions: + contents: read + id-token: write + steps: + - name: Download distributions + uses: actions/download-artifact@v5 + with: + name: python-package-distributions + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d07ce3f..6acb4a4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -35,7 +35,7 @@ jobs: enable-cache: true - name: Install dependencies - run: uv sync --all-extras --dev + run: uv sync --extra dev --extra mcp - name: Run tests # Smoke CLI tests intentionally disable subprocess coverage collection @@ -43,11 +43,11 @@ jobs: run: uv run pytest --cov=codeclone --cov-report=term-missing --cov-fail-under=99 - name: Verify baseline exists - if: ${{ matrix.python-version == '3.13' }} + if: ${{ matrix.python-version == '3.14' }} run: test -f codeclone.baseline.json - name: Check for new clones vs baseline - if: ${{ matrix.python-version == '3.13' }} + if: ${{ matrix.python-version == '3.14' }} run: uv run codeclone . --ci lint: @@ -59,7 +59,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v6.2.0 with: - python-version: "3.13" + python-version: "3.14" - name: Set up uv uses: astral-sh/setup-uv@v5 @@ -67,7 +67,7 @@ jobs: enable-cache: true - name: Install dependencies - run: uv sync --all-extras --dev + run: uv sync --extra dev --extra mcp - name: Ruff run: uv run ruff check . 
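The `Verify release tag matches project version` step in `publish.yml` above can be reproduced locally before pushing a tag. A minimal Python sketch of the same check, assuming `pyproject.toml` at the repo root (the helper name and example tag are invented for illustration):

```python
import pathlib
import tomllib  # stdlib since Python 3.11


def tag_matches_project_version(tag: str, pyproject: str = "pyproject.toml") -> bool:
    """Mirror the workflow guard: strip a leading 'v', then compare exactly."""
    payload = tomllib.loads(pathlib.Path(pyproject).read_text(encoding="utf-8"))
    project_version: str = payload["project"]["version"]
    return tag.removeprefix("v") == project_version


# A "v2.0.0b6" tag passes only when pyproject.toml declares
# version = "2.0.0b6"; any mismatch should block the publish job.
print(tag_matches_project_version("v2.0.0b6"))
```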
diff --git a/.gitignore b/.gitignore index e3ad2eb..71bd32f 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,8 @@ site/ /package-lock.json extensions/vscode-codeclone/node_modules /coverage.xml +/.cgcignore +/mcp.json +/scripts/refactor_guard.sh +/docs/refactoring-spec.md +/smoke_cli.sh diff --git a/AGENTS.md b/AGENTS.md index 16e579d..a645b79 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -137,18 +137,18 @@ uv run pytest -q tests/test_codex_plugin.py ### Versioned constants (single source of truth) -All schema/version constants live in `codeclone/contracts.py`. **Always read them from code, never copy +All schema/version constants live in `codeclone/contracts/__init__.py`. **Always read them from code, never copy from another doc.** Current values (verified at write time): -| Constant | Source | Current value | -|-----------------------------------|------------------------------|---------------| -| `BASELINE_SCHEMA_VERSION` | `codeclone/contracts.py` | `2.1` | -| `BASELINE_FINGERPRINT_VERSION` | `codeclone/contracts.py` | `1` | -| `CACHE_VERSION` | `codeclone/contracts.py` | `2.5` | -| `REPORT_SCHEMA_VERSION` | `codeclone/contracts.py` | `2.8` | -| `METRICS_BASELINE_SCHEMA_VERSION` | `codeclone/contracts.py` | `1.2` | +| Constant | Source | Current value | +|-----------------------------------|-----------------------------------|---------------| +| `BASELINE_SCHEMA_VERSION` | `codeclone/contracts/__init__.py` | `2.1` | +| `BASELINE_FINGERPRINT_VERSION` | `codeclone/contracts/__init__.py` | `1` | +| `CACHE_VERSION` | `codeclone/contracts/__init__.py` | `2.6` | +| `REPORT_SCHEMA_VERSION` | `codeclone/contracts/__init__.py` | `2.10` | +| `METRICS_BASELINE_SCHEMA_VERSION` | `codeclone/contracts/__init__.py` | `1.2` | -When updating any doc that mentions a version, re-read `codeclone/contracts.py` first. Do not derive +When updating any doc that mentions a version, re-read `codeclone/contracts/__init__.py` first. Do not derive versions from another document. ### Baseline file structure (canonical) @@ -162,7 +162,7 @@ versions from another document. }, "schema_version": "2.1", "fingerprint_version": "1", - "python_tag": "cp313", + "python_tag": "cp314", "created_at": "2026-02-08T14:20:15Z", "payload_sha256": "…" }, @@ -181,7 +181,7 @@ versions from another document. - `schema_version` is **baseline schema**, not package version. - Runtime writes baseline schema `2.1`. - Runtime accepts baseline schema `1.0` and `2.0`–`2.1` (governed by - `_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR` in `codeclone/baseline.py`). + `_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR` in `codeclone/baseline/trust.py`). - Compatibility is tied to: - `fingerprint_version` - `python_tag` @@ -367,24 +367,29 @@ Before cutting a release: Architecture is layered, but grounded in current code (not aspirational diagrams): -- **CLI / orchestration surface** (`codeclone/cli.py`, `codeclone/_cli_*.py`) parses args, resolves runtime mode, - coordinates pipeline calls, and prints UX. -- **Pipeline orchestrator** (`codeclone/pipeline.py`) owns end-to-end flow: bootstrap → discovery → processing → - analysis → report artifacts → gating. -- **Core analysis** (`codeclone/extractor.py`, `codeclone/cfg.py`, `codeclone/normalize.py`, `codeclone/blocks.py`, - `codeclone/grouping.py`, `codeclone/scanner.py`) produces normalized structural facts and clone candidates. 
-- **Domain/contracts layer** (`codeclone/models.py`, `codeclone/contracts.py`, `codeclone/errors.py`, - `codeclone/domain/*.py`) defines typed entities and stable enums/constants used across layers. -- **Persistence contracts** (`codeclone/baseline.py`, `codeclone/cache.py`, `codeclone/cache_io.py`, - `codeclone/metrics_baseline.py`) store trusted comparison state and optimization state. -- **Canonical report + projections** (`codeclone/report/json_contract.py`, `codeclone/report/*.py`) converts analysis - facts to deterministic, contract-shaped outputs. -- **HTML/UI rendering** (`codeclone/html_report.py`, `codeclone/_html_report/*`, `codeclone/_html_*.py`, - `codeclone/templates.py`) renders views from report/meta facts. +- **CLI entry + orchestration surface** (`codeclone/main.py`, `codeclone/surfaces/cli/*`, `codeclone/ui_messages/*`) + owns argument parsing, runtime/config resolution, summaries, report writes, and exit routing. +- **Config layer** (`codeclone/config/*`) is the single source of truth for option specs, parser construction, + `pyproject.toml` loading, and CLI > pyproject > defaults resolution. +- **Core orchestration** (`codeclone/core/*`) owns bootstrap → discovery → worker processing → project metrics → + report/gate integration. It does not own shell UX. +- **Analysis layer** (`codeclone/analysis/*`, `codeclone/blocks/*`, `codeclone/paths/*`, `codeclone/qualnames/*`) + parses source, normalizes AST/CFG facts, extracts units, and prepares deterministic analysis inputs. +- **Clone/finding derivation layer** (`codeclone/findings/*`, `codeclone/metrics/*`) groups clones and computes + structural and quality signals from already-extracted facts. +- **Domain/contracts layer** (`codeclone/models.py`, `codeclone/contracts/*`, `codeclone/domain/*`) defines typed + entities, enums, schema/version constants, and typed exceptions used across layers. +- **Persistence contracts** (`codeclone/baseline/*`, `codeclone/cache/*`) store trusted comparison state and + optimization state. They are contracts, not analysis truth. +- **Canonical report + projections** (`codeclone/report/document/*`, `codeclone/report/gates/*`, + `codeclone/report/renderers/*`, `codeclone/report/*.py`) converts analysis facts into deterministic report payloads + and deterministic projections. +- **HTML/UI rendering** (`codeclone/report/html/*`) renders views from canonical report/meta + facts. HTML is render-only. +- **MCP agent interface** (`codeclone/surfaces/mcp/*`) exposes the same pipeline/report contracts as a deterministic, + read-only MCP surface for AI agents and MCP-capable clients. - **Documentation/publishing surface** (`docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, `scripts/build_docs_example_report.py`) publishes contract docs and the live sample report. -- **MCP agent interface** (`codeclone/mcp_service.py`, `codeclone/mcp_server.py`) exposes the current pipeline as a - deterministic, read-only MCP server for AI agents and MCP-capable clients. - **VS Code extension surface** (`extensions/vscode-codeclone/*`) is a native, workspace-only IDE client over `codeclone-mcp`, with baseline-aware, triage-first, source-first review UX. - **Claude Desktop bundle surface** (`extensions/claude-desktop-codeclone/*`) is a native `.mcpb` install wrapper for @@ -409,51 +414,53 @@ Non-negotiable interpretation: Use this map to route changes to the right owner module. 
-- `codeclone/cli.py` — public CLI entry and control-flow coordinator; add orchestration and top-level UX here; do not - move core analysis logic here. -- `codeclone/_cli_*.py` — CLI support slices (args, config, runtime, summary, reports, baselines, gating); keep them - thin and reusable; do not encode domain semantics that belong to pipeline/core/contracts. -- `codeclone/pipeline.py` — canonical orchestration and data plumbing between scanner/extractor/metrics/report/gating; - change integration flow here; do not move HTML-only presentation logic here. -- `codeclone/extractor.py` — AST extraction, CFG fingerprint input preparation, symbol/declaration collection, and - per-file metrics inputs; change parsing/extraction semantics here; do not couple this module to CLI/report - rendering/baseline logic. -- `codeclone/grouping.py` / `codeclone/blocks.py` — clone grouping and block/segment mechanics; normalization-adjacent - statement hashing lives with `codeclone/normalize.py`; do not mix grouping behavior with CLI/report UX concerns. -- `codeclone/metrics/` — metric computations and dead-code/dependency/health logic; change metric math and thresholds - here; do not make metrics depend on renderer/UI concerns. -- `codeclone/structural_findings.py` — structural finding extraction/normalization policy; keep it report-layer factual +- `codeclone/main.py` — public CLI entrypoint only. Keep it tiny. +- `codeclone/surfaces/cli/workflow.py` — top-level CLI orchestration and exit routing. Add CLI control flow here, not + in `main.py`. +- `codeclone/surfaces/cli/*` — CLI support slices (startup, runtime, execution, post-run handling, summaries, + reports, changed-scope logic, baseline state, console helpers). Keep them orchestration/UX-focused. +- `codeclone/config/*` — parser construction, option specs/defaults, pyproject loading, config resolution. Do not + duplicate option semantics elsewhere. +- `codeclone/core/*` — canonical runtime pipeline and payload plumbing. Change integration flow here; do not move shell + UX or HTML-only logic here. +- `codeclone/analysis/*` — AST parsing, CFG/fingerprint preparation, declaration/reference collection, and unit + extraction. Change parsing/extraction semantics here; keep it independent from CLI/report/baseline UX. +- `codeclone/findings/clones/grouping.py` + `codeclone/blocks/*` — clone grouping and block/segment mechanics. +- `codeclone/findings/structural/detectors.py` — structural finding extraction/normalization policy; keep it factual and deterministic. -- `codeclone/suppressions.py` — inline `# codeclone: ignore[...]` parse/bind/index logic; keep it declaration-scoped and - deterministic. -- `codeclone/baseline.py` — baseline schema/trust/integrity/compatibility contract; all baseline format changes go here - with explicit contract process. -- `codeclone/cache.py` — cache schema/status/profile compatibility and high-level serialization policy; cache remains - optimization-only. -- `codeclone/cache_io.py` — IO-layer helpers for the cache: atomic JSON read/write - (`read_json_document`, `write_json_document_atomically`), canonical JSON (`canonical_json`), and - HMAC signing/verification (`sign_cache_payload`, `verify_cache_payload_signature`); attribute these - functions to `cache_io.py`, not `cache.py`. -- `codeclone/report/json_contract.py` — canonical report schema builder/integrity payload; any JSON contract shape - change belongs here. 
-- `codeclone/report/*.py` (other modules) — deterministic projections/format transforms ( - text/markdown/sarif/derived/findings/suggestions); avoid injecting new analysis heuristics here. -- `codeclone/mcp_service.py` — typed, in-process MCP service adapter over the current pipeline/report contracts; keep - it deterministic; allow only session-local in-memory state such as reviewed markers, and never move shell UX or - `sys.exit` behavior here. -- `codeclone/mcp_server.py` — optional MCP launcher/server wiring, transport config, and MCP tool/resource +- `codeclone/metrics/*` — metric computations and dead-code/dependency/health logic; change metric math and thresholds + here; do not make metrics depend on renderer/UI concerns. +- `codeclone/analysis/suppressions.py` — inline `# codeclone: ignore[...]` parse/bind/index logic; keep it + declaration-scoped and deterministic. +- `codeclone/findings/clones/golden_fixtures.py` — golden-fixture clone exclusion policy and suppressed-clone bucket + shaping; keep it clone-derivation-only and deterministic. +- `codeclone/baseline/clone_baseline.py` + `codeclone/baseline/trust.py` — clone baseline schema/trust/integrity/ + compatibility contract; all clone-baseline format changes go here with explicit contract process. +- `codeclone/baseline/metrics_baseline.py` + `codeclone/baseline/_metrics_baseline_*` — metrics-baseline schema, + validation, payload hashing, and unified-baseline merge logic. +- `codeclone/cache/store.py`, `codeclone/cache/versioning.py`, `codeclone/cache/integrity.py`, + `codeclone/cache/_wire_*`, `codeclone/cache/projection.py` — cache schema/status/profile compatibility, canonical + JSON/signing, wire encoding/decoding, and segment projection persistence. Cache remains optimization-only. +- `codeclone/report/document/*` — canonical report schema builder and integrity payload. Any JSON contract shape change + belongs here. +- `codeclone/report/renderers/*` — deterministic text/markdown/SARIF/JSON projections over the canonical report. +- `codeclone/report/html/*` — actual HTML assembly, context shaping, tabs, sections, widgets, CSS/JS/escaping, and + snippets. Change report layout and interactive HTML UX here, not in report builders. +- `codeclone/report/gates/*` — metric-gate reason derivation over canonical metrics state. +- `codeclone/report/*.py` (other modules) — deterministic report support slices such as explainability, suggestions, + merge, overview, findings helpers, and source-kind routing. +- `codeclone/surfaces/mcp/service.py` — typed, in-process MCP service over the current pipeline/report contracts; + keep it deterministic and read-only except for session-local in-memory markers. +- `codeclone/surfaces/mcp/server.py` — optional MCP launcher/server wiring, transport config, and MCP tool/resource registration; keep dependency loading lazy so base installs/CI do not require MCP runtime packages. - `tests/test_mcp_service.py`, `tests/test_mcp_server.py` — MCP contract and integration tests; run these when touching any MCP surface. -- `codeclone/html_report.py` — public HTML facade/re-export surface; preserve backward-compatible imports here; do not - grow section/layout logic in this module. -- `codeclone/_html_report/*` — actual HTML assembly, context shaping, tabs, sections, and overview/navigation behavior; - change report layout and interactive HTML UX here, not in the facade. -- `codeclone/_html_*.py` — shared HTML badges, CSS, JS, escaping, snippets, and data-attrs; keep these as render-only - helpers. 
+- `codeclone/contracts/*` — version constants, schema types, exit enum, URLs, and typed exceptions. Treat as contract + surface. - `codeclone/models.py` — shared typed models crossing modules; keep model changes contract-aware. - `codeclone/domain/*.py` — centralized domain taxonomies/IDs (families, categories, source scopes, risk/severity levels); use these constants in pipeline/report/UI instead of scattering raw literals. +- `codeclone/ui_messages/*` — CLI text/marker/help constants and formatter helpers. Keep message policy centralized. - `docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, `scripts/build_docs_example_report.py` — docs-site source, publication workflow, and live sample-report generation; keep published docs aligned with code contracts. - `extensions/vscode-codeclone/*` — preview VS Code extension surface; keep it baseline-aware, triage-first, @@ -468,19 +475,22 @@ Use this map to route changes to the right owner module. Dependency direction is enforceable and partially test-guarded (`tests/test_architecture.py`): -- `codeclone.report.*` must not import `codeclone.cli`, `codeclone.html_report`, or `codeclone.ui_messages`. -- `codeclone.extractor` must not import `codeclone.report`, `codeclone.cli`, or `codeclone.baseline`. -- `codeclone.grouping` must not import `codeclone.cli`, `codeclone.baseline`, or `codeclone.html_report`. -- `codeclone.baseline` and `codeclone.cache` must not import `codeclone.cli`, `codeclone.ui_messages`, or - `codeclone.html_report`. -- `codeclone.models` may import only `codeclone.contracts` and `codeclone.errors` from local modules. +- `codeclone.report.*` must not import `codeclone.ui_messages`, `codeclone.surfaces.cli`, or HTML consumers outside + `codeclone.report.html.*`. +- `codeclone.baseline` and `codeclone.cache` must not import `codeclone.surfaces.cli`, `codeclone.ui_messages`, or + `codeclone.report.html`. +- `codeclone.core` must not import `codeclone.surfaces.*` or `codeclone.config`. +- `codeclone.analysis`, `codeclone.findings`, and `codeclone.metrics` must not import `codeclone.surfaces.*`; analysis + and findings must also stay independent from config/report-builder wiring. +- `codeclone.models` may import only `codeclone.contracts` from local modules. +- `codeclone.domain.*` must remain leaf domain modules. Operational rules: -- Core/domain code must not depend on HTML/UI. -- Renderers depend on canonical report payload/model; canonical report code must not depend on renderer/UI. +- Core/domain code must not depend on HTML/UI or MCP. +- Renderers depend on canonical report payload/model; canonical report builders must not depend on renderer/UI. - Metrics/report layers must not recompute or invent core facts in UI. -- CLI helper modules (`_cli_*`) must orchestrate/format, not own domain semantics. +- CLI support modules under `codeclone/surfaces/cli/*` must orchestrate/format, not own domain semantics. - Persistence semantics (baseline/cache trust/integrity) must stay in persistence/domain modules, not in render/UI layers. - MCP may depend on pipeline/report/contracts, but core/persistence/report layers must not depend on MCP modules. @@ -489,7 +499,7 @@ Operational rules: Inline suppressions are explicit local policy, not analysis truth. -- Supported syntax is `# codeclone: ignore[rule-id,...]` via `codeclone/suppressions.py`. +- Supported syntax is `# codeclone: ignore[rule-id,...]` via `codeclone/analysis/suppressions.py`. 
- Binding scope is declaration-only (`def`, `async def`, `class`) using: - leading comment on the line immediately before declaration - inline comment on the declaration header start line @@ -510,13 +520,13 @@ If you change a contract-sensitive zone, route docs/tests/approval deliberately. | Change zone | Must update docs | Must update tests | Explicit approval required when | Contract-change trigger | |-------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------| -| Baseline schema/trust/integrity (`codeclone/baseline.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change | -| Cache schema/profile/integrity (`codeclone/cache.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change | -| Canonical report JSON shape (`codeclone/report/json_contract.py`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `docs/sarif.md` when SARIF changes, `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change | -| CLI flags/help/exit behavior (`codeclone/cli.py`, `_cli_*`, `contracts.py`) | `docs/book/09-cli.md`, `docs/book/03-contracts-exit-codes.md`, `README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes | +| Baseline schema/trust/integrity (`codeclone/baseline/clone_baseline.py`, `codeclone/baseline/trust.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change | +| Cache schema/profile/integrity (`codeclone/cache/store.py`, `codeclone/cache/versioning.py`, `codeclone/cache/integrity.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | 
`tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change | +| Canonical report JSON shape (`codeclone/report/document/*`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `docs/sarif.md` when SARIF changes, `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change | +| CLI flags/help/exit behavior (`codeclone/main.py`, `codeclone/surfaces/cli/*`, `codeclone/config/*`, `codeclone/contracts/*`) | `docs/book/09-cli.md`, `docs/book/03-contracts-exit-codes.md`, `README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes | | Fingerprint-adjacent analysis (`extractor/cfg/normalize/grouping`) | `docs/book/05-core-pipeline.md`, `docs/cfg.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_fingerprint.py`, `tests/test_extractor.py`, `tests/test_cfg.py`, golden tests (`tests/test_detector_golden.py`, `tests/test_golden_v2.py`) | always (see Section 1.6) | clone identity / NEW-vs-KNOWN / fingerprint inputs change | -| Suppression semantics/reporting (`suppressions`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload | -| MCP interface (`codeclone/mcp_service.py`, `codeclone/mcp_server.py`, packaging extra/launcher) | `README.md`, `docs/book/20-mcp-interface.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_mcp_service.py`, `tests/test_mcp_server.py`, plus CLI/package tests if launcher/install semantics change | tool/resource shapes, read-only semantics, optional-dependency packaging behavior change | public MCP tool names, resource URIs, launcher/install behavior, or response semantics change | +| Suppression semantics/reporting (`codeclone/analysis/suppressions.py`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload | +| MCP interface (`codeclone/surfaces/mcp/*`, packaging extra/launcher) | `README.md`, `docs/book/20-mcp-interface.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_mcp_service.py`, `tests/test_mcp_server.py`, plus CLI/package tests if 
launcher/install semantics change | tool/resource shapes, read-only semantics, optional-dependency packaging behavior change | public MCP tool names, resource URIs, launcher/install behavior, or response semantics change | | VS Code extension surface (`extensions/vscode-codeclone/*`) | `README.md`, `docs/book/21-vscode-extension.md`, `docs/vscode-extension.md`, `docs/book/01-architecture-map.md`, `docs/README.md`, `CHANGELOG.md` | `node --check extensions/vscode-codeclone/src/support.js`, `node --check extensions/vscode-codeclone/src/mcpClient.js`, `node --check extensions/vscode-codeclone/src/extension.js`, `node --test extensions/vscode-codeclone/test/*.test.js`, plus local extension-host smoke and package smoke when surface/manifest/assets change | command/view UX, trust/runtime model, source-first review flow, or packaging metadata change | documented commands/views/setup/trust behavior, packaged assets, or publish metadata change | | Claude Desktop bundle surface (`extensions/claude-desktop-codeclone/*`) | `docs/book/22-claude-desktop-bundle.md`, `docs/claude-desktop-bundle.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/README.md`, `CHANGELOG.md` | `node --check extensions/claude-desktop-codeclone/server/index.js`, `node --check extensions/claude-desktop-codeclone/src/launcher.js`, `node --check extensions/claude-desktop-codeclone/scripts/build-mcpb.mjs`, `node --test extensions/claude-desktop-codeclone/test/*.test.js`, plus `.mcpb` build smoke | bundle install/runtime model, launcher UX, local-stdio constraints, or bundle metadata change | documented Claude Desktop install/setup/runtime behavior or packaged bundle semantics change | | Codex plugin surface (`plugins/codeclone/*`, `.agents/plugins/marketplace.json`) | `docs/book/23-codex-plugin.md`, `docs/codex-plugin.md`, `docs/mcp.md`, `docs/book/01-architecture-map.md`, `docs/README.md`, `CHANGELOG.md` | `python3 -m json.tool plugins/codeclone/.codex-plugin/plugin.json`, `python3 -m json.tool plugins/codeclone/.mcp.json`, `python3 -m json.tool .agents/plugins/marketplace.json`, `tests/test_codex_plugin.py` | plugin discovery/runtime model, bundled MCP config, bundled skill behavior, or plugin metadata change | documented Codex plugin install/discovery/runtime behavior or plugin manifest/marketplace semantics change | @@ -564,8 +574,9 @@ Policy: ### Internal implementation surfaces -- Local helpers and formatting utilities (`_html_*`, many private `_as_*` normalizers, local transformers). -- Internal orchestration decomposition inside `_cli_*` modules. +- Local helpers and formatting utilities (`codeclone/report/html/widgets/*`, + `codeclone/report/html/primitives/*`, many private `_as_*` normalizers, local transformers). +- Internal orchestration decomposition inside `codeclone/surfaces/cli/*`. - Private utility refactors that do not change public payloads, exit semantics, ordering, or trust rules. If classification is ambiguous, treat it as contract-sensitive and add tests/docs before merging. @@ -660,7 +671,7 @@ These rules exist because of real incidents in this repo. They are non-negotiabl - Every doc claim about code (schema version, module path, function name, MCP tool count, exit code, CLI flag) must be verified against the **current** code before writing or editing. -- Always read version constants from `codeclone/contracts.py` (see Section 4 table), never from +- Always read version constants from `codeclone/contracts/__init__.py` (see Section 4 table), never from another doc. 
- When updating a file that mentions schema versions, verify **every** version reference in that file — not only the one you came to change. @@ -678,10 +689,11 @@ These rules exist because of real incidents in this repo. They are non-negotiabl ### Shared helpers -- HTML/UI helpers (`_html_badges.py`, `_html_css.py`, `_html_js.py`, `_html_escape.py`, - `_html_report/_glossary.py`) are imported, not duplicated locally inside `_html_report/_sections/*`. +- HTML/UI helpers (`codeclone/report/html/widgets/*`, `codeclone/report/html/primitives/*`, + `codeclone/report/html/assets/*`) are imported, not duplicated locally inside + `codeclone/report/html/sections/*`. If you need a helper that doesn't exist, add it to the shared module. -- Glossary terms used in stat-card labels live in `codeclone/_html_report/_glossary.py`. Adding a +- Glossary terms used in stat-card labels live in `codeclone/report/html/widgets/glossary.py`. Adding a new label without a glossary entry is a contract gap. ### Conflict avoidance @@ -699,7 +711,7 @@ These rules exist because of real incidents in this repo. They are non-negotiabl - A task that touches MCP is not complete until `pytest tests/test_mcp_service.py tests/test_mcp_server.py -x -q` is green. - A task that touches docs schema/version claims is not complete until you have grep'd the whole - file for *all* version-shaped strings and verified each against `codeclone/contracts.py`. + file for *all* version-shaped strings and verified each against `codeclone/contracts/__init__.py`. --- diff --git a/CHANGELOG.md b/CHANGELOG.md index db54f6e..24dc77c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,52 @@ # Changelog +## [2.0.0b6] - 2026-04-28 + +The global package refactor lands here: the entire runtime moves onto the +canonical module layout and legacy shims are removed for good. On top of that, +dependency-depth scoring is replaced with an adaptive project-relative model, +and the report/cache contracts advance to surface the new depth profile and the +report-only `security_surfaces` layer. + +### Package layout and contracts + +- Move the runtime fully onto the canonical package layout: `main` + `surfaces/cli`, `surfaces/mcp`, `core`, `analysis`, + `baseline`, `cache`, `contracts`, `report/document`, `report/renderers`, and `report/html`. +- Remove remaining legacy root shims and stale compatibility modules in favor of direct canonical imports. +- Remove stale deleted-file cache entries and trim post-refactor import tails that were inflating dependency depth and + clone pressure. +- Bump report schema to `2.10` and cache schema to `2.6` for additive dependency depth profile fields and + `security_surfaces` facts; keep clone baseline schema `2.1` and metrics-baseline schema `1.2` unchanged. +- Preserve deterministic contracts and read-only MCP semantics across the new layout. + +### Dependency depth scoring + +- Replace the old fixed dependency-depth penalty (`max_depth > 8`) with an adaptive internal-graph profile based on + `avg_depth`, `p95_depth`, and `max_depth`. +- Keep dependency cycles as the hard signal; treat acyclic depth as adaptive pressure relative to the project's own + dependency profile. +- Limit dependency-depth scoring to the internal module graph instead of external imports such as `typing` or + `argparse`. +- Surface the dependency depth profile in the canonical report, HTML Dependencies tab, and CLI/CI summaries. 
+ +### Security surfaces + +- Add `metrics.families.security_surfaces`: a report-only exact inventory of security-relevant capability surfaces and + trust-boundary code. +- Surface compact `security_surfaces` facts in canonical report JSON, CLI Metrics, HTML Quality, text/markdown + projections, and MCP summaries / `metrics_detail`. +- Keep the layer honest: no vulnerability claims, no score impact, no gates, no SARIF security findings, and no baseline + truth. + +### Tooling, docs, and UX + +- Refresh AGENTS, docs/book, and changelog content for the b6 package layout and report schema `2.10`. +- Tighten preview client metadata and install guidance for VS Code, Claude Desktop, and Codex. +- Replace the Codex plugin shell snippet with a repo-local shell-free launcher, and parallelize VS Code post-run MCP + artifact hydration. +- Add a quiet one-time VS Code extension hint in interactive VS Code terminals, tracked per CodeClone version next to + the resolved project cache path. + ## [2.0.0b5] - 2026-04-16 Expands the canonical contract with adoption, API-surface, and coverage-join layers; clarifies run interpretation @@ -21,7 +68,8 @@ across MCP/HTML/clients; tightens MCP launcher/runtime behavior. `--fail-on-docstring-regression`, `--fail-on-api-break`, `--fail-on-untested-hotspots`, `--coverage-min`. - Surface adoption/API/coverage-join in MCP, CLI Metrics, report payloads, and HTML (Overview + Quality subtab). - Preserve embedded metrics and optional `api_surface` in unified baselines. -- Cache `2.5`: make analysis-profile compatibility API-surface-aware; invalidate stale non-API warm caches; preserve parameter order; align warm/cold API diffs. +- Cache `2.5`: make analysis-profile compatibility API-surface-aware; invalidate stale non-API warm caches; preserve + parameter order; align warm/cold API diffs. ### MCP, HTML, and client interpretation diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index af4bb11..cd105f6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,7 +49,7 @@ When reporting issues related to clone detection, include: - minimal reproducible code snippets (preferred over screenshots); - the CodeClone version; -- the Python version (`python_tag`, e.g. `cp313`); +- the Python version (`python_tag`, e.g. `cp314`); - whether the issue is primarily: - AST-related, - CFG-related, diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..9f3d32f --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Denis Rozhnovskiy + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/LICENSE-docs b/LICENSE-docs deleted file mode 100644 index 66b3e88..0000000 --- a/LICENSE-docs +++ /dev/null @@ -1,25 +0,0 @@ -MIT License - -Copyright (c) 2024 Denis Rozhnovskiy - -This license applies to documentation in this repository, including the -`docs/` tree and Markdown documentation files, unless a file states -otherwise. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this documentation and associated files (the "Documentation"), to deal -in the Documentation without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Documentation, and to permit persons to whom the -Documentation is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Documentation. - -THE DOCUMENTATION IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE DOCUMENTATION OR THE USE OR OTHER DEALINGS IN -THE DOCUMENTATION. diff --git a/LICENSES.md b/LICENSES.md new file mode 100644 index 0000000..ed0a973 --- /dev/null +++ b/LICENSES.md @@ -0,0 +1,22 @@ +# License Scope + +CodeClone uses a dual-license layout in this repository. + +## Default mapping + +- Source code and other implementation files are licensed under + [MPL-2.0](LICENSE). +- Documentation content, including the `docs/` tree and published docs-site + content, is licensed under [MIT](LICENSE-MIT). + +## File-level overrides + +If a file or bundled third-party artifact includes its own license notice, that +file-level notice takes precedence over this default mapping. + +## Notes + +- Keep [LICENSE](LICENSE) and [LICENSE-MIT](LICENSE-MIT) as canonical license + texts for tooling and GitHub license detection. +- Use this file to describe scope, not to redefine the underlying license + texts. diff --git a/README.md b/README.md index 0284773..047d97e 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,18 @@
  [README header logo markup; image HTML stripped during extraction, only the alt text "CodeClone" survives on both sides of the hunk]
@@ -16,7 +26,7 @@
  [badge row, image HTML stripped: Tests · Benchmark · Python · License]
-  codeclone 89 (B)
+  codeclone 90 (A)
@@ -25,7 +35,7 @@ CodeClone provides deterministic structural code quality analysis for Python. It detects architectural duplication, computes quality metrics, and enforces CI gates — all with **baseline-aware governance** that separates **known** technical debt from **new** regressions. -An optional MCP interface exposes the same canonical analysis pipeline to AI agents and IDEs. +A triage-first MCP control surface exposes the same canonical pipeline to AI agents and IDEs. Docs: [orenlab.github.io/codeclone](https://orenlab.github.io/codeclone/) · Live sample report: @@ -42,13 +52,13 @@ Live sample report: - **Clone detection** — function (CFG fingerprint), block (statement windows), and segment (report-only) clones - **Structural findings** — duplicated branch families, clone guard/exit divergence, and clone-cohort drift -- **Quality metrics** — cyclomatic complexity, coupling (CBO), cohesion (LCOM4), dependency cycles, dead code, - health score, and overloaded-module profiling +- **Quality metrics** — cyclomatic complexity, coupling (CBO), cohesion (LCOM4), dependency cycles, adaptive depth + profile, dead code, health score, and overloaded-module profiling - **Adoption & API** — type/docstring annotation coverage, public API surface inventory and baseline diff - **Coverage Join** — fuse external Cobertura XML into the current run to surface coverage hotspots and scope gaps - **Baseline governance** — separates accepted **legacy** debt from **new regressions**; CI fails only on what changed - **Reports** — interactive HTML, JSON, Markdown, SARIF, and text from one canonical report -- **MCP server** — optional read-only surface for AI agents and IDEs +- **MCP control surface** — triage-first agent and IDE interface over the same canonical pipeline; read-only by contract - **IDE & agent clients** — VS Code extension, Claude Desktop bundle, and Codex plugin over the same MCP contract - **CI-first** — deterministic output, stable ordering, exit code contract, pre-commit support - **Fast** — incremental caching, parallel processing, warm-run optimization @@ -169,16 +179,21 @@ repos: types: [ python ] ``` -## MCP Server +## MCP Control Surface -Optional read-only MCP server for AI agents and IDE clients. -Never mutates source, baselines, or repo state. +Triage-first MCP server for AI agents and IDE clients, built on the same canonical pipeline as the CLI. Read-only by +contract: never mutates source, baselines, or repo state. 
```bash -uv tool install --pre "codeclone[mcp]" # or: uv pip install --pre "codeclone[mcp]" +uv tool install --pre "codeclone[mcp]" +# or +uv pip install --pre "codeclone[mcp]" -codeclone-mcp --transport stdio # local (Claude Code, Codex, Copilot, Gemini CLI) -codeclone-mcp --transport streamable-http # remote / HTTP-only clients +# local stdio clients +codeclone-mcp --transport stdio + +# remote / HTTP-only clients +codeclone-mcp --transport streamable-http ``` [MCP usage guide](https://orenlab.github.io/codeclone/mcp/) · @@ -192,7 +207,7 @@ codeclone-mcp --transport streamable-http # remote / HTTP-only clients | **Claude Desktop bundle** | [`extensions/claude-desktop-codeclone/`](https://github.com/orenlab/codeclone/tree/main/extensions/claude-desktop-codeclone) | Local `.mcpb` install with pre-loaded instructions | | **Codex plugin** | [`plugins/codeclone/`](https://github.com/orenlab/codeclone/tree/main/plugins/codeclone) | Native discovery, two skills, and MCP definition | -All three are thin wrappers over the same `codeclone-mcp` contract — no second analysis engine. +All three are native clients over the same `codeclone-mcp` contract — no second analysis engine. [VS Code extension docs](https://orenlab.github.io/codeclone/book/21-vscode-extension/) · [Claude Desktop docs](https://orenlab.github.io/codeclone/book/22-claude-desktop-bundle/) · @@ -268,13 +283,13 @@ Report contract: [Report contract](https://orenlab.github.io/codeclone/book/08-r [HTML render](https://orenlab.github.io/codeclone/book/10-html-render/)
-Canonical JSON report shape (v2.8) +Canonical JSON report shape (v2.10) ```json { - "report_schema_version": "2.8", + "report_schema_version": "2.10", "meta": { - "codeclone_version": "2.0.0b5", + "codeclone_version": "2.0.0b6", "project_name": "...", "scan_root": ".", "report_mode": "full", @@ -341,15 +356,27 @@ Report contract: [Report contract](https://orenlab.github.io/codeclone/book/08-r "metrics": { "summary": { "...": "...", - "coverage_adoption": { "...": "..." }, - "coverage_join": { "...": "..." }, - "api_surface": { "...": "..." } + "coverage_adoption": { + "...": "..." + }, + "coverage_join": { + "...": "..." + }, + "api_surface": { + "...": "..." + } }, "families": { "...": "...", - "coverage_adoption": { "...": "..." }, - "coverage_join": { "...": "..." }, - "api_surface": { "...": "..." } + "coverage_adoption": { + "...": "..." + }, + "coverage_join": { + "...": "..." + }, + "api_surface": { + "...": "..." + } } }, "derived": { @@ -455,8 +482,8 @@ in [Benchmarking contract](https://orenlab.github.io/codeclone/book/18-benchmark ## License -- **Code:** MPL-2.0 -- **Documentation:** MIT +- **Code:** MPL-2.0 (`LICENSE`) +- **Documentation and docs-site content:** MIT (`LICENSE-MIT`) Versions released before this change remain under their original license terms. @@ -465,4 +492,4 @@ Versions released before this change remain under their original license terms. - **Docs:** - **Issues:** - **PyPI:** -- **Licenses:** [MPL-2.0](LICENSE) · [MIT docs](LICENSE-docs) +- **Licenses:** [MPL-2.0](https://github.com/orenlab/codeclone/blob/main/LICENSE) · [MIT docs](https://github.com/orenlab/codeclone/blob/main/LICENSE-MIT) · [Scope map](https://github.com/orenlab/codeclone/blob/main/LICENSES.md) diff --git a/SECURITY.md b/SECURITY.md index 333de2d..72cf77a 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -10,7 +10,7 @@ The following versions currently receive security updates: | Version | Supported | |---------|-----------| | 2.0.x | Yes | -| 1.4.x | Yes | +| 1.4.x | No | | 1.3.x | No | | 1.2.x | No | | 1.1.x | No | diff --git a/benchmarks/Dockerfile b/benchmarks/Dockerfile index 8768aad..c747fc4 100644 --- a/benchmarks/Dockerfile +++ b/benchmarks/Dockerfile @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1.7 -FROM python:3.13.2-slim-bookworm +FROM python:3.14.3-slim-bookworm ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ diff --git a/benchmarks/baselines/reference-cp313.json b/benchmarks/baselines/reference-cp313.json new file mode 100644 index 0000000..f8089ce --- /dev/null +++ b/benchmarks/baselines/reference-cp313.json @@ -0,0 +1,120 @@ +{ + "benchmark_schema_version": "1.0", + "tool": { + "name": "codeclone", + "version": "2.0.0b5", + "python_tag": "cp313" + }, + "config": { + "target": "", + "runs": 3, + "warmups": 1, + "python_executable": "" + }, + "environment": { + "platform": "macOS-15.7.5-arm64-arm-64bit-Mach-O", + "machine": "arm64", + "python_version": "3.13.12", + "python_implementation": "CPython", + "python_tag": "cp313", + "cpu_count": 10, + "cpu_affinity_count": null, + "container_detected": false, + "cgroup_cpu_max": null, + "cgroup_memory_max": null, + "timestamp_utc": "2026-04-17T13:45:19Z" + }, + "scenarios": [ + { + "name": "cold_full", + "mode": "cold", + "extra_args": [], + "warmups": 1, + "runs": 3, + "deterministic": true, + "digest": "0e4366a01ad8a0db646c9a984e92fa913166119541e296387d963c5ae4301bc9", + "timings_seconds": [ + 1.0560423749998336, + 1.0907688339998458, + 1.0867978750002294 + ], + "stats_seconds": { + "min": 1.0560423749998336, + "max": 
1.0907688339998458, + "mean": 1.0778696946666362, + "median": 1.0867978750002294, + "p95": 1.090371738099884, + "stdev": 0.015519150357364984 + }, + "inventory_sample": { + "found": 180, + "analyzed": 180, + "cached": 0, + "skipped": 0 + } + }, + { + "name": "warm_full", + "mode": "warm", + "extra_args": [], + "warmups": 1, + "runs": 3, + "deterministic": true, + "digest": "55ea63867ffdd599784d10cd0c86d15ba0944e128d62d4d6cb8e68ce8779ea2e", + "timings_seconds": [ + 0.2863777919997119, + 0.2806324170001062, + 0.27757904200007033 + ], + "stats_seconds": { + "min": 0.27757904200007033, + "max": 0.2863777919997119, + "mean": 0.28152975033329614, + "median": 0.2806324170001062, + "p95": 0.2858032544997513, + "stdev": 0.003647684719762983 + }, + "inventory_sample": { + "found": 180, + "analyzed": 0, + "cached": 180, + "skipped": 0 + } + }, + { + "name": "warm_clones_only", + "mode": "warm", + "extra_args": [ + "--skip-metrics" + ], + "warmups": 1, + "runs": 3, + "deterministic": true, + "digest": "8e2fbaf49e9f577b89348aa54fc8f7d6866c9c8213ff1e69d831edc2f663d907", + "timings_seconds": [ + 0.2363325830001486, + 0.22605108300012944, + 0.21571508300030473 + ], + "stats_seconds": { + "min": 0.21571508300030473, + "max": 0.2363325830001486, + "mean": 0.2260329163335276, + "median": 0.22605108300012944, + "p95": 0.23530443300014667, + "stdev": 0.00841706893091738 + }, + "inventory_sample": { + "found": 180, + "analyzed": 0, + "cached": 180, + "skipped": 0 + } + } + ], + "comparisons": { + "warm_full_speedup_vs_cold_full": 3.8726740360854963, + "warm_clones_only_speedup_vs_warm_full": 1.2414557509549535 + }, + "generated_at_utc": "2026-04-17T13:45:19Z" +} diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index b77c96b..af04cf8 100755 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -15,6 +15,7 @@ import subprocess import sys import time +from collections.abc import Mapping, Sequence from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path @@ -25,6 +26,7 @@ from codeclone.baseline import current_python_tag BENCHMARK_SCHEMA_VERSION = "1.0" +BENCHMARK_CLI_MODULE = "codeclone.main" BENCHMARK_NEUTRAL_ARGS: tuple[str, ...] 
= ( "--no-fail-on-new", "--no-fail-on-new-metrics", @@ -48,6 +50,8 @@ "-1", "--min-docstring-coverage", "-1", + "--no-api-surface", + "--no-update-metrics-baseline", ) @@ -161,7 +165,7 @@ def _run_cli_once( cmd = [ python_executable, "-m", - "codeclone.cli", + BENCHMARK_CLI_MODULE, str(target), *BENCHMARK_NEUTRAL_ARGS, "--json", @@ -240,6 +244,14 @@ def _validate_inventory_sample( ) +def _print_bulleted_lines(header: str, lines: Sequence[str]) -> None: + if not lines: + return + print(header) + for line in lines: + print(f"- {line}") + + def _scenario_result( *, scenario: Scenario, @@ -393,6 +405,67 @@ def _median_for(name: str) -> float | None: return comparisons +def _load_benchmark_payload(path: Path) -> dict[str, object]: + payload_obj: object = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(payload_obj, dict): + raise RuntimeError(f"benchmark payload is not an object: {path}") + return payload_obj + + +def _scenario_medians(payload: Mapping[str, object]) -> dict[str, float]: + scenarios_obj = payload.get("scenarios") + if not isinstance(scenarios_obj, list): + raise RuntimeError("benchmark payload is missing a scenarios list") + + medians: dict[str, float] = {} + for item in scenarios_obj: + if not isinstance(item, dict): + raise RuntimeError("benchmark scenario entry is not an object") + name = item.get("name") + stats = item.get("stats_seconds") + if not isinstance(name, str) or not isinstance(stats, dict): + raise RuntimeError("benchmark scenario entry is missing name/stats_seconds") + median = stats.get("median") + if not isinstance(median, (int, float)): + raise RuntimeError(f"benchmark scenario {name} is missing median timing") + medians[name] = float(median) + return medians + + +def _timing_regressions( + *, + current_payload: Mapping[str, object], + baseline_payload: Mapping[str, object], + max_regression_pct: float, +) -> list[str]: + current_medians = _scenario_medians(current_payload) + baseline_medians = _scenario_medians(baseline_payload) + + missing = sorted(set(baseline_medians) - set(current_medians)) + if missing: + raise RuntimeError( + "benchmark payload is missing baseline scenario(s): " + ", ".join(missing) + ) + + regressions: list[str] = [] + for name, baseline_median in sorted(baseline_medians.items()): + if baseline_median <= 0: + raise RuntimeError( + f"baseline scenario {name} has non-positive median: {baseline_median}" + ) + current_median = current_medians[name] + allowed_median = baseline_median * (1.0 + (max_regression_pct / 100.0)) + if current_median <= allowed_median: + continue + regression_pct = ((current_median - baseline_median) / baseline_median) * 100.0 + regressions.append( + f"{name}: median {current_median:.4f}s exceeds baseline " + f"{baseline_median:.4f}s by {regression_pct:.2f}% " + f"(allowed {max_regression_pct:.2f}%)" + ) + return regressions + + def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description=( @@ -440,6 +513,18 @@ def _parse_args() -> argparse.Namespace: default=sys.executable, help="Python executable used to invoke codeclone CLI", ) + parser.add_argument( + "--baseline", + type=Path, + default=None, + help="Existing benchmark JSON used for per-scenario median regression checks.", + ) + parser.add_argument( + "--max-regression-pct", + type=float, + default=5.0, + help="Allowed per-scenario median slowdown versus --baseline.", + ) return parser.parse_args() @@ -449,6 +534,8 @@ def main() -> int: raise SystemExit("--runs must be > 0") if args.warmups < 0: raise 
SystemExit("--warmups must be >= 0") + if args.max_regression_pct < 0: + raise SystemExit("--max-regression-pct must be >= 0") target = args.target.resolve() if not target.exists(): raise SystemExit(f"target does not exist: {target}") @@ -501,6 +588,22 @@ def main() -> int: .replace("+00:00", "Z"), } + regressions: list[str] = [] + baseline_path = args.baseline.resolve() if args.baseline is not None else None + if baseline_path is not None: + baseline_payload = _load_benchmark_payload(baseline_path) + regressions = _timing_regressions( + current_payload=payload, + baseline_payload=baseline_payload, + max_regression_pct=args.max_regression_pct, + ) + payload["baseline_comparison"] = { + "baseline_path": str(baseline_path), + "max_regression_pct": args.max_regression_pct, + "status": "regression" if regressions else "ok", + "regressions": regressions, + } + args.output.parent.mkdir(parents=True, exist_ok=True) tmp_output = args.output.with_suffix(args.output.suffix + ".tmp") rendered = json.dumps(payload, ensure_ascii=False, indent=2) @@ -522,12 +625,19 @@ def main() -> int: f"p95={p95_s:.4f}s stdev={stdev_s:.4f}s " f"digest={scenario['digest']}" ) - if comparisons: - print("ratios:") - for name, value in sorted(comparisons.items()): - print(f"- {name}={value:.3f}x") + _print_bulleted_lines( + "ratios:", + [f"{name}={value:.3f}x" for name, value in sorted(comparisons.items())], + ) + if baseline_path is not None: + print(f"baseline={baseline_path}") + print(f"max_regression_pct={args.max_regression_pct:.2f}") + if regressions: + _print_bulleted_lines("regressions:", regressions) + else: + print("baseline_status=ok") print(f"output={args.output}") - return 0 + return 1 if regressions else 0 if __name__ == "__main__": diff --git a/benchmarks/run_docker_benchmark.sh b/benchmarks/run_docker_benchmark.sh index 7a11fe7..c828a80 100755 --- a/benchmarks/run_docker_benchmark.sh +++ b/benchmarks/run_docker_benchmark.sh @@ -2,7 +2,7 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" -IMAGE_TAG="${IMAGE_TAG:-codeclone-benchmark:2.0.0b5}" +IMAGE_TAG="${IMAGE_TAG:-codeclone-benchmark:local}" OUT_DIR="${OUT_DIR:-$ROOT_DIR/.cache/benchmarks}" OUTPUT_BASENAME="${OUTPUT_BASENAME:-codeclone-benchmark.json}" CPUSET="${CPUSET:-0}" diff --git a/codeclone.baseline.json b/codeclone.baseline.json index b4656f8..c6b5719 100644 --- a/codeclone.baseline.json +++ b/codeclone.baseline.json @@ -2,14 +2,14 @@ "meta": { "generator": { "name": "codeclone", - "version": "2.0.0b5" + "version": "2.0.0b6" }, "schema_version": "2.1", "fingerprint_version": "1", - "python_tag": "cp313", - "created_at": "2026-04-13T13:10:37Z", - "payload_sha256": "07a383c1d0974593c83ac30430aec9b99d89fe50f640a9b3b433658e0bd029e8", - "metrics_payload_sha256": "122ee5d2d3dc2d4e9553b1d440c0314515dcb60cc79ada264b13c39c6ba18e04" + "python_tag": "cp314", + "created_at": "2026-04-24T14:37:27Z", + "payload_sha256": "a2e5e3ac672ddbc7ba95c3a9608257727a01480ef343bc6a70c168fc9355e99a", + "metrics_payload_sha256": "26ebd9e502bb4d98d97da593532395de140b2c64b03d85ab91e681f9025fedff" }, "clones": { "functions": [], @@ -18,14 +18,18 @@ "metrics": { "max_complexity": 20, "high_risk_functions": [], - "max_coupling": 10, + "max_coupling": 9, "high_coupling_classes": [], "max_cohesion": 3, "low_cohesion_classes": [], "dependency_cycles": [], - "dependency_max_depth": 11, + "dependency_max_depth": 16, "dead_code_items": [], - "health_score": 89, - "health_grade": "B" + "health_score": 90, + "health_grade": "A", + "typing_param_permille": 1000, + "typing_return_permille": 999, + "docstring_permille": 39, + "typing_any_count": 10 } } diff --git a/codeclone/_cli_args.py b/codeclone/_cli_args.py deleted file mode 100644 index 7ad4c95..0000000 --- a/codeclone/_cli_args.py +++ /dev/null @@ -1,456 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import argparse -import sys -from typing import NoReturn - -from . import ui_messages as ui -from .contracts import ( - DEFAULT_COHESION_THRESHOLD, - DEFAULT_COMPLEXITY_THRESHOLD, - DEFAULT_COUPLING_THRESHOLD, - DEFAULT_HEALTH_THRESHOLD, - ExitCode, - cli_help_epilog, -) - -DEFAULT_ROOT = "." 
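Note on the regression gate added to benchmarks/run_benchmark.py above: a scenario fails the gate exactly when its current median exceeds baseline_median * (1 + max_regression_pct / 100). A minimal standalone sketch of that arithmetic follows; check_regression and the sample timings are illustrative only, not part of the codebase:

    # Hypothetical mini-version of the _timing_regressions threshold math.
    def check_regression(
        name: str,
        current_median: float,
        baseline_median: float,
        max_regression_pct: float = 5.0,
    ) -> str | None:
        # The allowed median grows by the configured percentage over baseline.
        allowed = baseline_median * (1.0 + max_regression_pct / 100.0)
        if current_median <= allowed:
            return None
        pct = ((current_median - baseline_median) / baseline_median) * 100.0
        return (
            f"{name}: median {current_median:.4f}s exceeds baseline "
            f"{baseline_median:.4f}s by {pct:.2f}% (allowed {max_regression_pct:.2f}%)"
        )

    # A warm_full median of 0.2815s against a 0.2600s baseline is a ~8.27%
    # slowdown, so the default 5% gate reports it and the script exits 1.
    print(check_regression("warm_full", 0.2815, 0.2600))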
-DEFAULT_MIN_LOC = 10 -DEFAULT_MIN_STMT = 6 -DEFAULT_BLOCK_MIN_LOC = 20 -DEFAULT_BLOCK_MIN_STMT = 8 -DEFAULT_SEGMENT_MIN_LOC = 20 -DEFAULT_SEGMENT_MIN_STMT = 10 -DEFAULT_PROCESSES = 4 -DEFAULT_MAX_CACHE_SIZE_MB = 50 -DEFAULT_MAX_BASELINE_SIZE_MB = 5 - -DEFAULT_BASELINE_PATH = "codeclone.baseline.json" -DEFAULT_HTML_REPORT_PATH = ".cache/codeclone/report.html" -DEFAULT_JSON_REPORT_PATH = ".cache/codeclone/report.json" -DEFAULT_MARKDOWN_REPORT_PATH = ".cache/codeclone/report.md" -DEFAULT_SARIF_REPORT_PATH = ".cache/codeclone/report.sarif" -DEFAULT_TEXT_REPORT_PATH = ".cache/codeclone/report.txt" - - -class _ArgumentParser(argparse.ArgumentParser): - def error(self, message: str) -> NoReturn: - self.print_usage(sys.stderr) - self.exit( - int(ExitCode.CONTRACT_ERROR), - f"CONTRACT ERROR: {message}\n", - ) - - -class _HelpFormatter(argparse.RawTextHelpFormatter): - """Product-oriented help formatter extension point.""" - - -def _add_optional_path_argument( - group: argparse._ArgumentGroup, - *, - flag: str, - dest: str, - help_text: str, - default: str | None = None, - const: str | None = None, - metavar: str = "FILE", -) -> None: - group.add_argument( - flag, - dest=dest, - nargs="?", - metavar=metavar, - default=default, - const=const, - help=help_text, - ) - - -def _add_bool_optional_argument( - group: argparse._ArgumentGroup, - *, - flag: str, - help_text: str, - default: bool = False, -) -> None: - group.add_argument( - flag, - action=argparse.BooleanOptionalAction, - default=default, - help=help_text, - ) - - -def build_parser(version: str) -> _ArgumentParser: - ap = _ArgumentParser( - prog="codeclone", - description="Structural code quality analysis for Python.", - add_help=False, - formatter_class=_HelpFormatter, - epilog=cli_help_epilog(), - ) - - target_group = ap.add_argument_group("Target") - target_group.add_argument( - "root", - nargs="?", - default=DEFAULT_ROOT, - help=ui.HELP_ROOT, - ) - - analysis_group = ap.add_argument_group("Analysis") - analysis_group.add_argument( - "--min-loc", - type=int, - default=DEFAULT_MIN_LOC, - help=ui.HELP_MIN_LOC, - ) - analysis_group.add_argument( - "--min-stmt", - type=int, - default=DEFAULT_MIN_STMT, - help=ui.HELP_MIN_STMT, - ) - # Block/segment thresholds are advanced tuning: configurable via - # pyproject.toml only (no CLI flags). Defaults live on the namespace - # so apply_pyproject_config_overrides can override them. 
- ap.set_defaults( - block_min_loc=DEFAULT_BLOCK_MIN_LOC, - block_min_stmt=DEFAULT_BLOCK_MIN_STMT, - segment_min_loc=DEFAULT_SEGMENT_MIN_LOC, - segment_min_stmt=DEFAULT_SEGMENT_MIN_STMT, - golden_fixture_paths=(), - ) - analysis_group.add_argument( - "--processes", - type=int, - default=DEFAULT_PROCESSES, - help=ui.HELP_PROCESSES, - ) - _add_bool_optional_argument( - analysis_group, - flag="--changed-only", - help_text=ui.HELP_CHANGED_ONLY, - ) - analysis_group.add_argument( - "--diff-against", - default=None, - metavar="GIT_REF", - help=ui.HELP_DIFF_AGAINST, - ) - analysis_group.add_argument( - "--paths-from-git-diff", - default=None, - metavar="GIT_REF", - help=ui.HELP_PATHS_FROM_GIT_DIFF, - ) - _add_optional_path_argument( - analysis_group, - flag="--cache-path", - dest="cache_path", - default=None, - const=None, - help_text=ui.HELP_CACHE_PATH, - ) - _add_optional_path_argument( - analysis_group, - flag="--cache-dir", - dest="cache_path", - default=None, - const=None, - help_text=ui.HELP_CACHE_DIR_LEGACY, - ) - analysis_group.add_argument( - "--max-cache-size-mb", - type=int, - default=DEFAULT_MAX_CACHE_SIZE_MB, - metavar="MB", - help=ui.HELP_MAX_CACHE_SIZE_MB, - ) - - baselines_ci_group = ap.add_argument_group("Baselines and CI") - _add_optional_path_argument( - baselines_ci_group, - flag="--baseline", - dest="baseline", - default=DEFAULT_BASELINE_PATH, - const=DEFAULT_BASELINE_PATH, - help_text=ui.HELP_BASELINE, - ) - baselines_ci_group.add_argument( - "--max-baseline-size-mb", - type=int, - default=DEFAULT_MAX_BASELINE_SIZE_MB, - metavar="MB", - help=ui.HELP_MAX_BASELINE_SIZE_MB, - ) - _add_bool_optional_argument( - baselines_ci_group, - flag="--update-baseline", - help_text=ui.HELP_UPDATE_BASELINE, - ) - _add_optional_path_argument( - baselines_ci_group, - flag="--metrics-baseline", - dest="metrics_baseline", - default=DEFAULT_BASELINE_PATH, - const=DEFAULT_BASELINE_PATH, - help_text=ui.HELP_METRICS_BASELINE, - ) - _add_bool_optional_argument( - baselines_ci_group, - flag="--update-metrics-baseline", - help_text=ui.HELP_UPDATE_METRICS_BASELINE, - ) - _add_bool_optional_argument( - baselines_ci_group, - flag="--ci", - help_text=ui.HELP_CI, - ) - _add_bool_optional_argument( - baselines_ci_group, - flag="--api-surface", - help_text=ui.HELP_API_SURFACE, - ) - baselines_ci_group.add_argument( - "--coverage", - dest="coverage_xml", - metavar="FILE", - default=None, - help=ui.HELP_COVERAGE, - ) - - quality_group = ap.add_argument_group("Quality gates") - _add_bool_optional_argument( - quality_group, - flag="--fail-on-new", - help_text=ui.HELP_FAIL_ON_NEW, - ) - _add_bool_optional_argument( - quality_group, - flag="--fail-on-new-metrics", - help_text=ui.HELP_FAIL_ON_NEW_METRICS, - ) - quality_group.add_argument( - "--fail-threshold", - type=int, - default=-1, - metavar="MAX_CLONES", - help=ui.HELP_FAIL_THRESHOLD, - ) - quality_group.add_argument( - "--fail-complexity", - type=int, - nargs="?", - const=DEFAULT_COMPLEXITY_THRESHOLD, - default=-1, - metavar="CC_MAX", - help=ui.HELP_FAIL_COMPLEXITY, - ) - quality_group.add_argument( - "--fail-coupling", - type=int, - nargs="?", - const=DEFAULT_COUPLING_THRESHOLD, - default=-1, - metavar="CBO_MAX", - help=ui.HELP_FAIL_COUPLING, - ) - quality_group.add_argument( - "--fail-cohesion", - type=int, - nargs="?", - const=DEFAULT_COHESION_THRESHOLD, - default=-1, - metavar="LCOM4_MAX", - help=ui.HELP_FAIL_COHESION, - ) - _add_bool_optional_argument( - quality_group, - flag="--fail-cycles", - help_text=ui.HELP_FAIL_CYCLES, - ) - 
_add_bool_optional_argument( - quality_group, - flag="--fail-dead-code", - help_text=ui.HELP_FAIL_DEAD_CODE, - ) - quality_group.add_argument( - "--fail-health", - type=int, - nargs="?", - const=DEFAULT_HEALTH_THRESHOLD, - default=-1, - metavar="SCORE_MIN", - help=ui.HELP_FAIL_HEALTH, - ) - _add_bool_optional_argument( - quality_group, - flag="--fail-on-typing-regression", - help_text=ui.HELP_FAIL_ON_TYPING_REGRESSION, - ) - _add_bool_optional_argument( - quality_group, - flag="--fail-on-docstring-regression", - help_text=ui.HELP_FAIL_ON_DOCSTRING_REGRESSION, - ) - _add_bool_optional_argument( - quality_group, - flag="--fail-on-api-break", - help_text=ui.HELP_FAIL_ON_API_BREAK, - ) - _add_bool_optional_argument( - quality_group, - flag="--fail-on-untested-hotspots", - help_text=ui.HELP_FAIL_ON_UNTESTED_HOTSPOTS, - ) - quality_group.add_argument( - "--min-typing-coverage", - type=int, - default=-1, - metavar="PERCENT", - help=ui.HELP_MIN_TYPING_COVERAGE, - ) - quality_group.add_argument( - "--min-docstring-coverage", - type=int, - default=-1, - metavar="PERCENT", - help=ui.HELP_MIN_DOCSTRING_COVERAGE, - ) - quality_group.add_argument( - "--coverage-min", - type=int, - default=50, - metavar="PERCENT", - help=ui.HELP_COVERAGE_MIN, - ) - - stages_group = ap.add_argument_group("Analysis stages") - _add_bool_optional_argument( - stages_group, - flag="--skip-metrics", - help_text=ui.HELP_SKIP_METRICS, - ) - _add_bool_optional_argument( - stages_group, - flag="--skip-dead-code", - help_text=ui.HELP_SKIP_DEAD_CODE, - ) - _add_bool_optional_argument( - stages_group, - flag="--skip-dependencies", - help_text=ui.HELP_SKIP_DEPENDENCIES, - ) - - reporting_group = ap.add_argument_group("Reporting") - _add_optional_path_argument( - reporting_group, - flag="--html", - dest="html_out", - const=DEFAULT_HTML_REPORT_PATH, - help_text=ui.HELP_HTML, - ) - _add_optional_path_argument( - reporting_group, - flag="--json", - dest="json_out", - const=DEFAULT_JSON_REPORT_PATH, - help_text=ui.HELP_JSON, - ) - _add_optional_path_argument( - reporting_group, - flag="--md", - dest="md_out", - const=DEFAULT_MARKDOWN_REPORT_PATH, - help_text=ui.HELP_MD, - ) - _add_optional_path_argument( - reporting_group, - flag="--sarif", - dest="sarif_out", - const=DEFAULT_SARIF_REPORT_PATH, - help_text=ui.HELP_SARIF, - ) - _add_optional_path_argument( - reporting_group, - flag="--text", - dest="text_out", - const=DEFAULT_TEXT_REPORT_PATH, - help_text=ui.HELP_TEXT, - ) - _add_bool_optional_argument( - reporting_group, - flag="--timestamped-report-paths", - help_text=ui.HELP_TIMESTAMPED_REPORT_PATHS, - ) - - ui_group = ap.add_argument_group("Output and UI") - _add_bool_optional_argument( - ui_group, - flag="--open-html-report", - help_text=ui.HELP_OPEN_HTML_REPORT, - ) - ui_group.add_argument( - "--no-progress", - dest="no_progress", - action="store_true", - help=ui.HELP_NO_PROGRESS, - ) - ui_group.add_argument( - "--progress", - dest="no_progress", - action="store_false", - help=ui.HELP_PROGRESS, - ) - ui_group.add_argument( - "--no-color", - dest="no_color", - action="store_true", - help=ui.HELP_NO_COLOR, - ) - ui_group.add_argument( - "--color", - dest="no_color", - action="store_false", - help=ui.HELP_COLOR, - ) - ui_group.set_defaults(no_progress=False, no_color=False) - _add_bool_optional_argument( - ui_group, - flag="--quiet", - help_text=ui.HELP_QUIET, - ) - _add_bool_optional_argument( - ui_group, - flag="--verbose", - help_text=ui.HELP_VERBOSE, - ) - _add_bool_optional_argument( - ui_group, - flag="--debug", - 
help_text=ui.HELP_DEBUG, - ) - - general_group = ap.add_argument_group("General") - general_group.add_argument( - "-h", - "--help", - action="help", - help="Show this help message and exit.", - ) - general_group.add_argument( - "--version", - action="version", - version=ui.version_output(version), - help=ui.HELP_VERSION, - ) - - return ap diff --git a/codeclone/_cli_config.py b/codeclone/_cli_config.py deleted file mode 100644 index b17ba43..0000000 --- a/codeclone/_cli_config.py +++ /dev/null @@ -1,303 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import importlib -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import TYPE_CHECKING, Final - -from .golden_fixtures import ( - GoldenFixturePatternError, - normalize_golden_fixture_patterns, -) - -if TYPE_CHECKING: - import argparse - from collections.abc import Mapping, Sequence - - -class ConfigValidationError(ValueError): - """Raised when pyproject.toml contains invalid CodeClone configuration.""" - - -@dataclass(frozen=True, slots=True) -class _ConfigKeySpec: - expected_type: type[object] - allow_none: bool = False - expected_name: str | None = None - - -_CONFIG_KEY_SPECS: Final[dict[str, _ConfigKeySpec]] = { - "min_loc": _ConfigKeySpec(int), - "min_stmt": _ConfigKeySpec(int), - "block_min_loc": _ConfigKeySpec(int), - "block_min_stmt": _ConfigKeySpec(int), - "segment_min_loc": _ConfigKeySpec(int), - "segment_min_stmt": _ConfigKeySpec(int), - "processes": _ConfigKeySpec(int), - "cache_path": _ConfigKeySpec(str, allow_none=True), - "max_cache_size_mb": _ConfigKeySpec(int), - "baseline": _ConfigKeySpec(str), - "max_baseline_size_mb": _ConfigKeySpec(int), - "update_baseline": _ConfigKeySpec(bool), - "fail_on_new": _ConfigKeySpec(bool), - "fail_threshold": _ConfigKeySpec(int), - "ci": _ConfigKeySpec(bool), - "fail_complexity": _ConfigKeySpec(int), - "fail_coupling": _ConfigKeySpec(int), - "fail_cohesion": _ConfigKeySpec(int), - "fail_cycles": _ConfigKeySpec(bool), - "fail_dead_code": _ConfigKeySpec(bool), - "fail_health": _ConfigKeySpec(int), - "fail_on_new_metrics": _ConfigKeySpec(bool), - "api_surface": _ConfigKeySpec(bool), - "coverage_xml": _ConfigKeySpec(str, allow_none=True), - "fail_on_typing_regression": _ConfigKeySpec(bool), - "fail_on_docstring_regression": _ConfigKeySpec(bool), - "fail_on_api_break": _ConfigKeySpec(bool), - "fail_on_untested_hotspots": _ConfigKeySpec(bool), - "min_typing_coverage": _ConfigKeySpec(int), - "min_docstring_coverage": _ConfigKeySpec(int), - "coverage_min": _ConfigKeySpec(int), - "update_metrics_baseline": _ConfigKeySpec(bool), - "metrics_baseline": _ConfigKeySpec(str), - "skip_metrics": _ConfigKeySpec(bool), - "skip_dead_code": _ConfigKeySpec(bool), - "skip_dependencies": _ConfigKeySpec(bool), - "golden_fixture_paths": _ConfigKeySpec(list, expected_name="list[str]"), - "html_out": _ConfigKeySpec(str, allow_none=True), - "json_out": _ConfigKeySpec(str, allow_none=True), - "md_out": _ConfigKeySpec(str, allow_none=True), - "sarif_out": _ConfigKeySpec(str, allow_none=True), - "text_out": _ConfigKeySpec(str, allow_none=True), - "no_progress": _ConfigKeySpec(bool), - "no_color": _ConfigKeySpec(bool), - "quiet": _ConfigKeySpec(bool), - "verbose": _ConfigKeySpec(bool), - "debug": _ConfigKeySpec(bool), 
-} -_PATH_CONFIG_KEYS: Final[frozenset[str]] = frozenset( - { - "cache_path", - "baseline", - "metrics_baseline", - "coverage_xml", - "html_out", - "json_out", - "md_out", - "sarif_out", - "text_out", - } -) - - -def collect_explicit_cli_dests( - parser: argparse.ArgumentParser, - *, - argv: Sequence[str], -) -> set[str]: - option_to_dest: dict[str, str] = {} - for action in parser._actions: - for option in action.option_strings: - option_to_dest[option] = action.dest - - explicit: set[str] = set() - for token in argv: - if token == "--": - break - if not token.startswith("-"): - continue - option = token.split("=", maxsplit=1)[0] - dest = option_to_dest.get(option) - if dest is not None: - explicit.add(dest) - return explicit - - -def load_pyproject_config(root_path: Path) -> dict[str, object]: - config_path = root_path / "pyproject.toml" - if not config_path.exists(): - return {} - - payload: object - try: - payload = _load_toml(config_path) - except OSError as exc: - raise ConfigValidationError( - f"Cannot read pyproject.toml at {config_path}: {exc}" - ) from exc - except ValueError as exc: - raise ConfigValidationError(f"Invalid TOML in {config_path}: {exc}") from exc - - if not isinstance(payload, dict): - raise ConfigValidationError( - f"Invalid pyproject payload at {config_path}: root must be object" - ) - - tool_obj = payload.get("tool") - if tool_obj is None: - return {} - if not isinstance(tool_obj, dict): - raise ConfigValidationError( - f"Invalid pyproject payload at {config_path}: 'tool' must be object" - ) - - codeclone_obj = tool_obj.get("codeclone") - if codeclone_obj is None: - return {} - if not isinstance(codeclone_obj, dict): - raise ConfigValidationError( - "Invalid pyproject payload at " - f"{config_path}: 'tool.codeclone' must be object" - ) - - unknown = sorted(set(codeclone_obj.keys()) - set(_CONFIG_KEY_SPECS)) - if unknown: - raise ConfigValidationError( - "Unknown key(s) in tool.codeclone: " + ", ".join(unknown) - ) - - validated: dict[str, object] = {} - for key in sorted(codeclone_obj.keys()): - value = _validate_config_value( - key=key, - value=codeclone_obj[key], - ) - validated[key] = _normalize_path_config_value( - key=key, - value=value, - root_path=root_path, - ) - return validated - - -def apply_pyproject_config_overrides( - *, - args: argparse.Namespace, - config_values: Mapping[str, object], - explicit_cli_dests: set[str], -) -> None: - for key, value in config_values.items(): - if key in explicit_cli_dests: - continue - setattr(args, key, value) - - -def _validate_config_value(*, key: str, value: object) -> object: - spec = _CONFIG_KEY_SPECS[key] - if value is None: - if spec.allow_none: - return None - raise ConfigValidationError( - "Invalid value type for tool.codeclone." 
- f"{key}: expected {spec.expected_name or spec.expected_type.__name__}" - ) - - expected_type = spec.expected_type - if expected_type is bool: - return _validated_config_instance( - key=key, - value=value, - expected_type=bool, - expected_name="bool", - ) - - if expected_type is int: - return _validated_config_instance( - key=key, - value=value, - expected_type=int, - expected_name="int", - reject_bool=True, - ) - - if expected_type is str: - return _validated_config_instance( - key=key, - value=value, - expected_type=str, - expected_name="str", - ) - if expected_type is list: - return _validated_string_list(key=key, value=value) - - raise ConfigValidationError(f"Unsupported config key spec for tool.codeclone.{key}") - - -def _validated_config_instance( - *, - key: str, - value: object, - expected_type: type[object], - expected_name: str, - reject_bool: bool = False, -) -> object: - if isinstance(value, expected_type) and ( - not reject_bool or not isinstance(value, bool) - ): - return value - raise ConfigValidationError( - f"Invalid value type for tool.codeclone.{key}: expected {expected_name}" - ) - - -def _validated_string_list(*, key: str, value: object) -> tuple[str, ...]: - if not isinstance(value, list): - raise ConfigValidationError( - f"Invalid value type for tool.codeclone.{key}: expected list[str]" - ) - if not all(isinstance(item, str) for item in value): - raise ConfigValidationError( - f"Invalid value type for tool.codeclone.{key}: expected list[str]" - ) - try: - return normalize_golden_fixture_patterns(value) - except GoldenFixturePatternError as exc: - raise ConfigValidationError(str(exc)) from exc - - -def _load_toml(path: Path) -> object: - if sys.version_info >= (3, 11): - import tomllib - - with path.open("rb") as config_file: - return tomllib.load(config_file) - else: - try: - tomli_module = importlib.import_module("tomli") - except ModuleNotFoundError as exc: - raise ConfigValidationError( - "Python 3.10 requires dependency 'tomli' to read pyproject.toml." - ) from exc - - load_fn = getattr(tomli_module, "load", None) - if not callable(load_fn): - raise ConfigValidationError( - "Invalid 'tomli' module: missing callable 'load'." - ) - - with path.open("rb") as config_file: - return load_fn(config_file) - - -def _normalize_path_config_value( - *, - key: str, - value: object, - root_path: Path, -) -> object: - if key not in _PATH_CONFIG_KEYS: - return value - if not isinstance(value, str): - return value - - path = Path(value).expanduser() - if path.is_absolute(): - return str(path) - return str(root_path / path) diff --git a/codeclone/_cli_paths.py b/codeclone/_cli_paths.py deleted file mode 100644 index 3577dc0..0000000 --- a/codeclone/_cli_paths.py +++ /dev/null @@ -1,47 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Protocol - -from .contracts import ExitCode -from .ui_messages import fmt_contract_error - -if TYPE_CHECKING: - from collections.abc import Callable - - -class _Printer(Protocol): - def print(self, *objects: object, **kwargs: object) -> None: ... 
- - -def _validate_output_path( - path: str, - *, - expected_suffix: str, - label: str, - console: _Printer, - invalid_message: Callable[..., str], - invalid_path_message: Callable[..., str], -) -> Path: - out = Path(path).expanduser() - if out.suffix.lower() != expected_suffix: - console.print( - fmt_contract_error( - invalid_message(label=label, path=out, expected_suffix=expected_suffix) - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - try: - return out.resolve() - except OSError as e: - console.print( - fmt_contract_error(invalid_path_message(label=label, path=out, error=e)) - ) - sys.exit(ExitCode.CONTRACT_ERROR) diff --git a/codeclone/_cli_reports.py b/codeclone/_cli_reports.py deleted file mode 100644 index 126879c..0000000 --- a/codeclone/_cli_reports.py +++ /dev/null @@ -1,150 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import sys -import webbrowser -from pathlib import Path -from typing import Protocol - -from . import ui_messages as ui -from .contracts import ExitCode - -__all__ = ["write_report_outputs"] - - -class _PrinterLike(Protocol): - def print(self, *objects: object, **kwargs: object) -> None: ... - - -class _QuietArgs(Protocol): - quiet: bool - - -def _path_attr(obj: object, name: str) -> Path | None: - value = getattr(obj, name, None) - return value if isinstance(value, Path) else None - - -def _text_attr(obj: object, name: str) -> str | None: - value = getattr(obj, name, None) - return value if isinstance(value, str) else None - - -def _write_report_output( - *, - out: Path, - content: str, - label: str, - console: _PrinterLike, -) -> None: - try: - out.parent.mkdir(parents=True, exist_ok=True) - out.write_text(content, "utf-8") - except OSError as exc: - console.print( - ui.fmt_contract_error( - ui.fmt_report_write_failed(label=label, path=out, error=exc) - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - -def _open_html_report_in_browser(*, path: Path) -> None: - if not webbrowser.open_new_tab(path.as_uri()): - raise OSError("no browser handler available") - - -def write_report_outputs( - *, - args: _QuietArgs, - output_paths: object, - report_artifacts: object, - console: _PrinterLike, - open_html_report: bool = False, -) -> str | None: - html_report_path: str | None = None - saved_reports: list[tuple[str, Path]] = [] - html_path = _path_attr(output_paths, "html") - json_path = _path_attr(output_paths, "json") - md_path = _path_attr(output_paths, "md") - sarif_path = _path_attr(output_paths, "sarif") - text_path = _path_attr(output_paths, "text") - html_report = _text_attr(report_artifacts, "html") - json_report = _text_attr(report_artifacts, "json") - md_report = _text_attr(report_artifacts, "md") - sarif_report = _text_attr(report_artifacts, "sarif") - text_report = _text_attr(report_artifacts, "text") - - if html_path and html_report is not None: - out = html_path - _write_report_output( - out=out, - content=html_report, - label="HTML", - console=console, - ) - html_report_path = str(out) - saved_reports.append(("HTML", out)) - - if json_path and json_report is not None: - out = json_path - _write_report_output( - out=out, - content=json_report, - label="JSON", - console=console, - ) - saved_reports.append(("JSON", out)) - - if md_path and md_report is not None: - out = md_path - 
_write_report_output( - out=out, - content=md_report, - label="Markdown", - console=console, - ) - saved_reports.append(("Markdown", out)) - - if sarif_path and sarif_report is not None: - out = sarif_path - _write_report_output( - out=out, - content=sarif_report, - label="SARIF", - console=console, - ) - saved_reports.append(("SARIF", out)) - - if text_path and text_report is not None: - out = text_path - _write_report_output( - out=out, - content=text_report, - label="text", - console=console, - ) - saved_reports.append(("Text", out)) - - if saved_reports and not args.quiet: - cwd = Path.cwd() - console.print() - for label, path in saved_reports: - try: - display = path.relative_to(cwd) - except ValueError: - display = path - console.print(f" [bold]{label} report saved:[/bold] [dim]{display}[/dim]") - - if open_html_report and html_path is not None: - try: - _open_html_report_in_browser(path=html_path) - except Exception as exc: - console.print(ui.fmt_html_report_open_failed(path=html_path, error=exc)) - - return html_report_path diff --git a/codeclone/_cli_runtime.py b/codeclone/_cli_runtime.py deleted file mode 100644 index 28ca869..0000000 --- a/codeclone/_cli_runtime.py +++ /dev/null @@ -1,222 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import sys -from pathlib import Path -from typing import Protocol - -from . import ui_messages as ui -from .cache import CacheStatus -from .contracts import ExitCode - -__all__ = [ - "configure_metrics_mode", - "metrics_computed", - "print_failed_files", - "resolve_cache_path", - "resolve_cache_status", - "validate_numeric_args", -] - - -class _RuntimeArgs(Protocol): - cache_path: str | None - coverage_xml: str | None - max_baseline_size_mb: int - max_cache_size_mb: int - fail_threshold: int - fail_complexity: int - fail_coupling: int - fail_cohesion: int - fail_health: int - fail_on_new_metrics: bool - fail_on_typing_regression: bool - fail_on_docstring_regression: bool - fail_on_api_break: bool - fail_on_untested_hotspots: bool - min_typing_coverage: int - min_docstring_coverage: int - coverage_min: int - api_surface: bool - update_metrics_baseline: bool - skip_metrics: bool - fail_cycles: bool - fail_dead_code: bool - skip_dead_code: bool - skip_dependencies: bool - - -class _PrinterLike(Protocol): - def print(self, *objects: object, **kwargs: object) -> None: ... - - -class _CacheLike(Protocol): - @property - def load_status(self) -> CacheStatus | str | None: ... - - @property - def load_warning(self) -> str | None: ... - - @property - def cache_schema_version(self) -> str | None: ... 
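write_report_outputs in _cli_reports.py above prints each saved report path relative to the working directory when possible, and falls back to the absolute path when Path.relative_to raises ValueError. The display rule in isolation, with sample paths invented for illustration:

    from pathlib import Path

    cwd = Path("/repo")
    for path in (Path("/repo/.cache/codeclone/report.html"), Path("/tmp/report.html")):
        try:
            display = path.relative_to(cwd)  # inside cwd: show a short relative path
        except ValueError:
            display = path  # outside cwd: keep the absolute path
        print(display)
    # .cache/codeclone/report.html
    # /tmp/report.html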
- - -def validate_numeric_args(args: _RuntimeArgs) -> bool: - return bool( - not ( - args.max_baseline_size_mb < 0 - or args.max_cache_size_mb < 0 - or args.fail_threshold < -1 - or args.fail_complexity < -1 - or args.fail_coupling < -1 - or args.fail_cohesion < -1 - or args.fail_health < -1 - or args.min_typing_coverage < -1 - or args.min_typing_coverage > 100 - or args.min_docstring_coverage < -1 - or args.min_docstring_coverage > 100 - or args.coverage_min < 0 - or args.coverage_min > 100 - ) - ) - - -def _metrics_flags_requested(args: _RuntimeArgs) -> bool: - return bool( - args.fail_complexity >= 0 - or args.fail_coupling >= 0 - or args.fail_cohesion >= 0 - or args.fail_cycles - or args.fail_dead_code - or args.fail_health >= 0 - or args.fail_on_new_metrics - or args.fail_on_typing_regression - or args.fail_on_docstring_regression - or args.fail_on_api_break - or args.fail_on_untested_hotspots - or args.min_typing_coverage >= 0 - or args.min_docstring_coverage >= 0 - or args.api_surface - or args.update_metrics_baseline - or bool(getattr(args, "coverage_xml", None)) - ) - - -def configure_metrics_mode( - *, - args: _RuntimeArgs, - metrics_baseline_exists: bool, - console: _PrinterLike, -) -> None: - metrics_flags_requested = _metrics_flags_requested(args) - - if args.skip_metrics and metrics_flags_requested: - console.print( - ui.fmt_contract_error( - "--skip-metrics cannot be used together with metrics gating/update " - "flags." - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - if ( - not args.skip_metrics - and not metrics_flags_requested - and not metrics_baseline_exists - ): - args.skip_metrics = True - - if args.skip_metrics: - args.skip_dead_code = True - args.skip_dependencies = True - return - - if args.fail_dead_code: - args.skip_dead_code = False - if args.fail_cycles: - args.skip_dependencies = False - if bool(getattr(args, "fail_on_api_break", False)): - args.api_surface = True - - -def resolve_cache_path( - *, - root_path: Path, - args: _RuntimeArgs, - from_args: bool, - legacy_cache_path: Path, - console: _PrinterLike, -) -> Path: - if from_args and args.cache_path: - return Path(args.cache_path).expanduser() - - cache_path = root_path / ".cache" / "codeclone" / "cache.json" - if legacy_cache_path.exists(): - try: - legacy_resolved = legacy_cache_path.resolve() - except OSError: - legacy_resolved = legacy_cache_path - if legacy_resolved != cache_path: - console.print( - ui.fmt_legacy_cache_warning( - legacy_path=legacy_resolved, - new_path=cache_path, - ) - ) - return cache_path - - -def metrics_computed(args: _RuntimeArgs) -> tuple[str, ...]: - if args.skip_metrics: - return () - - computed = ["complexity", "coupling", "cohesion", "health"] - if not args.skip_dependencies: - computed.append("dependencies") - if not args.skip_dead_code: - computed.append("dead_code") - computed.append("coverage_adoption") - if bool(getattr(args, "api_surface", False)): - computed.append("api_surface") - if bool(getattr(args, "coverage_xml", None)): - computed.append("coverage_join") - return tuple(computed) - - -def resolve_cache_status(cache: _CacheLike) -> tuple[CacheStatus, str | None]: - raw_cache_status = getattr(cache, "load_status", None) - load_warning = getattr(cache, "load_warning", None) - if isinstance(raw_cache_status, CacheStatus): - cache_status = raw_cache_status - elif isinstance(raw_cache_status, str): - try: - cache_status = CacheStatus(raw_cache_status) - except ValueError: - cache_status = ( - CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE - ) - 
else: - cache_status = ( - CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE - ) - - raw_cache_schema_version = getattr(cache, "cache_schema_version", None) - cache_schema_version = ( - raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None - ) - return cache_status, cache_schema_version - - -def print_failed_files(*, failed_files: tuple[str, ...], console: _PrinterLike) -> None: - if not failed_files: - return - console.print(ui.fmt_failed_files_header(len(failed_files))) - for failure in failed_files[:10]: - console.print(f" • {failure}") - if len(failed_files) > 10: - console.print(f" ... and {len(failed_files) - 10} more") diff --git a/codeclone/analysis/__init__.py b/codeclone/analysis/__init__.py new file mode 100644 index 0000000..a521754 --- /dev/null +++ b/codeclone/analysis/__init__.py @@ -0,0 +1,22 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from .cfg import CFG, CFGBuilder +from .fingerprint import bucket_loc, sha1 +from .normalizer import AstNormalizer, NormalizationConfig, stmt_hashes +from .units import extract_units_and_stats_from_source + +__all__ = [ + "CFG", + "AstNormalizer", + "CFGBuilder", + "NormalizationConfig", + "bucket_loc", + "extract_units_and_stats_from_source", + "sha1", + "stmt_hashes", +] diff --git a/codeclone/analysis/_module_walk.py b/codeclone/analysis/_module_walk.py new file mode 100644 index 0000000..bba111e --- /dev/null +++ b/codeclone/analysis/_module_walk.py @@ -0,0 +1,553 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import ast +import tokenize +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Literal, NamedTuple + +from .. 
import qualnames as _qualnames +from ..models import DeadCandidate, ModuleDep +from .class_metrics import _node_line_span +from .parser import ( + _build_declaration_token_index, + _declaration_end_line, + _DeclarationTokenIndexKey, + _source_tokens, +) +from .suppressions import ( + DeclarationTarget, + bind_suppressions_to_declarations, + build_suppression_index, + extract_suppression_directives, + suppression_target_key, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + from .suppressions import SuppressionTargetKey + + +_NamedDeclarationNode = _qualnames.FunctionNode | ast.ClassDef +_PROTOCOL_MODULE_NAMES = frozenset({"typing", "typing_extensions"}) + + +def _resolve_import_target( + module_name: str, + import_node: ast.ImportFrom, +) -> str: + if import_node.level <= 0: + return import_node.module or "" + + parent_parts = module_name.split(".") + keep = max(0, len(parent_parts) - import_node.level) + prefix = parent_parts[:keep] + if import_node.module: + return ".".join([*prefix, import_node.module]) + return ".".join(prefix) + + +@dataclass(slots=True) +class _ModuleWalkState: + import_names: set[str] = field(default_factory=set) + deps: list[ModuleDep] = field(default_factory=list) + referenced_names: set[str] = field(default_factory=set) + imported_symbol_bindings: dict[str, set[str]] = field(default_factory=dict) + imported_module_aliases: dict[str, str] = field(default_factory=dict) + name_nodes: list[ast.Name] = field(default_factory=list) + attr_nodes: list[ast.Attribute] = field(default_factory=list) + protocol_symbol_aliases: set[str] = field(default_factory=lambda: {"Protocol"}) + protocol_module_aliases: set[str] = field( + default_factory=lambda: set(_PROTOCOL_MODULE_NAMES) + ) + + +def _append_module_dep( + *, + module_name: str, + target: str, + import_type: Literal["import", "from_import"], + line: int, + state: _ModuleWalkState, +) -> None: + state.deps.append( + ModuleDep( + source=module_name, + target=target, + import_type=import_type, + line=line, + ) + ) + + +def _collect_import_node( + *, + node: ast.Import, + module_name: str, + state: _ModuleWalkState, + collect_referenced_names: bool, +) -> None: + line = int(getattr(node, "lineno", 0)) + for alias in node.names: + alias_name = alias.asname or alias.name.split(".", 1)[0] + state.import_names.add(alias_name) + _append_module_dep( + module_name=module_name, + target=alias.name, + import_type="import", + line=line, + state=state, + ) + if collect_referenced_names: + state.imported_module_aliases[alias_name] = alias.name + if alias.name in _PROTOCOL_MODULE_NAMES: + state.protocol_module_aliases.add(alias_name) + + +def _dotted_expr_name(expr: ast.expr) -> str | None: + if isinstance(expr, ast.Name): + return expr.id + if isinstance(expr, ast.Attribute): + prefix = _dotted_expr_name(expr.value) + if prefix is None: + return None + return f"{prefix}.{expr.attr}" + return None + + +def _collect_import_from_node( + *, + node: ast.ImportFrom, + module_name: str, + state: _ModuleWalkState, + collect_referenced_names: bool, +) -> None: + target = _resolve_import_target(module_name, node) + if target: + state.import_names.add(target.split(".", 1)[0]) + _append_module_dep( + module_name=module_name, + target=target, + import_type="from_import", + line=int(getattr(node, "lineno", 0)), + state=state, + ) + + if node.module in _PROTOCOL_MODULE_NAMES: + for alias in node.names: + if alias.name == "Protocol": + state.protocol_symbol_aliases.add(alias.asname or alias.name) + + if not collect_referenced_names or 
not target: + return + + for alias in node.names: + if alias.name == "*": + continue + alias_name = alias.asname or alias.name + state.imported_symbol_bindings.setdefault(alias_name, set()).add( + f"{target}:{alias.name}" + ) + + +def _collect_load_reference_node( + *, + node: ast.AST, + state: _ModuleWalkState, +) -> None: + if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load): + state.referenced_names.add(node.id) + state.name_nodes.append(node) + return + if isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load): + state.referenced_names.add(node.attr) + state.attr_nodes.append(node) + + +def _is_protocol_class( + class_node: ast.ClassDef, + *, + protocol_symbol_aliases: frozenset[str], + protocol_module_aliases: frozenset[str], +) -> bool: + for base in class_node.bases: + base_name = _dotted_expr_name(base) + if base_name is None: + continue + if base_name in protocol_symbol_aliases: + return True + if "." in base_name and base_name.rsplit(".", 1)[-1] == "Protocol": + module_alias = base_name.rsplit(".", 1)[0] + if module_alias in protocol_module_aliases: + return True + return False + + +def _is_non_runtime_candidate(node: _qualnames.FunctionNode) -> bool: + for decorator in node.decorator_list: + name = _dotted_expr_name(decorator) + if name is None: + continue + terminal = name.rsplit(".", 1)[-1] + if terminal in {"overload", "abstractmethod"}: + return True + return False + + +def _dead_candidate_kind(local_name: str) -> Literal["function", "method"]: + return "method" if "." in local_name else "function" + + +def _should_skip_dead_candidate( + local_name: str, + node: _qualnames.FunctionNode, + *, + protocol_class_qualnames: set[str], +) -> bool: + if _is_non_runtime_candidate(node): + return True + if "." not in local_name: + return False + owner_qualname = local_name.rsplit(".", 1)[0] + return owner_qualname in protocol_class_qualnames + + +def _build_dead_candidate( + *, + module_name: str, + local_name: str, + node: _NamedDeclarationNode, + filepath: str, + kind: Literal["class", "function", "method"], + suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]], + start_line: int, + end_line: int, +) -> DeadCandidate: + qualname = f"{module_name}:{local_name}" + return DeadCandidate( + qualname=qualname, + local_name=node.name, + filepath=filepath, + start_line=start_line, + end_line=end_line, + kind=kind, + suppressed_rules=suppression_index.get( + suppression_target_key( + filepath=filepath, + qualname=qualname, + start_line=start_line, + end_line=end_line, + kind=kind, + ), + (), + ), + ) + + +def _dead_candidate_for_unit( + *, + module_name: str, + local_name: str, + node: _qualnames.FunctionNode, + filepath: str, + suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]], + protocol_class_qualnames: set[str], +) -> DeadCandidate | None: + span = _node_line_span(node) + if span is None: + return None + if _should_skip_dead_candidate( + local_name, + node, + protocol_class_qualnames=protocol_class_qualnames, + ): + return None + start, end = span + return _build_dead_candidate( + module_name=module_name, + local_name=local_name, + node=node, + filepath=filepath, + kind=_dead_candidate_kind(local_name), + suppression_index=suppression_index, + start_line=start, + end_line=end, + ) + + +def _resolve_referenced_qualnames( + *, + module_name: str, + collector: _qualnames.QualnameCollector, + state: _ModuleWalkState, +) -> frozenset[str]: + top_level_class_by_name = { + class_qualname: class_qualname + for class_qualname, _class_node in 
collector.class_nodes + if "." not in class_qualname + } + local_method_qualnames = frozenset( + f"{module_name}:{local_name}" + for local_name, _node in collector.units + if "." in local_name + ) + + resolved: set[str] = set() + for name_node in state.name_nodes: + for qualname in state.imported_symbol_bindings.get(name_node.id, ()): + resolved.add(qualname) + + for attr_node in state.attr_nodes: + base = attr_node.value + if isinstance(base, ast.Name): + imported_module = state.imported_module_aliases.get(base.id) + if imported_module is not None: + resolved.add(f"{imported_module}:{attr_node.attr}") + else: + class_qualname = top_level_class_by_name.get(base.id) + if class_qualname is not None: + local_method_qualname = ( + f"{module_name}:{class_qualname}.{attr_node.attr}" + ) + if local_method_qualname in local_method_qualnames: + resolved.add(local_method_qualname) + + return frozenset(resolved) + + +class _ModuleWalkResult(NamedTuple): + import_names: frozenset[str] + module_deps: tuple[ModuleDep, ...] + referenced_names: frozenset[str] + referenced_qualnames: frozenset[str] + protocol_symbol_aliases: frozenset[str] + protocol_module_aliases: frozenset[str] + + +def _collect_module_walk_data( + *, + tree: ast.AST, + module_name: str, + collector: _qualnames.QualnameCollector, + collect_referenced_names: bool, +) -> _ModuleWalkResult: + """Single ast.walk that collects imports, deps, names, qualnames & protocol aliases. + + Reduces the hot path to one tree walk plus one local qualname resolution phase. + """ + state = _ModuleWalkState() + for node in ast.walk(tree): + if isinstance(node, ast.Import): + _collect_import_node( + node=node, + module_name=module_name, + state=state, + collect_referenced_names=collect_referenced_names, + ) + elif isinstance(node, ast.ImportFrom): + _collect_import_from_node( + node=node, + module_name=module_name, + state=state, + collect_referenced_names=collect_referenced_names, + ) + elif collect_referenced_names: + _collect_load_reference_node(node=node, state=state) + + deps_sorted = tuple( + sorted( + state.deps, + key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line), + ) + ) + resolved = ( + _resolve_referenced_qualnames( + module_name=module_name, + collector=collector, + state=state, + ) + if collect_referenced_names + else frozenset() + ) + + return _ModuleWalkResult( + import_names=frozenset(state.import_names), + module_deps=deps_sorted, + referenced_names=frozenset(state.referenced_names), + referenced_qualnames=resolved, + protocol_symbol_aliases=frozenset(state.protocol_symbol_aliases), + protocol_module_aliases=frozenset(state.protocol_module_aliases), + ) + + +def _collect_dead_candidates( + *, + filepath: str, + module_name: str, + collector: _qualnames.QualnameCollector, + protocol_symbol_aliases: frozenset[str] = frozenset({"Protocol"}), + protocol_module_aliases: frozenset[str] = frozenset( + {"typing", "typing_extensions"} + ), + suppression_rules_by_target: Mapping[SuppressionTargetKey, tuple[str, ...]] + | None = None, +) -> tuple[DeadCandidate, ...]: + protocol_class_qualnames = { + class_qualname + for class_qualname, class_node in collector.class_nodes + if _is_protocol_class( + class_node, + protocol_symbol_aliases=protocol_symbol_aliases, + protocol_module_aliases=protocol_module_aliases, + ) + } + + candidates: list[DeadCandidate] = [] + suppression_index = ( + suppression_rules_by_target if suppression_rules_by_target is not None else {} + ) + for local_name, node in collector.units: + candidate = 
_dead_candidate_for_unit( + module_name=module_name, + local_name=local_name, + node=node, + filepath=filepath, + suppression_index=suppression_index, + protocol_class_qualnames=protocol_class_qualnames, + ) + if candidate is not None: + candidates.append(candidate) + + for class_qualname, class_node in collector.class_nodes: + span = _node_line_span(class_node) + if span is not None: + start, end = span + candidates.append( + _build_dead_candidate( + module_name=module_name, + local_name=class_qualname, + node=class_node, + filepath=filepath, + kind="class", + suppression_index=suppression_index, + start_line=start, + end_line=end, + ) + ) + + return tuple( + sorted( + candidates, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + ), + ) + ) + + +def _collect_declaration_targets( + *, + filepath: str, + module_name: str, + collector: _qualnames.QualnameCollector, + source_tokens: tuple[tokenize.TokenInfo, ...] = (), + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, + include_inline_lines: bool = False, +) -> tuple[DeclarationTarget, ...]: + declarations: list[DeclarationTarget] = [] + declaration_specs: list[ + tuple[str, ast.AST, Literal["function", "method", "class"]] + ] = [ + ( + local_name, + node, + "method" if "." in local_name else "function", + ) + for local_name, node in collector.units + ] + declaration_specs.extend( + (class_qualname, class_node, "class") + for class_qualname, class_node in collector.class_nodes + ) + + for qualname_suffix, node, kind in declaration_specs: + start = int(getattr(node, "lineno", 0)) + end = int(getattr(node, "end_lineno", 0)) + if start > 0 and end > 0: + declaration_end_line = ( + _declaration_end_line( + node, + source_tokens=source_tokens, + source_token_index=source_token_index, + ) + if include_inline_lines + else None + ) + declarations.append( + DeclarationTarget( + filepath=filepath, + qualname=f"{module_name}:{qualname_suffix}", + start_line=start, + end_line=end, + kind=kind, + declaration_end_line=declaration_end_line, + ) + ) + + return tuple( + sorted( + declarations, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + item.kind, + ), + ) + ) + + +def _build_suppression_index_for_source( + *, + source: str, + filepath: str, + module_name: str, + collector: _qualnames.QualnameCollector, +) -> Mapping[SuppressionTargetKey, tuple[str, ...]]: + suppression_directives = extract_suppression_directives(source) + if not suppression_directives: + return {} + + needs_inline_binding = any( + directive.binding == "inline" for directive in suppression_directives + ) + source_tokens: tuple[tokenize.TokenInfo, ...] 
= () + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None + if needs_inline_binding: + source_tokens = _source_tokens(source) + if source_tokens: + source_token_index = _build_declaration_token_index(source_tokens) + + declaration_targets = _collect_declaration_targets( + filepath=filepath, + module_name=module_name, + collector=collector, + source_tokens=source_tokens, + source_token_index=source_token_index, + include_inline_lines=needs_inline_binding, + ) + suppression_bindings = bind_suppressions_to_declarations( + directives=suppression_directives, + declarations=declaration_targets, + ) + return build_suppression_index(suppression_bindings) diff --git a/codeclone/cfg.py b/codeclone/analysis/cfg.py similarity index 92% rename from codeclone/cfg.py rename to codeclone/analysis/cfg.py index f10811f..5da1933 100644 --- a/codeclone/cfg.py +++ b/codeclone/analysis/cfg.py @@ -8,10 +8,10 @@ import ast from dataclasses import dataclass -from typing import TYPE_CHECKING, Protocol, cast +from typing import TYPE_CHECKING +from ..meta_markers import CFG_META_PREFIX from .cfg_model import CFG, Block -from .meta_markers import CFG_META_PREFIX if TYPE_CHECKING: from collections.abc import Iterable @@ -21,13 +21,6 @@ TryStar = getattr(ast, "TryStar", ast.Try) -class _TryLike(Protocol): - body: list[ast.stmt] - handlers: list[ast.ExceptHandler] - orelse: list[ast.stmt] - finalbody: list[ast.stmt] - - @dataclass(slots=True) class _LoopContext: continue_target: Block @@ -105,9 +98,19 @@ def _visit(self, stmt: ast.stmt) -> None: self._visit_for(stmt) # Structure is identical to For case ast.Try(): - self._visit_try(cast("_TryLike", stmt)) + self._visit_try( + body=stmt.body, + handlers=stmt.handlers, + orelse=stmt.orelse, + finalbody=stmt.finalbody, + ) case _ if TryStar is not None and isinstance(stmt, TryStar): - self._visit_try(cast("_TryLike", cast("object", stmt))) + self._visit_try( + body=stmt.body, + handlers=stmt.handlers, + orelse=stmt.orelse, + finalbody=stmt.finalbody, + ) case ast.With() | ast.AsyncWith(): self._visit_with(stmt) @@ -261,18 +264,25 @@ def _visit_with(self, stmt: ast.With | ast.AsyncWith) -> None: self.current = after_block - def _visit_try(self, stmt: _TryLike) -> None: + def _visit_try( + self, + *, + body: list[ast.stmt], + handlers: list[ast.ExceptHandler], + orelse: list[ast.stmt], + finalbody: list[ast.stmt], + ) -> None: try_entry = self.cfg.create_block() self.current.add_successor(try_entry) self.current = try_entry - handler_test_blocks = [self.cfg.create_block() for _ in stmt.handlers] - handler_body_blocks = [self.cfg.create_block() for _ in stmt.handlers] - else_block = self.cfg.create_block() if stmt.orelse else None + handler_test_blocks = [self.cfg.create_block() for _ in handlers] + handler_body_blocks = [self.cfg.create_block() for _ in handlers] + else_block = self.cfg.create_block() if orelse else None final_block = self.cfg.create_block() for idx, (handler, test_block, body_block) in enumerate( - zip(stmt.handlers, handler_test_blocks, handler_body_blocks, strict=True) + zip(handlers, handler_test_blocks, handler_body_blocks, strict=True) ): test_block.statements.append(_meta_expr(f"TRY_HANDLER_INDEX:{idx}")) if handler.type is not None: @@ -290,7 +300,7 @@ def _visit_try(self, stmt: _TryLike) -> None: # Process each statement in try body # Link only statements that can raise to exception handlers - for stmt_node in stmt.body: + for stmt_node in body: if self.current.is_terminated: break @@ -307,7 +317,7 @@ def _visit_try(self, stmt: 
_TryLike) -> None: self.current.add_successor(final_block) # Process handlers - for handler, body_block in zip(stmt.handlers, handler_body_blocks, strict=True): + for handler, body_block in zip(handlers, handler_body_blocks, strict=True): self.current = body_block self._visit_statements(handler.body) if not self.current.is_terminated: @@ -316,14 +326,14 @@ def _visit_try(self, stmt: _TryLike) -> None: # Process else if else_block: self.current = else_block - self._visit_statements(stmt.orelse) + self._visit_statements(orelse) if not self.current.is_terminated: self.current.add_successor(final_block) # Process finally self.current = final_block - if stmt.finalbody: - self._visit_statements(stmt.finalbody) + if finalbody: + self._visit_statements(finalbody) def _visit_match(self, stmt: ast.Match) -> None: self.current.statements.append(ast.Expr(value=stmt.subject)) diff --git a/codeclone/cfg_model.py b/codeclone/analysis/cfg_model.py similarity index 100% rename from codeclone/cfg_model.py rename to codeclone/analysis/cfg_model.py diff --git a/codeclone/analysis/class_metrics.py b/codeclone/analysis/class_metrics.py new file mode 100644 index 0000000..d343ec7 --- /dev/null +++ b/codeclone/analysis/class_metrics.py @@ -0,0 +1,55 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import ast + +from ..metrics.cohesion import cohesion_risk, compute_lcom4 +from ..metrics.coupling import compute_cbo, coupling_risk +from ..models import ClassMetrics + + +def _node_line_span(node: ast.AST) -> tuple[int, int] | None: + start = int(getattr(node, "lineno", 0)) + end = int(getattr(node, "end_lineno", 0)) + if start <= 0 or end <= 0: + return None + return start, end + + +def _class_metrics_for_node( + *, + module_name: str, + class_qualname: str, + class_node: ast.ClassDef, + filepath: str, + module_import_names: set[str], + module_class_names: set[str], +) -> ClassMetrics | None: + span = _node_line_span(class_node) + if span is None: + return None + start, end = span + cbo, coupled_classes = compute_cbo( + class_node, + module_import_names=module_import_names, + module_class_names=module_class_names, + ) + lcom4, method_count, instance_var_count = compute_lcom4(class_node) + return ClassMetrics( + qualname=f"{module_name}:{class_qualname}", + filepath=filepath, + start_line=start, + end_line=end, + cbo=cbo, + lcom4=lcom4, + method_count=method_count, + instance_var_count=instance_var_count, + risk_coupling=coupling_risk(cbo), + risk_cohesion=cohesion_risk(lcom4), + coupled_classes=coupled_classes, + ) diff --git a/codeclone/analysis/fingerprint.py b/codeclone/analysis/fingerprint.py new file mode 100644 index 0000000..dff7dbc --- /dev/null +++ b/codeclone/analysis/fingerprint.py @@ -0,0 +1,81 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import hashlib + +from .. 
import qualnames as _qualnames
+from ..metrics.complexity import cyclomatic_complexity
+from .cfg import CFGBuilder
+from .normalizer import (
+    AstNormalizer,
+    NormalizationConfig,
+    normalized_ast_dump_from_list,
+)
+
+
+def sha1(s: str) -> str:
+    return hashlib.sha1(s.encode("utf-8")).hexdigest()
+
+
+def bucket_loc(loc: int) -> str:
+    # Coarse LOC bucket that keeps wildly different unit sizes out of one group
+    if loc < 20:
+        return "0-19"
+    if loc < 50:
+        return "20-49"
+    if loc < 100:
+        return "50-99"
+    return "100+"
+
+
+def _cfg_fingerprint_and_complexity(
+    node: _qualnames.FunctionNode,
+    cfg: NormalizationConfig,
+    qualname: str,
+) -> tuple[str, int]:
+    """
+    Generate a structural fingerprint and complexity for a function via CFG analysis.
+
+    The fingerprint is computed by:
+    1. Building a Control Flow Graph (CFG) from the function
+    2. Normalizing each CFG block's statements (variable names, constants, etc.)
+    3. Creating a canonical representation of the CFG structure
+    4. Hashing the representation with SHA-1
+
+    Functions with identical control flow and normalized statements will
+    produce the same fingerprint, even if they differ in variable names,
+    constants, or type annotations.
+
+    Args:
+        node: Function AST node to fingerprint
+        cfg: Normalization configuration (what to ignore)
+        qualname: Qualified name for logging/debugging
+
+    Returns:
+        Tuple of the 40-character hex SHA-1 hash of the normalized CFG and
+        the function's cyclomatic complexity.
+    """
+    builder = CFGBuilder()
+    graph = builder.build(qualname, node)
+    cfg_normalizer = AstNormalizer(cfg)
+
+    # Collect one canonical dump per block, in deterministic block-id order
+    parts: list[str] = []
+    for block in sorted(graph.blocks, key=lambda b: b.id):
+        succ_ids = ",".join(
+            str(s.id) for s in sorted(block.successors, key=lambda s: s.id)
+        )
+        block_dump = normalized_ast_dump_from_list(
+            block.statements,
+            cfg,
+            normalizer=cfg_normalizer,
+        )
+        parts.append(f"BLOCK[{block.id}]:{block_dump}|SUCCESSORS:{succ_ids}")
+    return sha1("|".join(parts)), cyclomatic_complexity(graph)
+
+
+_CFG_FINGERPRINT_AND_COMPLEXITY_IMPL = _cfg_fingerprint_and_complexity
diff --git a/codeclone/normalize.py b/codeclone/analysis/normalizer.py
similarity index 95%
rename from codeclone/normalize.py
rename to codeclone/analysis/normalizer.py
index 31f39e8..19e44b1 100644
--- a/codeclone/normalize.py
+++ b/codeclone/analysis/normalizer.py
@@ -11,9 +11,9 @@
 import hashlib
 from ast import AST
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
 
-from .meta_markers import CFG_META_PREFIX
+from ..meta_markers import CFG_META_PREFIX
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -92,11 +92,16 @@ def visit_Constant(self, node: ast.Constant) -> ast.Constant:
         node.value = "_CONST_"
         return node
 
+    def _visit_expr(self, node: ast.expr) -> ast.expr:
+        visited = self.visit(node)
+        assert isinstance(visited, ast.expr)
+        return visited
+
     def visit_Call(self, node: ast.Call) -> ast.Call:
         node.func = self._visit_call_target(node.func)
-        node.args = [cast("ast.expr", self.visit(arg)) for arg in node.args]
+        node.args = [self._visit_expr(arg) for arg in node.args]
         for kw in node.keywords:
-            kw.value = cast("ast.expr", self.visit(kw.value))
+            kw.value = self._visit_expr(kw.value)
         return node
 
     def _visit_call_target(self, node: ast.expr) -> ast.expr:
@@ -108,9 +113,9 @@ def _visit_call_target(self, node: ast.expr) -> ast.expr:
         if isinstance(value, (ast.Name, ast.Attribute)):
            node.value = self._visit_call_target(value)
         else:
-            node.value = cast("ast.expr", self.visit(value))
+            node.value =
self._visit_expr(value) return node - return cast("ast.expr", self.visit(node)) + return self._visit_expr(node) def visit_AugAssign(self, node: ast.AugAssign) -> AST: # Normalize x += 1 to x = x + 1 diff --git a/codeclone/analysis/parser.py b/codeclone/analysis/parser.py new file mode 100644 index 0000000..f8bbbb5 --- /dev/null +++ b/codeclone/analysis/parser.py @@ -0,0 +1,219 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import ast +import io +import math +import os +import signal +import tokenize +from contextlib import contextmanager +from typing import TYPE_CHECKING + +from ..contracts.errors import ParseError + +if TYPE_CHECKING: + from collections.abc import Iterator, Mapping + +PARSE_TIMEOUT_SECONDS = 5 + + +class _ParseTimeoutError(Exception): + pass + + +_DeclarationTokenIndexKey = tuple[int, int, str] +_DECLARATION_TOKEN_STRINGS = frozenset({"def", "async", "class"}) + + +def _consumed_cpu_seconds(resource_module: object) -> float: + """Return consumed CPU seconds for the current process.""" + try: + usage = resource_module.getrusage( # type: ignore[attr-defined] + resource_module.RUSAGE_SELF # type: ignore[attr-defined] + ) + return float(usage.ru_utime) + float(usage.ru_stime) + except Exception: + return 0.0 + + +@contextmanager +def _parse_limits(timeout_s: int) -> Iterator[None]: + if os.name != "posix" or timeout_s <= 0: + yield + return + + old_handler = signal.getsignal(signal.SIGALRM) + + def _timeout_handler(_signum: int, _frame: object) -> None: + raise _ParseTimeoutError("AST parsing timeout") + + old_limits: tuple[int, int] | None = None + try: + signal.signal(signal.SIGALRM, _timeout_handler) + signal.setitimer(signal.ITIMER_REAL, timeout_s) + + try: + import resource + + old_limits = resource.getrlimit(resource.RLIMIT_CPU) + soft, hard = old_limits + consumed_cpu_s = _consumed_cpu_seconds(resource) + desired_soft = max(1, timeout_s + math.ceil(consumed_cpu_s)) + if soft == resource.RLIM_INFINITY: + candidate_soft = desired_soft + else: + # Never reduce finite soft limits and avoid immediate SIGXCPU + # when the process already consumed more CPU than timeout_s. + candidate_soft = max(soft, desired_soft) + if hard == resource.RLIM_INFINITY: + new_soft = candidate_soft + else: + new_soft = min(max(1, hard), candidate_soft) + # Never lower hard limit: raising it back may be disallowed for + # unprivileged processes and can lead to process termination later. + resource.setrlimit(resource.RLIMIT_CPU, (new_soft, hard)) + except Exception: + # If resource is unavailable or cannot be set, rely on alarm only. 
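+            # setrlimit() can raise ValueError or PermissionError in
+            # restricted environments; the SIGALRM timer installed above
+            # still enforces the wall-clock timeout on its own.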
+ pass + + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + signal.signal(signal.SIGALRM, old_handler) + if old_limits is not None: + try: + import resource + + resource.setrlimit(resource.RLIMIT_CPU, old_limits) + except Exception: + pass + + +_PARSE_LIMITS_IMPL = _parse_limits + + +def _parse_with_limits(source: str, timeout_s: int) -> ast.AST: + try: + with _parse_limits(timeout_s): + return ast.parse(source) + except _ParseTimeoutError as e: + raise ParseError(str(e)) from e + + +_PARSE_WITH_LIMITS_IMPL = _parse_with_limits + + +def _source_tokens(source: str) -> tuple[tokenize.TokenInfo, ...]: + try: + return tuple(tokenize.generate_tokens(io.StringIO(source).readline)) + except tokenize.TokenError: + return () + + +_SOURCE_TOKENS_IMPL = _source_tokens + + +def _declaration_token_name(node: ast.AST) -> str: + if isinstance(node, ast.ClassDef): + return "class" + if isinstance(node, ast.AsyncFunctionDef): + return "async" + return "def" + + +def _declaration_token_index( + *, + source_tokens: tuple[tokenize.TokenInfo, ...], + start_line: int, + start_col: int, + declaration_token: str, + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, +) -> int | None: + if source_token_index is not None: + return source_token_index.get((start_line, start_col, declaration_token)) + for idx, token in enumerate(source_tokens): + if token.start != (start_line, start_col): + continue + if token.type == tokenize.NAME and token.string == declaration_token: + return idx + return None + + +def _build_declaration_token_index( + source_tokens: tuple[tokenize.TokenInfo, ...], +) -> Mapping[_DeclarationTokenIndexKey, int]: + indexed: dict[_DeclarationTokenIndexKey, int] = {} + for idx, token in enumerate(source_tokens): + if token.type == tokenize.NAME and token.string in _DECLARATION_TOKEN_STRINGS: + indexed[(token.start[0], token.start[1], token.string)] = idx + return indexed + + +def _scan_declaration_colon_line( + *, + source_tokens: tuple[tokenize.TokenInfo, ...], + start_index: int, +) -> int | None: + nesting = 0 + for token in source_tokens[start_index + 1 :]: + if token.type == tokenize.OP: + if token.string in "([{": + nesting += 1 + continue + if token.string in ")]}": + if nesting > 0: + nesting -= 1 + continue + if token.string == ":" and nesting == 0: + return token.start[0] + if token.type == tokenize.NEWLINE and nesting == 0: + return None + return None + + +def _fallback_declaration_end_line(node: ast.AST, *, start_line: int) -> int: + body = getattr(node, "body", None) + if not isinstance(body, list) or not body: + return start_line + + first_body_line = int(getattr(body[0], "lineno", 0)) + if first_body_line <= 0 or first_body_line == start_line: + return start_line + return max(start_line, first_body_line - 1) + + +def _declaration_end_line( + node: ast.AST, + *, + source_tokens: tuple[tokenize.TokenInfo, ...], + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, +) -> int: + start_line = int(getattr(node, "lineno", 0)) + start_col = int(getattr(node, "col_offset", 0)) + if start_line <= 0: + return 0 + + declaration_token = _declaration_token_name(node) + start_index = _declaration_token_index( + source_tokens=source_tokens, + start_line=start_line, + start_col=start_col, + declaration_token=declaration_token, + source_token_index=source_token_index, + ) + if start_index is None: + return _fallback_declaration_end_line(node, start_line=start_line) + + colon_line = _scan_declaration_colon_line( + source_tokens=source_tokens, + 
start_index=start_index, + ) + if colon_line is not None: + return colon_line + return _fallback_declaration_end_line(node, start_line=start_line) diff --git a/codeclone/analysis/security_surfaces.py b/codeclone/analysis/security_surfaces.py new file mode 100644 index 0000000..0827fab --- /dev/null +++ b/codeclone/analysis/security_surfaces.py @@ -0,0 +1,476 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import ast +from dataclasses import dataclass + +from ..models import ( + SecuritySurface, + SecuritySurfaceCategory, + SecuritySurfaceClassificationMode, + SecuritySurfaceEvidenceKind, + SecuritySurfaceLocationScope, +) + + +@dataclass(frozen=True, slots=True) +class _ImportRule: + module_prefix: str + category: SecuritySurfaceCategory + capability: str + + +@dataclass(frozen=True, slots=True) +class _CallRule: + symbol: str + category: SecuritySurfaceCategory + capability: str + prefix_match: bool = False + + +_BUILTIN_RULES: dict[str, tuple[SecuritySurfaceCategory, str]] = { + "__import__": ("dynamic_loading", "builtin_import"), + "compile": ("dynamic_execution", "dynamic_compile"), + "eval": ("dynamic_execution", "dynamic_eval"), + "exec": ("dynamic_execution", "dynamic_exec"), +} + +_IMPORT_RULES: tuple[_ImportRule, ...] = ( + _ImportRule("aiohttp", "network_boundary", "aiohttp_import"), + _ImportRule("asyncpg", "database_boundary", "asyncpg_import"), + _ImportRule("authlib", "identity_token", "authlib_import"), + _ImportRule("bcrypt", "identity_token", "bcrypt_import"), + _ImportRule("cloudpickle", "deserialization", "cloudpickle_import"), + _ImportRule("cryptography", "crypto_transport", "cryptography_import"), + _ImportRule("dill", "deserialization", "dill_import"), + _ImportRule("django.http", "network_boundary", "django_http_import"), + _ImportRule("fastapi", "network_boundary", "fastapi_import"), + _ImportRule("flask", "network_boundary", "flask_import"), + _ImportRule("grpc", "network_boundary", "grpc_import"), + _ImportRule("hmac", "crypto_transport", "hmac_import"), + _ImportRule("http.server", "network_boundary", "http_server_import"), + _ImportRule("httpx", "network_boundary", "httpx_import"), + _ImportRule("importlib", "dynamic_loading", "importlib_import"), + _ImportRule("itsdangerous", "identity_token", "itsdangerous_import"), + _ImportRule("jsonpickle", "deserialization", "jsonpickle_import"), + _ImportRule("jwt", "identity_token", "jwt_import"), + _ImportRule("marshal", "deserialization", "marshal_import"), + _ImportRule("OpenSSL", "crypto_transport", "openssl_import"), + _ImportRule("passlib", "identity_token", "passlib_import"), + _ImportRule("pickle", "deserialization", "pickle_import"), + _ImportRule("psycopg", "database_boundary", "psycopg_import"), + _ImportRule("psycopg2", "database_boundary", "psycopg2_import"), + _ImportRule("pymysql", "database_boundary", "pymysql_import"), + _ImportRule("redis", "database_boundary", "redis_import"), + _ImportRule("requests", "network_boundary", "requests_import"), + _ImportRule("ruamel.yaml", "deserialization", "ruamel_yaml_import"), + _ImportRule("runpy", "dynamic_loading", "runpy_import"), + _ImportRule("secrets", "crypto_transport", "secrets_import"), + _ImportRule("shelve", "deserialization", "shelve_import"), + _ImportRule("socket", 
"network_boundary", "socket_import"), + _ImportRule("sqlalchemy", "database_boundary", "sqlalchemy_import"), + _ImportRule("sqlite3", "database_boundary", "sqlite3_import"), + _ImportRule("ssl", "crypto_transport", "ssl_import"), + _ImportRule("subprocess", "process_boundary", "subprocess_import"), + _ImportRule("tarfile", "archive_extraction", "tarfile_import"), + _ImportRule("websockets", "network_boundary", "websockets_import"), + _ImportRule("urllib", "network_boundary", "urllib_import"), + _ImportRule("yaml", "deserialization", "yaml_import"), + _ImportRule("zipfile", "archive_extraction", "zipfile_import"), +) + +_CALL_RULES: tuple[_CallRule, ...] = ( + _CallRule( + "asyncio.create_subprocess_exec", "process_boundary", "asyncio_subprocess_exec" + ), + _CallRule( + "asyncio.create_subprocess_shell", + "process_boundary", + "asyncio_subprocess_shell", + ), + _CallRule("cloudpickle.load", "deserialization", "cloudpickle_load"), + _CallRule("cloudpickle.loads", "deserialization", "cloudpickle_loads"), + _CallRule("dill.load", "deserialization", "dill_load"), + _CallRule("dill.loads", "deserialization", "dill_loads"), + _CallRule("importlib.import_module", "dynamic_loading", "import_module"), + _CallRule( + "importlib.util.spec_from_file_location", + "dynamic_loading", + "import_spec_from_file", + ), + _CallRule("jsonpickle.decode", "deserialization", "jsonpickle_decode"), + _CallRule("marshal.load", "deserialization", "marshal_load"), + _CallRule("marshal.loads", "deserialization", "marshal_loads"), + _CallRule("os.chmod", "filesystem_mutation", "os_chmod"), + _CallRule("os.chown", "filesystem_mutation", "os_chown"), + _CallRule("os.makedirs", "filesystem_mutation", "os_makedirs"), + _CallRule("os.remove", "filesystem_mutation", "os_remove"), + _CallRule("os.rename", "filesystem_mutation", "os_rename"), + _CallRule("os.replace", "filesystem_mutation", "os_replace"), + _CallRule("os.rmdir", "filesystem_mutation", "os_rmdir"), + _CallRule("os.spawn", "process_boundary", "os_spawn", prefix_match=True), + _CallRule("os.system", "process_boundary", "os_system"), + _CallRule("os.unlink", "filesystem_mutation", "os_unlink"), + _CallRule("pathlib.Path.chmod", "filesystem_mutation", "pathlib_chmod"), + _CallRule("pathlib.Path.mkdir", "filesystem_mutation", "pathlib_mkdir"), + _CallRule("pathlib.Path.open", "filesystem_mutation", "pathlib_open_write"), + _CallRule("pathlib.Path.rename", "filesystem_mutation", "pathlib_rename"), + _CallRule("pathlib.Path.replace", "filesystem_mutation", "pathlib_replace"), + _CallRule("pathlib.Path.rmdir", "filesystem_mutation", "pathlib_rmdir"), + _CallRule("pathlib.Path.touch", "filesystem_mutation", "pathlib_touch"), + _CallRule("pathlib.Path.unlink", "filesystem_mutation", "pathlib_unlink"), + _CallRule("pathlib.Path.write_bytes", "filesystem_mutation", "pathlib_write_bytes"), + _CallRule("pathlib.Path.write_text", "filesystem_mutation", "pathlib_write_text"), + _CallRule("pickle.load", "deserialization", "pickle_load"), + _CallRule("pickle.loads", "deserialization", "pickle_loads"), + _CallRule("pty.spawn", "process_boundary", "pty_spawn"), + _CallRule("runpy.run_module", "dynamic_loading", "run_module"), + _CallRule("runpy.run_path", "dynamic_loading", "run_path"), + _CallRule("shutil.move", "filesystem_mutation", "shutil_move"), + _CallRule("shutil.rmtree", "filesystem_mutation", "shutil_rmtree"), + _CallRule("shutil.unpack_archive", "archive_extraction", "unpack_archive"), + _CallRule("subprocess.call", "process_boundary", "subprocess_call"), + 
_CallRule("subprocess.check_call", "process_boundary", "subprocess_check_call"), + _CallRule("subprocess.check_output", "process_boundary", "subprocess_check_output"), + _CallRule("subprocess.Popen", "process_boundary", "subprocess_popen"), + _CallRule("subprocess.run", "process_boundary", "subprocess_run"), + _CallRule( + "tarfile.open.extract", "archive_extraction", "tar_extract", prefix_match=True + ), + _CallRule("tempfile.mkdtemp", "filesystem_mutation", "tempfile_mkdtemp"), + _CallRule( + "tempfile.NamedTemporaryFile", + "filesystem_mutation", + "tempfile_named_temporary_file", + ), + _CallRule("yaml.load", "deserialization", "yaml_load"), + _CallRule("yaml.unsafe_load", "deserialization", "yaml_unsafe_load"), + _CallRule( + "zipfile.ZipFile.extract", + "archive_extraction", + "zip_extract", + prefix_match=True, + ), +) + + +def _node_start_line(node: ast.AST) -> int | None: + line = getattr(node, "lineno", None) + if isinstance(line, int) and line > 0: + return line + return None + + +def _node_end_line(node: ast.AST) -> int: + start_line = _node_start_line(node) + if start_line is None: + return 0 + end_line = getattr(node, "end_lineno", None) + return ( + end_line if isinstance(end_line, int) and end_line >= start_line else start_line + ) + + +def _is_type_checking_guard(test: ast.AST) -> bool: + match test: + case ast.Name(id="TYPE_CHECKING"): + return True + case ast.Attribute(value=ast.Name(id="typing"), attr="TYPE_CHECKING"): + return True + case _: + return False + + +def _matches_import_prefix(imported_name: str, module_prefix: str) -> bool: + return imported_name == module_prefix or imported_name.startswith( + module_prefix + "." + ) + + +def _matches_call_rule(symbol: str, rule: _CallRule) -> bool: + return symbol == rule.symbol or ( + rule.prefix_match and symbol.startswith(rule.symbol) + ) + + +class _SecuritySurfaceVisitor(ast.NodeVisitor): + __slots__ = ( + "_aliases", + "_callable_depth", + "_class_depth", + "_filepath", + "_module_name", + "_scope_stack", + "_seen", + "items", + ) + + def __init__(self, *, module_name: str, filepath: str) -> None: + self._aliases: dict[str, str] = {} + self._module_name = module_name + self._filepath = filepath + self._scope_stack: list[str] = [] + self._callable_depth = 0 + self._class_depth = 0 + self._seen: set[ + tuple[ + str, + str, + str, + int, + int, + str, + str, + str, + ] + ] = set() + self.items: list[SecuritySurface] = [] + + def _current_scope(self) -> tuple[str, SecuritySurfaceLocationScope]: + if not self._scope_stack: + return self._module_name, "module" + return ( + f"{self._module_name}:{'.'.join(self._scope_stack)}", + "callable" if self._callable_depth > 0 else "class", + ) + + def _emit( + self, + *, + category: SecuritySurfaceCategory, + capability: str, + node: ast.AST, + classification_mode: SecuritySurfaceClassificationMode, + evidence_kind: SecuritySurfaceEvidenceKind, + evidence_symbol: str, + ) -> None: + start_line = _node_start_line(node) + if start_line is None: + return + qualname, location_scope = self._current_scope() + key = ( + category, + capability, + qualname, + start_line, + _node_end_line(node), + classification_mode, + evidence_kind, + evidence_symbol, + ) + if key in self._seen: + return + self._seen.add(key) + self.items.append( + SecuritySurface( + category=category, + capability=capability, + module=self._module_name, + filepath=self._filepath, + qualname=qualname, + start_line=start_line, + end_line=_node_end_line(node), + location_scope=location_scope, + 
classification_mode=classification_mode, + evidence_kind=evidence_kind, + evidence_symbol=evidence_symbol, + ) + ) + + def _register_import_alias(self, *, bound_name: str, imported_name: str) -> None: + clean_bound = bound_name.strip() + clean_imported = imported_name.strip() + if clean_bound and clean_imported: + self._aliases[clean_bound] = clean_imported + + def _emit_import_matches(self, *, imported_name: str, node: ast.AST) -> None: + for rule in _IMPORT_RULES: + if _matches_import_prefix(imported_name, rule.module_prefix): + self._emit( + category=rule.category, + capability=rule.capability, + node=node, + classification_mode="exact_import", + evidence_kind="import", + evidence_symbol=imported_name, + ) + + def _resolve_expr_symbol(self, node: ast.AST) -> str | None: + match node: + case ast.Name(id=name): + resolved = self._aliases.get(name) + if resolved is not None: + return resolved + if name in _BUILTIN_RULES or name == "open": + return name + return None + case ast.Attribute(value=value, attr=attr): + parent = self._resolve_expr_symbol(value) + if parent is None: + return None + return f"{parent}.{attr}" + case ast.Call(func=func): + return self._resolve_expr_symbol(func) + case _: + return None + + def _mode_from_open_call(self, node: ast.Call) -> str | None: + mode_arg: ast.AST | None = None + if len(node.args) >= 2: + mode_arg = node.args[1] + else: + for keyword in node.keywords: + if keyword.arg == "mode": + mode_arg = keyword.value + break + if not isinstance(mode_arg, ast.Constant) or not isinstance( + mode_arg.value, str + ): + return None + mode = mode_arg.value + if any(marker in mode for marker in ("w", "a", "x", "+")): + return mode + return None + + def _emit_call_matches(self, node: ast.Call) -> None: + symbol = self._resolve_expr_symbol(node.func) + if symbol is None: + return + if symbol in _BUILTIN_RULES: + category, capability = _BUILTIN_RULES[symbol] + self._emit( + category=category, + capability=capability, + node=node, + classification_mode="exact_builtin", + evidence_kind="builtin", + evidence_symbol=symbol, + ) + if symbol in {"open", "pathlib.Path.open"}: + mode = self._mode_from_open_call(node) + if mode is not None: + capability = ( + "pathlib_open_write" + if symbol == "pathlib.Path.open" + else "builtin_open_write" + ) + self._emit( + category="filesystem_mutation", + capability=capability, + node=node, + classification_mode="exact_call", + evidence_kind="call", + evidence_symbol=f"{symbol}[mode={mode}]", + ) + for rule in _CALL_RULES: + if _matches_call_rule(symbol, rule): + self._emit( + category=rule.category, + capability=rule.capability, + node=node, + classification_mode="exact_call", + evidence_kind="call", + evidence_symbol=symbol, + ) + + def visit_If(self, node: ast.If) -> None: + if _is_type_checking_guard(node.test): + for child in node.orelse: + self.visit(child) + return + self.generic_visit(node) + + def visit_Import(self, node: ast.Import) -> None: + for alias in node.names: + full_name = alias.name.strip() + if not full_name: + continue + bound_name = alias.asname or full_name.split(".", maxsplit=1)[0] + self._register_import_alias( + bound_name=bound_name, + imported_name=full_name if alias.asname else bound_name, + ) + self._emit_import_matches(imported_name=full_name, node=node) + + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: + if ( + node.level != 0 + or not isinstance(node.module, str) + or not node.module.strip() + ): + return + module_name = node.module.strip() + for alias in node.names: + if alias.name == "*": 
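+                # A star import binds names we cannot enumerate statically,
+                # so there is no alias to record and nothing to classify.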
+ continue + full_name = f"{module_name}.{alias.name}" + self._register_import_alias( + bound_name=alias.asname or alias.name, + imported_name=full_name, + ) + self._emit_import_matches(imported_name=full_name, node=node) + + def _visit_scoped_node( + self, + node: ast.AST, + *, + scope_name: str, + is_callable: bool, + ) -> None: + self._scope_stack.append(scope_name) + if is_callable: + self._callable_depth += 1 + else: + self._class_depth += 1 + self.generic_visit(node) + if is_callable: + self._callable_depth -= 1 + else: + self._class_depth -= 1 + self._scope_stack.pop() + + def visit_ClassDef(self, node: ast.ClassDef) -> None: + self._visit_scoped_node(node, scope_name=node.name, is_callable=False) + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + self._visit_scoped_node(node, scope_name=node.name, is_callable=True) + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: + self._visit_scoped_node(node, scope_name=node.name, is_callable=True) + + def visit_Call(self, node: ast.Call) -> None: + self._emit_call_matches(node) + self.generic_visit(node) + + +def collect_security_surfaces( + *, + tree: ast.Module, + module_name: str, + filepath: str, +) -> tuple[SecuritySurface, ...]: + visitor = _SecuritySurfaceVisitor(module_name=module_name, filepath=filepath) + visitor.visit(tree) + return tuple( + sorted( + visitor.items, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + item.category, + item.capability, + item.evidence_symbol, + item.classification_mode, + ), + ) + ) + + +__all__ = ["collect_security_surfaces"] diff --git a/codeclone/suppressions.py b/codeclone/analysis/suppressions.py similarity index 96% rename from codeclone/suppressions.py rename to codeclone/analysis/suppressions.py index 0b45987..b717f6b 100644 --- a/codeclone/suppressions.py +++ b/codeclone/analysis/suppressions.py @@ -31,7 +31,7 @@ SuppressionTargetKey = tuple[str, str, int, int, DeclarationKind] _SUPPRESSION_DIRECTIVE_PATTERN: Final[re.Pattern[str]] = re.compile( - r"^\s*#\s*codeclone\s*:\s*ignore\s*\[(?P[^\]]+)\]\s*$" + r"^\s*#\s*codeclone\s*:\s*ignore\s*\[(?P[^]]+)]\s*$" ) _RULE_ID_PATTERN: Final[re.Pattern[str]] = re.compile(r"^[a-z0-9][a-z0-9-]*$") @@ -174,7 +174,7 @@ def _declaration_inline_lines(target: DeclarationTarget) -> tuple[int, ...]: end_line = target.declaration_end_line or target.start_line if end_line <= 0 or end_line == target.start_line: return (target.start_line,) - return (target.start_line, end_line) + return target.start_line, end_line def _bound_inline_rules( @@ -250,7 +250,7 @@ def suppression_target_key( end_line: int, kind: DeclarationKind, ) -> SuppressionTargetKey: - return (filepath, qualname, start_line, end_line, kind) + return filepath, qualname, start_line, end_line, kind def build_suppression_index( @@ -265,6 +265,5 @@ def build_suppression_index( end_line=binding.end_line, kind=binding.kind, ) - existing = index.get(key, ()) - index[key] = _merge_rules(existing, binding.rules) + index[key] = _merge_rules(index.get(key, ()), binding.rules) return index diff --git a/codeclone/analysis/units.py b/codeclone/analysis/units.py new file mode 100644 index 0000000..88d9b4b --- /dev/null +++ b/codeclone/analysis/units.py @@ -0,0 +1,323 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import ast +from hashlib import sha1 as _sha1 + +from .. import qualnames as _qualnames +from ..blocks import extract_blocks, extract_segments +from ..contracts import ( + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, +) +from ..contracts.errors import ParseError +from ..findings.structural.detectors import scan_function_structure +from ..metrics.adoption import collect_module_adoption +from ..metrics.api_surface import collect_module_api_surface +from ..metrics.complexity import risk_level +from ..models import ( + BlockUnit, + ClassMetrics, + FileMetrics, + SegmentUnit, + SourceStats, + StructuralFindingGroup, + Unit, +) +from ..paths import is_test_filepath +from ._module_walk import ( + _build_suppression_index_for_source, + _collect_dead_candidates, + _collect_module_walk_data, +) +from .class_metrics import _class_metrics_for_node, _node_line_span +from .fingerprint import _cfg_fingerprint_and_complexity, bucket_loc +from .normalizer import NormalizationConfig, stmt_hashes +from .parser import PARSE_TIMEOUT_SECONDS, _parse_with_limits +from .security_surfaces import collect_security_surfaces + +__all__ = ["extract_units_and_stats_from_source"] + + +def _stmt_count(node: ast.AST) -> int: + body = getattr(node, "body", None) + return len(body) if isinstance(body, list) else 0 + + +_STMT_COUNT_IMPL = _stmt_count + + +def _raw_source_hash_for_range( + source_lines: list[str], + start_line: int, + end_line: int, +) -> str: + window = "".join(source_lines[start_line - 1 : end_line]).strip() + no_space = "".join(window.split()) + return _sha1(no_space.encode("utf-8")).hexdigest() + + +def _eligible_unit_shape( + node: _qualnames.FunctionNode, + *, + min_loc: int, + min_stmt: int, +) -> tuple[int, int, int, int] | None: + span = _node_line_span(node) + if span is None: + return None + start, end = span + if end < start: + return None + loc = end - start + 1 + stmt_count = _stmt_count(node) + if loc < min_loc or stmt_count < min_stmt: + return None + return start, end, loc, stmt_count + + +def extract_units_and_stats_from_source( + source: str, + filepath: str, + module_name: str, + cfg: NormalizationConfig, + min_loc: int, + min_stmt: int, + *, + block_min_loc: int = DEFAULT_BLOCK_MIN_LOC, + block_min_stmt: int = DEFAULT_BLOCK_MIN_STMT, + segment_min_loc: int = DEFAULT_SEGMENT_MIN_LOC, + segment_min_stmt: int = DEFAULT_SEGMENT_MIN_STMT, + collect_structural_findings: bool = True, + collect_api_surface: bool = False, + api_include_private_modules: bool = False, +) -> tuple[ + list[Unit], + list[BlockUnit], + list[SegmentUnit], + SourceStats, + FileMetrics, + list[StructuralFindingGroup], +]: + try: + tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS) + except SyntaxError as e: + raise ParseError(f"Failed to parse {filepath}: {e}") from e + if not isinstance(tree, ast.Module): + raise ParseError(f"Failed to parse {filepath}: expected module AST root") + + collector = _qualnames.QualnameCollector() + collector.visit(tree) + source_lines = source.splitlines() + source_line_count = len(source_lines) + + is_test_file = is_test_filepath(filepath) + + # Single-pass AST walk replaces 3 separate functions / 4 walks. 
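+    # One traversal collects import names, module dependencies, referenced
+    # names/qualnames, and the protocol alias maps unpacked below.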
+ _walk = _collect_module_walk_data( + tree=tree, + module_name=module_name, + collector=collector, + collect_referenced_names=not is_test_file, + ) + import_names = _walk.import_names + module_deps = _walk.module_deps + referenced_names = _walk.referenced_names + referenced_qualnames = _walk.referenced_qualnames + protocol_symbol_aliases = _walk.protocol_symbol_aliases + protocol_module_aliases = _walk.protocol_module_aliases + + suppression_index = _build_suppression_index_for_source( + source=source, + filepath=filepath, + module_name=module_name, + collector=collector, + ) + class_names = frozenset(class_node.name for _, class_node in collector.class_nodes) + module_import_names = set(import_names) + module_class_names = set(class_names) + class_metrics: list[ClassMetrics] = [] + + units: list[Unit] = [] + block_units: list[BlockUnit] = [] + segment_units: list[SegmentUnit] = [] + structural_findings: list[StructuralFindingGroup] = [] + + for local_name, node in collector.units: + unit_shape = _eligible_unit_shape( + node, + min_loc=min_loc, + min_stmt=min_stmt, + ) + if unit_shape is None: + continue + start, end, loc, stmt_count = unit_shape + + qualname = f"{module_name}:{local_name}" + fingerprint, complexity = _cfg_fingerprint_and_complexity(node, cfg, qualname) + structure_facts = scan_function_structure( + node, + filepath, + qualname, + collect_findings=collect_structural_findings, + ) + depth = structure_facts.nesting_depth + risk = risk_level(complexity) + raw_hash = _raw_source_hash_for_range(source_lines, start, end) + + units.append( + Unit( + qualname=qualname, + filepath=filepath, + start_line=start, + end_line=end, + loc=loc, + stmt_count=stmt_count, + fingerprint=fingerprint, + loc_bucket=bucket_loc(loc), + cyclomatic_complexity=complexity, + nesting_depth=depth, + risk=risk, + raw_hash=raw_hash, + entry_guard_count=structure_facts.entry_guard_count, + entry_guard_terminal_profile=( + structure_facts.entry_guard_terminal_profile + ), + entry_guard_has_side_effect_before=( + structure_facts.entry_guard_has_side_effect_before + ), + terminal_kind=structure_facts.terminal_kind, + try_finally_profile=structure_facts.try_finally_profile, + side_effect_order_profile=structure_facts.side_effect_order_profile, + ) + ) + + needs_blocks = ( + not local_name.endswith("__init__") + and loc >= block_min_loc + and stmt_count >= block_min_stmt + ) + needs_segments = loc >= segment_min_loc and stmt_count >= segment_min_stmt + + if needs_blocks or needs_segments: + body = getattr(node, "body", None) + hashes: list[str] | None = None + if isinstance(body, list): + hashes = stmt_hashes(body, cfg) + + if needs_blocks: + block_units.extend( + extract_blocks( + node, + filepath=filepath, + qualname=qualname, + cfg=cfg, + block_size=4, + max_blocks=15, + precomputed_hashes=hashes, + ) + ) + + if needs_segments: + segment_units.extend( + extract_segments( + node, + filepath=filepath, + qualname=qualname, + cfg=cfg, + window_size=6, + max_segments=60, + precomputed_hashes=hashes, + ) + ) + + if collect_structural_findings: + structural_findings.extend(structure_facts.structural_findings) + + for class_qualname, class_node in collector.class_nodes: + class_metric = _class_metrics_for_node( + module_name=module_name, + class_qualname=class_qualname, + class_node=class_node, + filepath=filepath, + module_import_names=module_import_names, + module_class_names=module_class_names, + ) + if class_metric is not None: + class_metrics.append(class_metric) + + dead_candidates = _collect_dead_candidates( 
+ filepath=filepath, + module_name=module_name, + collector=collector, + protocol_symbol_aliases=protocol_symbol_aliases, + protocol_module_aliases=protocol_module_aliases, + suppression_rules_by_target=suppression_index, + ) + + sorted_class_metrics = tuple( + sorted( + class_metrics, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + ), + ) + ) + typing_coverage, docstring_coverage = collect_module_adoption( + tree=tree, + module_name=module_name, + filepath=filepath, + collector=collector, + imported_names=import_names, + ) + api_surface = None + if collect_api_surface: + api_surface = collect_module_api_surface( + tree=tree, + module_name=module_name, + filepath=filepath, + collector=collector, + imported_names=import_names, + include_private_modules=api_include_private_modules, + ) + security_surfaces = collect_security_surfaces( + tree=tree, + module_name=module_name, + filepath=filepath, + ) + + return ( + units, + block_units, + segment_units, + SourceStats( + lines=source_line_count, + functions=collector.function_count, + methods=collector.method_count, + classes=collector.class_count, + ), + FileMetrics( + class_metrics=sorted_class_metrics, + module_deps=module_deps, + dead_candidates=dead_candidates, + referenced_names=referenced_names, + import_names=import_names, + class_names=class_names, + security_surfaces=security_surfaces, + referenced_qualnames=referenced_qualnames, + typing_coverage=typing_coverage, + docstring_coverage=docstring_coverage, + api_surface=api_surface, + ), + structural_findings, + ) diff --git a/codeclone/baseline/__init__.py b/codeclone/baseline/__init__.py new file mode 100644 index 0000000..88891d4 --- /dev/null +++ b/codeclone/baseline/__init__.py @@ -0,0 +1,25 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from .clone_baseline import Baseline +from .trust import ( + BASELINE_GENERATOR, + BASELINE_UNTRUSTED_STATUSES, + MAX_BASELINE_SIZE_BYTES, + BaselineStatus, + coerce_baseline_status, + current_python_tag, +) + +__all__ = [ + "BASELINE_GENERATOR", + "BASELINE_UNTRUSTED_STATUSES", + "MAX_BASELINE_SIZE_BYTES", + "Baseline", + "BaselineStatus", + "coerce_baseline_status", + "current_python_tag", +] diff --git a/codeclone/baseline/_metrics_baseline_contract.py b/codeclone/baseline/_metrics_baseline_contract.py new file mode 100644 index 0000000..6ba78a1 --- /dev/null +++ b/codeclone/baseline/_metrics_baseline_contract.py @@ -0,0 +1,100 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from enum import Enum +from typing import Final + +METRICS_BASELINE_GENERATOR: Final = "codeclone" +MAX_METRICS_BASELINE_SIZE_BYTES: Final = 5 * 1024 * 1024 + + +class MetricsBaselineStatus(str, Enum): + OK = "ok" + MISSING = "missing" + TOO_LARGE = "too_large" + INVALID_JSON = "invalid_json" + INVALID_TYPE = "invalid_type" + MISSING_FIELDS = "missing_fields" + MISMATCH_SCHEMA_VERSION = "mismatch_schema_version" + MISMATCH_PYTHON_VERSION = "mismatch_python_version" + GENERATOR_MISMATCH = "generator_mismatch" + INTEGRITY_MISSING = "integrity_missing" + INTEGRITY_FAILED = "integrity_failed" + + +METRICS_BASELINE_UNTRUSTED_STATUSES: Final[frozenset[MetricsBaselineStatus]] = ( + frozenset( + { + MetricsBaselineStatus.MISSING, + MetricsBaselineStatus.TOO_LARGE, + MetricsBaselineStatus.INVALID_JSON, + MetricsBaselineStatus.INVALID_TYPE, + MetricsBaselineStatus.MISSING_FIELDS, + MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION, + MetricsBaselineStatus.MISMATCH_PYTHON_VERSION, + MetricsBaselineStatus.GENERATOR_MISMATCH, + MetricsBaselineStatus.INTEGRITY_MISSING, + MetricsBaselineStatus.INTEGRITY_FAILED, + } + ) +) + +_TOP_LEVEL_REQUIRED_KEYS = frozenset({"meta", "metrics"}) +_TOP_LEVEL_ALLOWED_KEYS = _TOP_LEVEL_REQUIRED_KEYS | frozenset( + {"clones", "api_surface"} +) +_META_REQUIRED_KEYS = frozenset( + {"generator", "schema_version", "python_tag", "created_at", "payload_sha256"} +) +_METRICS_REQUIRED_KEYS = frozenset( + { + "max_complexity", + "high_risk_functions", + "max_coupling", + "high_coupling_classes", + "max_cohesion", + "low_cohesion_classes", + "dependency_cycles", + "dependency_max_depth", + "dead_code_items", + "health_score", + "health_grade", + } +) +_METRICS_OPTIONAL_KEYS = frozenset( + { + "typing_param_permille", + "typing_return_permille", + "docstring_permille", + "typing_any_count", + } +) +_METRICS_PAYLOAD_SHA256_KEY = "metrics_payload_sha256" +_API_SURFACE_PAYLOAD_SHA256_KEY = "api_surface_payload_sha256" + + +def coerce_metrics_baseline_status( + raw_status: str | MetricsBaselineStatus | None, +) -> MetricsBaselineStatus: + if isinstance(raw_status, MetricsBaselineStatus): + return raw_status + if isinstance(raw_status, str): + try: + return MetricsBaselineStatus(raw_status) + except ValueError: + return MetricsBaselineStatus.INVALID_TYPE + return MetricsBaselineStatus.INVALID_TYPE + + +__all__ = [ + "MAX_METRICS_BASELINE_SIZE_BYTES", + "METRICS_BASELINE_GENERATOR", + "METRICS_BASELINE_UNTRUSTED_STATUSES", + "MetricsBaselineStatus", + "coerce_metrics_baseline_status", +] diff --git a/codeclone/baseline/_metrics_baseline_payload.py b/codeclone/baseline/_metrics_baseline_payload.py new file mode 100644 index 0000000..4f24864 --- /dev/null +++ b/codeclone/baseline/_metrics_baseline_payload.py @@ -0,0 +1,243 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import hashlib +from pathlib import Path + +import orjson + +from ..cache.projection import wire_filepath_from_runtime +from ..models import ApiSurfaceSnapshot, MetricsSnapshot, ProjectMetrics +from ._metrics_baseline_contract import _API_SURFACE_PAYLOAD_SHA256_KEY + + +def snapshot_from_project_metrics(project_metrics: ProjectMetrics) -> MetricsSnapshot: + return MetricsSnapshot( + max_complexity=int(project_metrics.complexity_max), + high_risk_functions=tuple(sorted(set(project_metrics.high_risk_functions))), + max_coupling=int(project_metrics.coupling_max), + high_coupling_classes=tuple(sorted(set(project_metrics.high_risk_classes))), + max_cohesion=int(project_metrics.cohesion_max), + low_cohesion_classes=tuple(sorted(set(project_metrics.low_cohesion_classes))), + dependency_cycles=tuple( + sorted({tuple(cycle) for cycle in project_metrics.dependency_cycles}) + ), + dependency_max_depth=int(project_metrics.dependency_max_depth), + dead_code_items=tuple( + sorted({item.qualname for item in project_metrics.dead_code}) + ), + health_score=int(project_metrics.health.total), + health_grade=project_metrics.health.grade, + typing_param_permille=_permille( + project_metrics.typing_param_annotated, + project_metrics.typing_param_total, + ), + typing_return_permille=_permille( + project_metrics.typing_return_annotated, + project_metrics.typing_return_total, + ), + docstring_permille=_permille( + project_metrics.docstring_public_documented, + project_metrics.docstring_public_total, + ), + typing_any_count=int(project_metrics.typing_any_count), + ) + + +def _permille(numerator: int, denominator: int) -> int: + if denominator <= 0: + return 0 + return round((1000.0 * float(numerator)) / float(denominator)) + + +def _canonical_json(payload: object) -> str: + return orjson.dumps(payload, option=orjson.OPT_SORT_KEYS).decode("utf-8") + + +def _snapshot_payload( + snapshot: MetricsSnapshot, + *, + include_adoption: bool = True, +) -> dict[str, object]: + payload: dict[str, object] = { + "max_complexity": int(snapshot.max_complexity), + "high_risk_functions": list(snapshot.high_risk_functions), + "max_coupling": int(snapshot.max_coupling), + "high_coupling_classes": list(snapshot.high_coupling_classes), + "max_cohesion": int(snapshot.max_cohesion), + "low_cohesion_classes": list(snapshot.low_cohesion_classes), + "dependency_cycles": [list(cycle) for cycle in snapshot.dependency_cycles], + "dependency_max_depth": int(snapshot.dependency_max_depth), + "dead_code_items": list(snapshot.dead_code_items), + "health_score": int(snapshot.health_score), + "health_grade": snapshot.health_grade, + } + if include_adoption: + payload.update( + { + "typing_param_permille": int(snapshot.typing_param_permille), + "typing_return_permille": int(snapshot.typing_return_permille), + "docstring_permille": int(snapshot.docstring_permille), + "typing_any_count": int(snapshot.typing_any_count), + } + ) + return payload + + +def _compute_payload_sha256( + snapshot: MetricsSnapshot, + *, + include_adoption: bool = True, +) -> str: + canonical = _canonical_json( + _snapshot_payload(snapshot, include_adoption=include_adoption) + ) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest() + + +def _has_coverage_adoption_snapshot(metrics_obj: dict[str, object]) -> bool: + return all( + key in metrics_obj + for key in ( + "typing_param_permille", + "typing_return_permille", + "docstring_permille", + ) + ) + + +def 
_api_surface_snapshot_payload( + snapshot: ApiSurfaceSnapshot, + *, + root: Path | None = None, + legacy_qualname: bool = False, +) -> dict[str, object]: + return { + "modules": [ + { + "module": module.module, + "filepath": wire_filepath_from_runtime(module.filepath, root=root), + "all_declared": list(module.all_declared or ()), + "symbols": [ + { + ("qualname" if legacy_qualname else "local_name"): ( + symbol.qualname + if legacy_qualname + else _local_name_from_qualname( + module=module.module, + qualname=symbol.qualname, + ) + ), + "kind": symbol.kind, + "start_line": symbol.start_line, + "end_line": symbol.end_line, + "params": [ + { + "name": param.name, + "kind": param.kind, + "has_default": param.has_default, + "annotation_hash": param.annotation_hash, + } + for param in symbol.params + ], + "returns_hash": symbol.returns_hash, + "exported_via": symbol.exported_via, + } + for symbol in sorted( + module.symbols, + key=lambda item: item.qualname, + ) + ], + } + for module in sorted( + snapshot.modules, + key=lambda item: (item.filepath, item.module), + ) + ] + } + + +def _compute_api_surface_payload_sha256( + snapshot: ApiSurfaceSnapshot, + *, + root: Path | None = None, +) -> str: + canonical = _canonical_json(_api_surface_snapshot_payload(snapshot, root=root)) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest() + + +def _compute_legacy_api_surface_payload_sha256( + snapshot: ApiSurfaceSnapshot, + *, + root: Path | None = None, +) -> str: + canonical = _canonical_json( + _api_surface_snapshot_payload(snapshot, root=root, legacy_qualname=True) + ) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest() + + +def _compose_api_surface_qualname(*, module: str, local_name: str) -> str: + return f"{module}:{local_name}" + + +def _local_name_from_qualname(*, module: str, qualname: str) -> str: + prefix = f"{module}:" + if qualname.startswith(prefix): + return qualname[len(prefix) :] + return qualname + + +def _build_payload( + *, + snapshot: MetricsSnapshot, + schema_version: str, + python_tag: str, + generator_name: str, + generator_version: str, + created_at: str, + include_adoption: bool = True, + api_surface_snapshot: ApiSurfaceSnapshot | None = None, + api_surface_root: Path | None = None, +) -> dict[str, object]: + payload_sha256 = _compute_payload_sha256( + snapshot, + include_adoption=include_adoption, + ) + meta: dict[str, object] = { + "generator": { + "name": generator_name, + "version": generator_version, + }, + "schema_version": schema_version, + "python_tag": python_tag, + "created_at": created_at, + "payload_sha256": payload_sha256, + } + payload: dict[str, object] = { + "meta": meta, + "metrics": _snapshot_payload( + snapshot, + include_adoption=include_adoption, + ), + } + if api_surface_snapshot is not None: + meta[_API_SURFACE_PAYLOAD_SHA256_KEY] = _compute_api_surface_payload_sha256( + api_surface_snapshot, + root=api_surface_root, + ) + payload["api_surface"] = _api_surface_snapshot_payload( + api_surface_snapshot, + root=api_surface_root, + ) + return payload + + +__all__ = [ + "snapshot_from_project_metrics", +] diff --git a/codeclone/baseline/_metrics_baseline_validation.py b/codeclone/baseline/_metrics_baseline_validation.py new file mode 100644 index 0000000..21f15a9 --- /dev/null +++ b/codeclone/baseline/_metrics_baseline_validation.py @@ -0,0 +1,648 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from json import JSONDecodeError +from pathlib import Path +from typing import Literal + +from ..cache.projection import runtime_filepath_from_wire +from ..contracts import BASELINE_SCHEMA_VERSION +from ..contracts.errors import BaselineValidationError +from ..models import ( + ApiParamSpec, + ApiSurfaceSnapshot, + MetricsSnapshot, + ModuleApiSurface, + PublicSymbol, +) +from ..utils.json_io import read_json_object as _read_json_object +from ..utils.json_io import ( + write_json_document_atomically as _write_json_document_atomically, +) +from ..utils.schema_validation import validate_top_level_structure +from ._metrics_baseline_contract import ( + _METRICS_PAYLOAD_SHA256_KEY, + _TOP_LEVEL_ALLOWED_KEYS, + _TOP_LEVEL_REQUIRED_KEYS, + MetricsBaselineStatus, +) +from ._metrics_baseline_payload import _compose_api_surface_qualname + +_HEALTH_GRADES = {"A", "B", "C", "D", "F"} +_API_PARAM_KINDS = {"pos_only", "pos_or_kw", "vararg", "kw_only", "kwarg"} +_PUBLIC_SYMBOL_KINDS = {"function", "class", "method", "constant"} +_EXPORTED_VIA_KINDS = {"all", "name"} + + +def _is_compatible_metrics_schema( + *, + baseline_version: str | None, + expected_version: str, +) -> bool: + if baseline_version is None: + return False + baseline_major_minor = _parse_major_minor(baseline_version) + expected_major_minor = _parse_major_minor(expected_version) + if baseline_major_minor is None or expected_major_minor is None: + return baseline_version == expected_version + baseline_major, baseline_minor = baseline_major_minor + expected_major, expected_minor = expected_major_minor + return baseline_major == expected_major and baseline_minor <= expected_minor + + +def _parse_major_minor(version: str) -> tuple[int, int] | None: + parts = version.split(".") + if len(parts) != 2 or not all(part.isdigit() for part in parts): + return None + return int(parts[0]), int(parts[1]) + + +def _atomic_write_json(path: Path, payload: dict[str, object]) -> None: + _write_json_document_atomically( + path, + payload, + indent=True, + trailing_newline=True, + ) + + +def _load_json_object(path: Path) -> dict[str, object]: + try: + return _read_json_object(path) + except OSError as e: + raise BaselineValidationError( + f"Cannot read metrics baseline file at {path}: {e}", + status=MetricsBaselineStatus.INVALID_JSON, + ) from e + except JSONDecodeError as e: + raise BaselineValidationError( + f"Corrupted metrics baseline file at {path}: {e}", + status=MetricsBaselineStatus.INVALID_JSON, + ) from e + except TypeError: + raise BaselineValidationError( + f"Metrics baseline payload must be an object at {path}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) from None + + +def _validate_top_level_structure(payload: dict[str, object], *, path: Path) -> None: + validate_top_level_structure( + payload, + path=path, + required_keys=_TOP_LEVEL_REQUIRED_KEYS, + allowed_keys=_TOP_LEVEL_ALLOWED_KEYS, + schema_label="metrics baseline", + missing_status=MetricsBaselineStatus.MISSING_FIELDS, + extra_status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _validate_required_keys( + payload: dict[str, object], + required: frozenset[str], + *, + path: Path, +) -> None: + missing = required - set(payload.keys()) + if missing: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: missing required fields: {', 
'.join(sorted(missing))}", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + + +def _validate_exact_keys( + payload: dict[str, object], + required: frozenset[str], + *, + path: Path, +) -> None: + extra = set(payload.keys()) - set(required) + if extra: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: unexpected fields: {', '.join(sorted(extra))}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_str(payload: dict[str, object], key: str, *, path: Path) -> str: + value = payload.get(key) + if isinstance(value, str): + return value + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _extract_metrics_payload_sha256( + payload: dict[str, object], + *, + path: Path, +) -> str: + direct = payload.get(_METRICS_PAYLOAD_SHA256_KEY) + if isinstance(direct, str): + return direct + return _require_str(payload, "payload_sha256", path=path) + + +def _extract_optional_payload_sha256( + payload: dict[str, object], + *, + key: str, +) -> str | None: + value = payload.get(key) + return value if isinstance(value, str) else None + + +def _require_int(payload: dict[str, object], key: str, *, path: Path) -> int: + value = payload.get(key) + if isinstance(value, bool): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be int", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if isinstance(value, int): + return value + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be int", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _optional_require_str( + payload: dict[str, object], + key: str, + *, + path: Path, +) -> str | None: + value = payload.get(key) + if value is None: + return None + if isinstance(value, str): + return value + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_str_list( + payload: dict[str, object], + key: str, + *, + path: Path, +) -> list[str]: + value = payload.get(key) + if not isinstance(value, list): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not all(isinstance(item, str) for item in value): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return value + + +def _parse_cycles( + payload: dict[str, object], + *, + key: str, + path: Path, +) -> tuple[tuple[str, ...], ...]: + value = payload.get(key) + if not isinstance(value, list): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be list", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + cycles: list[tuple[str, ...]] = [] + for cycle in value: + if not isinstance(cycle, list): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: {key!r} cycle item must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not all(isinstance(item, str) for item in cycle): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: {key!r} cycle item must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + cycles.append(tuple(cycle)) + return tuple(sorted(set(cycles))) + + +def _parse_generator( + meta: dict[str, object], + *, + path: 
Path, +) -> tuple[str, str | None]: + generator = meta.get("generator") + if isinstance(generator, str): + version_value = meta.get("generator_version") + if version_value is None: + version_value = meta.get("codeclone_version") + if version_value is None: + return generator, None + if not isinstance(version_value, str): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: generator_version must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return generator, version_value + + if isinstance(generator, dict): + allowed_keys = {"name", "version"} + extra = set(generator.keys()) - allowed_keys + if extra: + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + f"unexpected generator keys: {', '.join(sorted(extra))}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + name = generator.get("name") + version = generator.get("version") + if not isinstance(name, str): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: generator.name must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if version is not None and not isinstance(version, str): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: generator.version must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return name, version if isinstance(version, str) else None + + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: generator must be object or str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_embedded_clone_baseline_payload( + payload: dict[str, object], + *, + path: Path, +) -> tuple[dict[str, object], dict[str, object]]: + meta_obj = payload.get("meta") + clones_obj = payload.get("clones") + if not isinstance(meta_obj, dict): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'meta' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not isinstance(clones_obj, dict): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'clones' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + _require_str(meta_obj, "payload_sha256", path=path) + _require_str(meta_obj, "python_tag", path=path) + _require_str(meta_obj, "created_at", path=path) + functions = clones_obj.get("functions") + blocks = clones_obj.get("blocks") + if not isinstance(functions, list) or not all( + isinstance(item, str) for item in functions + ): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'clones.functions' must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not isinstance(blocks, list) or not all( + isinstance(item, str) for item in blocks + ): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'clones.blocks' must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return meta_obj, clones_obj + + +def _resolve_embedded_schema_version(meta: dict[str, object], *, path: Path) -> str: + raw_version = _require_str(meta, "schema_version", path=path) + parts = raw_version.split(".") + if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts): + raise BaselineValidationError( + "Invalid baseline schema at " + f"{path}: 'schema_version' must be semver string", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + major = int(parts[0]) + if major >= 2: + return raw_version + return BASELINE_SCHEMA_VERSION + + +def _parse_snapshot( + payload: dict[str, object], + *, + path: Path, +) -> MetricsSnapshot: + grade = 
_require_str(payload, "health_grade", path=path) + if grade not in {"A", "B", "C", "D", "F"}: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: 'health_grade' must be one of A/B/C/D/F", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + return MetricsSnapshot( + max_complexity=_require_int(payload, "max_complexity", path=path), + high_risk_functions=tuple( + sorted(set(_require_str_list(payload, "high_risk_functions", path=path))) + ), + max_coupling=_require_int(payload, "max_coupling", path=path), + high_coupling_classes=tuple( + sorted(set(_require_str_list(payload, "high_coupling_classes", path=path))) + ), + max_cohesion=_require_int(payload, "max_cohesion", path=path), + low_cohesion_classes=tuple( + sorted(set(_require_str_list(payload, "low_cohesion_classes", path=path))) + ), + dependency_cycles=_parse_cycles(payload, key="dependency_cycles", path=path), + dependency_max_depth=_require_int(payload, "dependency_max_depth", path=path), + dead_code_items=tuple( + sorted(set(_require_str_list(payload, "dead_code_items", path=path))) + ), + health_score=_require_int(payload, "health_score", path=path), + health_grade=_require_health_grade(grade, path=path), + typing_param_permille=_optional_int( + payload, + "typing_param_permille", + path=path, + ), + typing_return_permille=_optional_int( + payload, + "typing_return_permille", + path=path, + ), + docstring_permille=_optional_int(payload, "docstring_permille", path=path), + typing_any_count=_optional_int(payload, "typing_any_count", path=path), + ) + + +def _optional_int(payload: dict[str, object], key: str, *, path: Path) -> int: + value = payload.get(key) + if value is None: + return 0 + return _require_int(payload, key, path=path) + + +def _require_health_grade( + value: str, + *, + path: Path, +) -> Literal["A", "B", "C", "D", "F"]: + if value == "A": + return "A" + if value == "B": + return "B" + if value == "C": + return "C" + if value == "D": + return "D" + if value == "F": + return "F" + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "'health_grade' must be one of A/B/C/D/F", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_api_param_kind( + value: str, + *, + path: Path, +) -> Literal["pos_only", "pos_or_kw", "vararg", "kw_only", "kwarg"]: + if value == "pos_only": + return "pos_only" + if value == "pos_or_kw": + return "pos_or_kw" + if value == "vararg": + return "vararg" + if value == "kw_only": + return "kw_only" + if value == "kwarg": + return "kwarg" + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: api param 'kind' is invalid", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_public_symbol_kind( + value: str, + *, + path: Path, +) -> Literal["function", "class", "method", "constant"]: + if value == "function": + return "function" + if value == "class": + return "class" + if value == "method": + return "method" + if value == "constant": + return "constant" + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: public symbol 'kind' is invalid", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_exported_via( + value: str, + *, + path: Path, +) -> Literal["all", "name"]: + if value == "all": + return "all" + if value == "name": + return "name" + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "public symbol 'exported_via' is invalid", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def 
_parse_api_surface_snapshot( + payload: object, + *, + path: Path, + root: Path | None = None, +) -> ApiSurfaceSnapshot | None: + if payload is None: + return None + if not isinstance(payload, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: 'api_surface' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + raw_modules = payload.get("modules", []) + if not isinstance(raw_modules, list): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "'api_surface.modules' must be list", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + modules: list[ModuleApiSurface] = [] + for raw_module in raw_modules: + if not isinstance(raw_module, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "api surface module must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + module = _require_str(raw_module, "module", path=path) + wire_filepath = _require_str(raw_module, "filepath", path=path) + filepath = runtime_filepath_from_wire(wire_filepath, root=root) + all_declared = _require_str_list_or_none(raw_module, "all_declared", path=path) + raw_symbols = raw_module.get("symbols", []) + if not isinstance(raw_symbols, list): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "api surface symbols must be list", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + symbols: list[PublicSymbol] = [] + for raw_symbol in raw_symbols: + if not isinstance(raw_symbol, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "api surface symbol must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + local_name = _optional_require_str(raw_symbol, "local_name", path=path) + legacy_qualname = _optional_require_str(raw_symbol, "qualname", path=path) + if local_name is None and legacy_qualname is None: + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "api surface symbol requires 'local_name' or 'qualname'", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + qualname = ( + legacy_qualname + if local_name is None + else _compose_api_surface_qualname( + module=module, + local_name=local_name, + ) + ) + kind = _require_str(raw_symbol, "kind", path=path) + exported_via = _require_str(raw_symbol, "exported_via", path=path) + params_raw = raw_symbol.get("params", []) + if not isinstance(params_raw, list): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "api surface params must be list", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + params: list[ApiParamSpec] = [] + for raw_param in params_raw: + if not isinstance(raw_param, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "api param must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + name = _require_str(raw_param, "name", path=path) + param_kind = _require_str(raw_param, "kind", path=path) + has_default = raw_param.get("has_default") + annotation_hash = _optional_require_str( + raw_param, + "annotation_hash", + path=path, + ) + if not isinstance(has_default, bool): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + "api param 'has_default' must be bool", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + params.append( + ApiParamSpec( + name=name, + kind=_require_api_param_kind(param_kind, path=path), + has_default=has_default, + annotation_hash=annotation_hash or "", + ) + ) + symbols.append( + PublicSymbol( + 
qualname=qualname or "", + kind=_require_public_symbol_kind(kind, path=path), + start_line=_require_int(raw_symbol, "start_line", path=path), + end_line=_require_int(raw_symbol, "end_line", path=path), + params=tuple(params), + returns_hash=_optional_require_str( + raw_symbol, + "returns_hash", + path=path, + ) + or "", + exported_via=_require_exported_via(exported_via, path=path), + ) + ) + modules.append( + ModuleApiSurface( + module=module, + filepath=filepath, + symbols=tuple(sorted(symbols, key=lambda item: item.qualname)), + all_declared=tuple(all_declared) if all_declared is not None else None, + ) + ) + return ApiSurfaceSnapshot( + modules=tuple(sorted(modules, key=lambda item: (item.filepath, item.module))) + ) + + +def _require_str_list_or_none( + payload: dict[str, object], + key: str, + *, + path: Path, +) -> list[str] | None: + value = payload.get(key) + if value is None: + return None + return _require_str_list(payload, key, path=path) + + +__all__ = [ + "_atomic_write_json", + "_extract_metrics_payload_sha256", + "_is_compatible_metrics_schema", + "_load_json_object", + "_optional_require_str", + "_parse_api_surface_snapshot", + "_parse_cycles", + "_parse_generator", + "_parse_snapshot", + "_require_embedded_clone_baseline_payload", + "_require_int", + "_require_str", + "_require_str_list", + "_resolve_embedded_schema_version", + "_validate_exact_keys", + "_validate_required_keys", + "_validate_top_level_structure", +] diff --git a/codeclone/baseline.py b/codeclone/baseline/clone_baseline.py similarity index 53% rename from codeclone/baseline.py rename to codeclone/baseline/clone_baseline.py index c16c08c..7422232 100644 --- a/codeclone/baseline.py +++ b/codeclone/baseline/clone_baseline.py @@ -6,83 +6,26 @@ from __future__ import annotations -import hashlib import hmac import re -import sys -from datetime import datetime, timezone -from enum import Enum -from json import JSONDecodeError from pathlib import Path -from typing import TYPE_CHECKING, Any, Final +from typing import TYPE_CHECKING -import orjson - -from . import __version__ -from ._json_io import read_json_object as _read_json_object -from ._json_io import write_json_document_atomically as _write_json_document_atomically -from ._schema_validation import validate_top_level_structure -from .contracts import ( +from .. import __version__ +from ..contracts import ( BASELINE_FINGERPRINT_VERSION, BASELINE_SCHEMA_VERSION, ) -from .errors import BaselineValidationError - -if TYPE_CHECKING: - from collections.abc import Collection, Mapping - -# Any: baseline JSON parsing/serialization boundary. Values are validated -# and narrowed before entering compatibility/integrity checks. 
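Reviewer note: the rename below splits the old `codeclone/baseline.py` into a `codeclone/baseline/` package (`clone_baseline.py`, `diff.py`, `metrics_baseline.py`, `trust.py`). A usage sketch of the new import paths, assuming the package is importable exactly as the renamed files suggest (no `__init__` re-exports are shown in this diff, so submodule paths are used):

```python
from codeclone.baseline.clone_baseline import Baseline
from codeclone.baseline.diff import diff_clone_groups
from codeclone.baseline.trust import BaselineStatus, current_python_tag

# diff_clone_groups is the pure helper Baseline.diff now delegates to:
new_funcs, new_blocks = diff_clone_groups(
    known_functions={"id-a"},
    known_blocks=set(),
    func_groups={"id-a": object(), "id-b": object()},
    block_groups={},
)
assert new_funcs == {"id-b"} and new_blocks == set()
print(Baseline, BaselineStatus.OK.value, current_python_tag())
```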
- -BASELINE_GENERATOR = "codeclone" -_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR = {1: 0, 2: 1} -MAX_BASELINE_SIZE_BYTES = 5 * 1024 * 1024 - - -class BaselineStatus(str, Enum): - OK = "ok" - MISSING = "missing" - TOO_LARGE = "too_large" - INVALID_JSON = "invalid_json" - INVALID_TYPE = "invalid_type" - MISSING_FIELDS = "missing_fields" - MISMATCH_SCHEMA_VERSION = "mismatch_schema_version" - MISMATCH_FINGERPRINT_VERSION = "mismatch_fingerprint_version" - MISMATCH_PYTHON_VERSION = "mismatch_python_version" - GENERATOR_MISMATCH = "generator_mismatch" - INTEGRITY_MISSING = "integrity_missing" - INTEGRITY_FAILED = "integrity_failed" - - -BASELINE_UNTRUSTED_STATUSES: Final[frozenset[BaselineStatus]] = frozenset( - { - BaselineStatus.MISSING, - BaselineStatus.TOO_LARGE, - BaselineStatus.INVALID_JSON, - BaselineStatus.INVALID_TYPE, - BaselineStatus.MISSING_FIELDS, - BaselineStatus.MISMATCH_SCHEMA_VERSION, - BaselineStatus.MISMATCH_FINGERPRINT_VERSION, - BaselineStatus.MISMATCH_PYTHON_VERSION, - BaselineStatus.GENERATOR_MISMATCH, - BaselineStatus.INTEGRITY_MISSING, - BaselineStatus.INTEGRITY_FAILED, - } +from ..contracts.errors import BaselineValidationError +from ..utils.json_io import ( + write_json_document_atomically as _write_json_document_atomically, ) +from ..utils.schema_validation import validate_top_level_structure +from . import trust as _trust +from .diff import diff_clone_groups - -def coerce_baseline_status( - raw_status: str | BaselineStatus | None, -) -> BaselineStatus: - if isinstance(raw_status, BaselineStatus): - return raw_status - if isinstance(raw_status, str): - try: - return BaselineStatus(raw_status) - except ValueError: - return BaselineStatus.INVALID_TYPE - return BaselineStatus.INVALID_TYPE - +if TYPE_CHECKING: + from collections.abc import Mapping _TOP_LEVEL_REQUIRED_KEYS = {"meta", "clones"} _TOP_LEVEL_OPTIONAL_KEYS = {"metrics", "api_surface"} @@ -98,7 +41,6 @@ def coerce_baseline_status( _CLONES_REQUIRED_KEYS = {"functions", "blocks"} _FUNCTION_ID_RE = re.compile(r"^[0-9a-f]{40}\|(?:\d+-\d+|\d+\+)$") _BLOCK_ID_RE = re.compile(r"^[0-9a-f]{40}\|[0-9a-f]{40}\|[0-9a-f]{40}\|[0-9a-f]{40}$") -_UTC_ISO8601_Z_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$") class Baseline: @@ -131,44 +73,44 @@ def load( self, *, max_size_bytes: int | None = None, - preloaded_payload: dict[str, Any] | None = None, + preloaded_payload: dict[str, object] | None = None, ) -> None: try: exists = self.path.exists() except OSError as e: raise BaselineValidationError( f"Cannot stat baseline file at {self.path}: {e}", - status=BaselineStatus.INVALID_TYPE, + status=_trust.BaselineStatus.INVALID_TYPE, ) from e if not exists: return size_limit = ( - MAX_BASELINE_SIZE_BYTES if max_size_bytes is None else max_size_bytes + _trust.MAX_BASELINE_SIZE_BYTES if max_size_bytes is None else max_size_bytes ) - size = _safe_stat_size(self.path) + size = _trust._safe_stat_size(self.path) if size > size_limit: raise BaselineValidationError( "Baseline file is too large " f"({size} bytes, max {size_limit} bytes) at {self.path}. 
" "Increase --max-baseline-size-mb or regenerate baseline.", - status=BaselineStatus.TOO_LARGE, + status=_trust.BaselineStatus.TOO_LARGE, ) if preloaded_payload is None: - payload = _load_json_object(self.path) + payload = _trust._load_json_object(self.path) else: if not isinstance(preloaded_payload, dict): raise BaselineValidationError( f"Baseline payload must be an object at {self.path}", - status=BaselineStatus.INVALID_TYPE, + status=_trust.BaselineStatus.INVALID_TYPE, ) payload = preloaded_payload if _is_legacy_baseline_payload(payload): raise BaselineValidationError( "Baseline format is legacy (<=1.3.x) and must be regenerated. " "Please run --update-baseline.", - status=BaselineStatus.MISSING_FIELDS, + status=_trust.BaselineStatus.MISSING_FIELDS, ) _validate_top_level_structure(payload, path=self.path) @@ -178,21 +120,28 @@ def load( if not isinstance(meta_obj, dict): raise BaselineValidationError( f"Invalid baseline schema at {self.path}: 'meta' must be object", - status=BaselineStatus.INVALID_TYPE, + status=_trust.BaselineStatus.INVALID_TYPE, ) if not isinstance(clones_obj, dict): raise BaselineValidationError( f"Invalid baseline schema at {self.path}: 'clones' must be object", - status=BaselineStatus.INVALID_TYPE, + status=_trust.BaselineStatus.INVALID_TYPE, ) _validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path) _validate_required_keys(clones_obj, _CLONES_REQUIRED_KEYS, path=self.path) _validate_exact_clone_keys(clones_obj, path=self.path) - generator, generator_version = _parse_generator_meta(meta_obj, path=self.path) - schema_version = _require_semver_str(meta_obj, "schema_version", path=self.path) - schema_major, _, _ = _parse_semver( + generator, generator_version = _trust._parse_generator_meta( + meta_obj, + path=self.path, + ) + schema_version = _trust._require_semver_str( + meta_obj, + "schema_version", + path=self.path, + ) + schema_major, _, _ = _trust._parse_semver( schema_version, key="schema_version", path=self.path, @@ -201,22 +150,28 @@ def load( raise BaselineValidationError( f"Invalid baseline schema at {self.path}: " "top-level 'metrics' requires baseline schema >= 2.0.", - status=BaselineStatus.MISMATCH_SCHEMA_VERSION, + status=_trust.BaselineStatus.MISMATCH_SCHEMA_VERSION, ) - fingerprint_version = _require_str( - meta_obj, "fingerprint_version", path=self.path + fingerprint_version = _trust._require_str( + meta_obj, + "fingerprint_version", + path=self.path, ) - python_tag = _require_python_tag(meta_obj, "python_tag", path=self.path) - created_at = _require_utc_iso8601_z(meta_obj, "created_at", path=self.path) - payload_sha256 = _require_str(meta_obj, "payload_sha256", path=self.path) + python_tag = _trust._require_python_tag(meta_obj, "python_tag", path=self.path) + created_at = _trust._require_utc_iso8601_z( + meta_obj, + "created_at", + path=self.path, + ) + payload_sha256 = _trust._require_str(meta_obj, "payload_sha256", path=self.path) - function_ids = _require_sorted_unique_ids( + function_ids = _trust._require_sorted_unique_ids( clones_obj, "functions", pattern=_FUNCTION_ID_RE, path=self.path, ) - block_ids = _require_sorted_unique_ids( + block_ids = _trust._require_sorted_unique_ids( clones_obj, "blocks", pattern=_BLOCK_ID_RE, @@ -298,60 +253,63 @@ def save(self) -> None: self.payload_sha256 = payload_sha256 def verify_compatibility(self, *, current_python_tag: str) -> None: - if self.generator != BASELINE_GENERATOR: + if self.generator != _trust.BASELINE_GENERATOR: raise BaselineValidationError( "Baseline generator mismatch: expected 
'codeclone'.", - status=BaselineStatus.GENERATOR_MISMATCH, + status=_trust.BaselineStatus.GENERATOR_MISMATCH, ) if self.schema_version is None: raise BaselineValidationError( "Baseline schema version is missing.", - status=BaselineStatus.MISSING_FIELDS, + status=_trust.BaselineStatus.MISSING_FIELDS, ) if self.fingerprint_version is None: raise BaselineValidationError( "Baseline fingerprint version is missing.", - status=BaselineStatus.MISSING_FIELDS, + status=_trust.BaselineStatus.MISSING_FIELDS, ) if self.python_tag is None: raise BaselineValidationError( "Baseline python_tag is missing.", - status=BaselineStatus.MISSING_FIELDS, + status=_trust.BaselineStatus.MISSING_FIELDS, ) - schema_major, schema_minor, _ = _parse_semver( - self.schema_version, key="schema_version", path=self.path + schema_major, schema_minor, _ = _trust._parse_semver( + self.schema_version, + key="schema_version", + path=self.path, ) - max_minor = _BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR.get(schema_major) + max_minor = _trust._BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR.get(schema_major) if max_minor is None: supported = ",".join( - str(major) for major in sorted(_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR) + str(major) + for major in sorted(_trust._BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR) ) raise BaselineValidationError( "Baseline schema version mismatch: " f"baseline={self.schema_version}, " f"supported_majors={supported}.", - status=BaselineStatus.MISMATCH_SCHEMA_VERSION, + status=_trust.BaselineStatus.MISMATCH_SCHEMA_VERSION, ) if schema_minor > max_minor: raise BaselineValidationError( "Baseline schema version is newer than supported: " f"baseline={self.schema_version}, " f"max={schema_major}.{max_minor}.", - status=BaselineStatus.MISMATCH_SCHEMA_VERSION, + status=_trust.BaselineStatus.MISMATCH_SCHEMA_VERSION, ) if self.fingerprint_version != BASELINE_FINGERPRINT_VERSION: raise BaselineValidationError( "Baseline fingerprint version mismatch: " f"baseline={self.fingerprint_version}, " f"expected={BASELINE_FINGERPRINT_VERSION}.", - status=BaselineStatus.MISMATCH_FINGERPRINT_VERSION, + status=_trust.BaselineStatus.MISMATCH_FINGERPRINT_VERSION, ) if self.python_tag != current_python_tag: raise BaselineValidationError( "Baseline python tag mismatch: " f"baseline={self.python_tag}, current={current_python_tag}.", - status=BaselineStatus.MISMATCH_PYTHON_VERSION, + status=_trust.BaselineStatus.MISMATCH_PYTHON_VERSION, ) self.verify_integrity() @@ -359,36 +317,36 @@ def verify_integrity(self) -> None: if not isinstance(self.payload_sha256, str): raise BaselineValidationError( "Baseline integrity payload hash is missing.", - status=BaselineStatus.INTEGRITY_MISSING, + status=_trust.BaselineStatus.INTEGRITY_MISSING, ) if len(self.payload_sha256) != 64: raise BaselineValidationError( "Baseline integrity payload hash is missing.", - status=BaselineStatus.INTEGRITY_MISSING, + status=_trust.BaselineStatus.INTEGRITY_MISSING, ) try: int(self.payload_sha256, 16) except ValueError as e: raise BaselineValidationError( "Baseline integrity payload hash is missing.", - status=BaselineStatus.INTEGRITY_MISSING, + status=_trust.BaselineStatus.INTEGRITY_MISSING, ) from e if self.schema_version is None: raise BaselineValidationError( "Baseline schema version is missing for integrity validation.", - status=BaselineStatus.MISSING_FIELDS, + status=_trust.BaselineStatus.MISSING_FIELDS, ) if self.fingerprint_version is None: raise BaselineValidationError( "Baseline fingerprint version is missing for integrity validation.", - status=BaselineStatus.MISSING_FIELDS, + 
status=_trust.BaselineStatus.MISSING_FIELDS, ) if self.python_tag is None: raise BaselineValidationError( "Baseline python_tag is missing for integrity validation.", - status=BaselineStatus.MISSING_FIELDS, + status=_trust.BaselineStatus.MISSING_FIELDS, ) - expected = _compute_payload_sha256( + expected = _trust._compute_payload_sha256( functions=self.functions, blocks=self.blocks, fingerprint_version=self.fingerprint_version, @@ -397,7 +355,7 @@ def verify_integrity(self) -> None: if not hmac.compare_digest(self.payload_sha256, expected): raise BaselineValidationError( "Baseline integrity check failed: payload_sha256 mismatch.", - status=BaselineStatus.INTEGRITY_FAILED, + status=_trust.BaselineStatus.INTEGRITY_FAILED, ) @staticmethod @@ -413,24 +371,27 @@ def from_groups( baseline = Baseline(path) baseline.functions = set(func_groups.keys()) baseline.blocks = set(block_groups.keys()) - baseline.generator = BASELINE_GENERATOR + baseline.generator = _trust.BASELINE_GENERATOR baseline.schema_version = schema_version or BASELINE_SCHEMA_VERSION baseline.fingerprint_version = ( fingerprint_version or BASELINE_FINGERPRINT_VERSION ) - baseline.python_tag = python_tag or current_python_tag() + baseline.python_tag = python_tag or _trust.current_python_tag() baseline.generator_version = generator_version or __version__ return baseline def diff( self, func_groups: Mapping[str, object], block_groups: Mapping[str, object] ) -> tuple[set[str], set[str]]: - new_funcs = set(func_groups.keys()) - self.functions - new_blocks = set(block_groups.keys()) - self.blocks - return new_funcs, new_blocks + return diff_clone_groups( + known_functions=self.functions, + known_blocks=self.blocks, + func_groups=func_groups, + block_groups=block_groups, + ) -def _atomic_write_json(path: Path, payload: dict[str, Any]) -> None: +def _atomic_write_json(path: Path, payload: dict[str, object]) -> None: _write_json_document_atomically( path, payload, @@ -439,80 +400,55 @@ def _atomic_write_json(path: Path, payload: dict[str, Any]) -> None: ) -def _safe_stat_size(path: Path) -> int: - try: - return path.stat().st_size - except OSError as e: - raise BaselineValidationError( - f"Cannot stat baseline file at {path}: {e}", - status=BaselineStatus.INVALID_TYPE, - ) from e - - -def _load_json_object(path: Path) -> dict[str, Any]: - try: - return _read_json_object(path) - except OSError as e: - raise BaselineValidationError( - f"Cannot read baseline file at {path}: {e}", - status=BaselineStatus.INVALID_JSON, - ) from e - except JSONDecodeError as e: - raise BaselineValidationError( - f"Corrupted baseline file at {path}: {e}", - status=BaselineStatus.INVALID_JSON, - ) from e - except TypeError: - raise BaselineValidationError( - f"Baseline payload must be an object at {path}", - status=BaselineStatus.INVALID_TYPE, - ) from None - - -def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None: +def _validate_top_level_structure(payload: dict[str, object], *, path: Path) -> None: validate_top_level_structure( payload, path=path, required_keys=_TOP_LEVEL_REQUIRED_KEYS, allowed_keys=_TOP_LEVEL_ALLOWED_KEYS, schema_label="baseline", - missing_status=BaselineStatus.MISSING_FIELDS, - extra_status=BaselineStatus.INVALID_TYPE, + missing_status=_trust.BaselineStatus.MISSING_FIELDS, + extra_status=_trust.BaselineStatus.INVALID_TYPE, ) def _validate_required_keys( - obj: dict[str, Any], required: set[str], *, path: Path + obj: dict[str, object], required: set[str], *, path: Path ) -> None: missing = required - set(obj.keys()) 
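Reviewer note: `_validate_required_keys` here uses a set difference so that every absent field is reported at once, sorted, rather than failing on the first one. A dependency-free sketch of the same pattern:

```python
# Sketch of the required-key check: compute missing keys with a set
# difference and report them all in one actionable message.
def validate_required_keys(obj: dict[str, object], required: set[str]) -> None:
    missing = required - set(obj)
    if missing:
        raise ValueError(f"missing required fields: {', '.join(sorted(missing))}")

validate_required_keys({"meta": {}, "clones": {}}, {"meta", "clones"})  # passes
try:
    validate_required_keys({"meta": {}}, {"meta", "clones"})
except ValueError as e:
    print(e)  # missing required fields: clones
```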
if missing: raise BaselineValidationError( f"Invalid baseline schema at {path}: missing required fields: " f"{', '.join(sorted(missing))}", - status=BaselineStatus.MISSING_FIELDS, + status=_trust.BaselineStatus.MISSING_FIELDS, ) -def _validate_exact_clone_keys(clones: dict[str, Any], *, path: Path) -> None: +def _validate_exact_clone_keys(clones: dict[str, object], *, path: Path) -> None: keys = set(clones.keys()) extra = keys - _CLONES_REQUIRED_KEYS if extra: raise BaselineValidationError( f"Invalid baseline schema at {path}: unexpected clone keys: " f"{', '.join(sorted(extra))}", - status=BaselineStatus.INVALID_TYPE, + status=_trust.BaselineStatus.INVALID_TYPE, ) -def _is_legacy_baseline_payload(payload: dict[str, Any]) -> bool: +def _is_legacy_baseline_payload(payload: dict[str, object]) -> bool: return "functions" in payload and "blocks" in payload def _preserve_embedded_metrics( path: Path, -) -> tuple[dict[str, Any] | None, str | None, dict[str, Any] | None, str | None]: +) -> tuple[ + dict[str, object] | None, + str | None, + dict[str, object] | None, + str | None, +]: try: - payload = _load_json_object(path) + payload = _trust._load_json_object(path) except BaselineValidationError: return None, None, None, None metrics_obj = payload.get("metrics") @@ -545,45 +481,6 @@ def _preserve_embedded_metrics( ) -def _parse_generator_meta( - meta_obj: dict[str, Any], *, path: Path -) -> tuple[str, str | None]: - raw_generator = meta_obj.get("generator") - - if isinstance(raw_generator, str): - generator_version = _optional_str(meta_obj, "generator_version", path=path) - if generator_version is None: - # Legacy alias for baselines produced before generator_version rename. - generator_version = _optional_str(meta_obj, "codeclone_version", path=path) - return raw_generator, generator_version - - if isinstance(raw_generator, dict): - allowed_keys = {"name", "version"} - extra = set(raw_generator.keys()) - allowed_keys - if extra: - raise BaselineValidationError( - f"Invalid baseline schema at {path}: unexpected generator keys: " - f"{', '.join(sorted(extra))}", - status=BaselineStatus.INVALID_TYPE, - ) - generator_name = _require_str(raw_generator, "name", path=path) - generator_version = _optional_str(raw_generator, "version", path=path) - - if generator_version is None: - generator_version = _optional_str(meta_obj, "generator_version", path=path) - if generator_version is None: - generator_version = _optional_str( - meta_obj, "codeclone_version", path=path - ) - - return generator_name, generator_version - - raise BaselineValidationError( - f"Invalid baseline schema at {path}: 'generator' must be string or object", - status=BaselineStatus.INVALID_TYPE, - ) - - def _baseline_payload( *, functions: set[str], @@ -594,17 +491,17 @@ def _baseline_payload( python_tag: str | None, generator_version: str | None, created_at: str | None, -) -> dict[str, Any]: - resolved_generator = generator or BASELINE_GENERATOR +) -> dict[str, object]: + resolved_generator = generator or _trust.BASELINE_GENERATOR resolved_schema = schema_version or BASELINE_SCHEMA_VERSION resolved_fingerprint = fingerprint_version or BASELINE_FINGERPRINT_VERSION - resolved_python_tag = python_tag or current_python_tag() + resolved_python_tag = python_tag or _trust.current_python_tag() resolved_generator_version = generator_version or __version__ - resolved_created_at = created_at or _utc_now_z() + resolved_created_at = created_at or _trust._utc_now_z() sorted_functions = sorted(functions) sorted_blocks = sorted(blocks) - payload_sha256 
= _compute_payload_sha256( + payload_sha256 = _trust._compute_payload_sha256( functions=sorted_functions, blocks=sorted_blocks, fingerprint_version=resolved_fingerprint, @@ -630,138 +527,11 @@ def _baseline_payload( } -def _compute_payload_sha256( - *, - functions: Collection[str], - blocks: Collection[str], - fingerprint_version: str, - python_tag: str, -) -> str: - canonical = { - "blocks": sorted(blocks), - "fingerprint_version": fingerprint_version, - "functions": sorted(functions), - "python_tag": python_tag, - } - serialized = orjson.dumps(canonical, option=orjson.OPT_SORT_KEYS) - return hashlib.sha256(serialized).hexdigest() - - -def current_python_tag() -> str: - """Return the interpreter compatibility tag as an immutable string.""" - impl = sys.implementation.name - major, minor = sys.version_info[:2] - prefix = "cp" if impl == "cpython" else impl[:2] - return f"{prefix}{major}{minor}" - - -def _utc_now_z() -> str: - return ( - datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ") - ) - - -def _require_str(obj: dict[str, Any], key: str, *, path: Path) -> str: - value = obj.get(key) - if not isinstance(value, str): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be string", - status=BaselineStatus.INVALID_TYPE, - ) - return value - - -def _optional_str(obj: dict[str, Any], key: str, *, path: Path) -> str | None: - value = obj.get(key) - if value is None: - return None - if not isinstance(value, str): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be string", - status=BaselineStatus.INVALID_TYPE, - ) - return value - - -def _require_semver_str(obj: dict[str, Any], key: str, *, path: Path) -> str: - value = _require_str(obj, key, path=path) - _parse_semver(value, key=key, path=path) - return value - - -def _parse_semver(value: str, *, key: str, path: Path) -> tuple[int, int, int]: - parts = value.split(".") - if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be semver string", - status=BaselineStatus.INVALID_TYPE, - ) - if len(parts) == 2: - major, minor = int(parts[0]), int(parts[1]) - patch = 0 - else: - major, minor, patch = int(parts[0]), int(parts[1]), int(parts[2]) - return major, minor, patch - - -def _require_python_tag(obj: dict[str, Any], key: str, *, path: Path) -> str: - value = _require_str(obj, key, path=path) - if not re.fullmatch(r"[a-z]{2}\d{2,3}", value): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must look like 'cp313'", - status=BaselineStatus.INVALID_TYPE, - ) - return value - - -def _require_utc_iso8601_z(obj: dict[str, Any], key: str, *, path: Path) -> str: - value = _require_str(obj, key, path=path) - if not _UTC_ISO8601_Z_RE.fullmatch(value): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z", - status=BaselineStatus.INVALID_TYPE, - ) - try: - datetime( - int(value[0:4]), - int(value[5:7]), - int(value[8:10]), - int(value[11:13]), - int(value[14:16]), - int(value[17:19]), - tzinfo=timezone.utc, - ) - except ValueError as e: - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z", - status=BaselineStatus.INVALID_TYPE, - ) from e - return value - - -def _require_sorted_unique_ids( - obj: dict[str, Any], key: str, *, pattern: re.Pattern[str], path: Path -) -> list[str]: - value = obj.get(key) - if not 
isinstance(value, list): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be list[str]", - status=BaselineStatus.INVALID_TYPE, - ) - if not all(isinstance(item, str) for item in value): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be list[str]", - status=BaselineStatus.INVALID_TYPE, - ) - values = list(value) - if values != sorted(values) or len(values) != len(set(values)): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be sorted and unique", - status=BaselineStatus.INVALID_TYPE, - ) - if not all(pattern.fullmatch(item) for item in values): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' has invalid id format", - status=BaselineStatus.INVALID_TYPE, - ) - return values +__all__ = [ + "_BLOCK_ID_RE", + "_FUNCTION_ID_RE", + "Baseline", + "_atomic_write_json", + "_baseline_payload", + "_preserve_embedded_metrics", +] diff --git a/codeclone/baseline/diff.py b/codeclone/baseline/diff.py new file mode 100644 index 0000000..8c6ca2c --- /dev/null +++ b/codeclone/baseline/diff.py @@ -0,0 +1,111 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Mapping, Set + +from ..metrics.api_surface import compare_api_surfaces +from ..models import ( + ApiBreakingChange, + ApiSurfaceSnapshot, + MetricsDiff, + MetricsSnapshot, +) + + +def diff_clone_groups( + *, + known_functions: Set[str], + known_blocks: Set[str], + func_groups: Mapping[str, object], + block_groups: Mapping[str, object], +) -> tuple[set[str], set[str]]: + new_funcs = set(func_groups.keys()) - known_functions + new_blocks = set(block_groups.keys()) - known_blocks + return new_funcs, new_blocks + + +def diff_metrics( + *, + baseline_snapshot: MetricsSnapshot | None, + current_snapshot: MetricsSnapshot, + baseline_api_surface: ApiSurfaceSnapshot | None, + current_api_surface: ApiSurfaceSnapshot | None, +) -> MetricsDiff: + snapshot = baseline_snapshot or MetricsSnapshot( + max_complexity=0, + high_risk_functions=(), + max_coupling=0, + high_coupling_classes=(), + max_cohesion=0, + low_cohesion_classes=(), + dependency_cycles=(), + dependency_max_depth=0, + dead_code_items=(), + health_score=0, + health_grade="F", + typing_param_permille=0, + typing_return_permille=0, + docstring_permille=0, + typing_any_count=0, + ) + + new_high_risk_functions = tuple( + sorted( + set(current_snapshot.high_risk_functions) + - set(snapshot.high_risk_functions) + ) + ) + new_high_coupling_classes = tuple( + sorted( + set(current_snapshot.high_coupling_classes) + - set(snapshot.high_coupling_classes) + ) + ) + new_cycles = tuple( + sorted( + set(current_snapshot.dependency_cycles) - set(snapshot.dependency_cycles) + ) + ) + new_dead_code = tuple( + sorted(set(current_snapshot.dead_code_items) - set(snapshot.dead_code_items)) + ) + + if baseline_api_surface is None: + added_api_symbols: tuple[str, ...] = () + api_breaking_changes: tuple[ApiBreakingChange, ...] 
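Reviewer note: `diff_metrics` above follows one pattern throughout: regressions are the set difference "current minus baseline", emitted sorted for determinism, and scores are plain integer deltas. A tiny sketch with illustrative names:

```python
# Sketch of the diff_metrics pattern: sorted set difference for new
# findings, signed integer delta for scores. Values are illustrative.
def new_items(baseline: set[str], current: set[str]) -> tuple[str, ...]:
    return tuple(sorted(current - baseline))

baseline_high_risk = {"pkg.mod.slow_fn"}
current_high_risk = {"pkg.mod.slow_fn", "pkg.mod.big_fn"}
print(new_items(baseline_high_risk, current_high_risk))  # ('pkg.mod.big_fn',)

health_delta = 87 - 90  # current minus baseline; negative means regression
print(health_delta)  # -3
```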
= () + else: + added_api_symbols, api_breaking_changes = compare_api_surfaces( + baseline=baseline_api_surface, + current=current_api_surface, + strict_types=False, + ) + + return MetricsDiff( + new_high_risk_functions=new_high_risk_functions, + new_high_coupling_classes=new_high_coupling_classes, + new_cycles=new_cycles, + new_dead_code=new_dead_code, + health_delta=current_snapshot.health_score - snapshot.health_score, + typing_param_permille_delta=( + current_snapshot.typing_param_permille - snapshot.typing_param_permille + ), + typing_return_permille_delta=( + current_snapshot.typing_return_permille - snapshot.typing_return_permille + ), + docstring_permille_delta=( + current_snapshot.docstring_permille - snapshot.docstring_permille + ), + new_api_symbols=added_api_symbols, + new_api_breaking_changes=api_breaking_changes, + ) + + +__all__ = ["diff_clone_groups", "diff_metrics"] diff --git a/codeclone/baseline/metrics_baseline.py b/codeclone/baseline/metrics_baseline.py new file mode 100644 index 0000000..2aecf3c --- /dev/null +++ b/codeclone/baseline/metrics_baseline.py @@ -0,0 +1,497 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import hmac +from dataclasses import dataclass +from datetime import datetime, timezone +from json import JSONDecodeError +from pathlib import Path + +import orjson + +from .. import __version__ +from ..contracts import BASELINE_SCHEMA_VERSION, METRICS_BASELINE_SCHEMA_VERSION +from ..contracts.errors import BaselineValidationError +from ..models import ApiSurfaceSnapshot, MetricsDiff, MetricsSnapshot, ProjectMetrics +from ._metrics_baseline_contract import ( + _API_SURFACE_PAYLOAD_SHA256_KEY, + _META_REQUIRED_KEYS, + _METRICS_OPTIONAL_KEYS, + _METRICS_PAYLOAD_SHA256_KEY, + _METRICS_REQUIRED_KEYS, + MAX_METRICS_BASELINE_SIZE_BYTES, + METRICS_BASELINE_GENERATOR, + METRICS_BASELINE_UNTRUSTED_STATUSES, + MetricsBaselineStatus, + coerce_metrics_baseline_status, +) +from ._metrics_baseline_payload import ( + _build_payload, + _compute_api_surface_payload_sha256, + _compute_legacy_api_surface_payload_sha256, + _compute_payload_sha256, + _has_coverage_adoption_snapshot, + snapshot_from_project_metrics, +) +from ._metrics_baseline_validation import ( + _atomic_write_json, + _extract_metrics_payload_sha256, + _extract_optional_payload_sha256, + _is_compatible_metrics_schema, + _load_json_object, + _optional_require_str, + _parse_api_surface_snapshot, + _parse_generator, + _parse_snapshot, + _require_embedded_clone_baseline_payload, + _require_str, + _resolve_embedded_schema_version, + _validate_exact_keys, + _validate_required_keys, + _validate_top_level_structure, +) +from .diff import diff_metrics +from .trust import current_python_tag + + +@dataclass(frozen=True, slots=True) +class MetricsBaselineSectionProbe: + has_metrics_section: bool + payload: dict[str, object] | None + + +def _now_utc_z() -> str: + return ( + datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace("+00:00", "Z") + ) + + +def probe_metrics_baseline_section(path: Path) -> MetricsBaselineSectionProbe: + if not path.exists(): + return MetricsBaselineSectionProbe( + has_metrics_section=False, + payload=None, + ) + try: + raw_payload = orjson.loads(path.read_text("utf-8")) + except (OSError, 
JSONDecodeError): + return MetricsBaselineSectionProbe( + has_metrics_section=True, + payload=None, + ) + if not isinstance(raw_payload, dict): + return MetricsBaselineSectionProbe( + has_metrics_section=True, + payload=None, + ) + payload = dict(raw_payload) + return MetricsBaselineSectionProbe( + has_metrics_section=("metrics" in payload), + payload=payload, + ) + + +class MetricsBaseline: + __slots__ = ( + "api_surface_payload_sha256", + "api_surface_snapshot", + "created_at", + "generator_name", + "generator_version", + "has_coverage_adoption_snapshot", + "is_embedded_in_clone_baseline", + "path", + "payload_sha256", + "python_tag", + "schema_version", + "snapshot", + ) + + def __init__(self, path: str | Path) -> None: + self.path = Path(path) + self.generator_name: str | None = None + self.generator_version: str | None = None + self.schema_version: str | None = None + self.python_tag: str | None = None + self.created_at: str | None = None + self.payload_sha256: str | None = None + self.snapshot: MetricsSnapshot | None = None + self.has_coverage_adoption_snapshot = False + self.api_surface_payload_sha256: str | None = None + self.api_surface_snapshot: ApiSurfaceSnapshot | None = None + self.is_embedded_in_clone_baseline = False + + def load( + self, + *, + max_size_bytes: int | None = None, + preloaded_payload: dict[str, object] | None = None, + ) -> None: + try: + exists = self.path.exists() + except OSError as e: + raise BaselineValidationError( + f"Cannot stat metrics baseline file at {self.path}: {e}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) from e + if not exists: + return + + size_limit = ( + MAX_METRICS_BASELINE_SIZE_BYTES + if max_size_bytes is None + else max_size_bytes + ) + try: + file_size = self.path.stat().st_size + except OSError as e: + raise BaselineValidationError( + f"Cannot stat metrics baseline file at {self.path}: {e}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) from e + if file_size > size_limit: + raise BaselineValidationError( + "Metrics baseline file is too large " + f"({file_size} bytes, max {size_limit} bytes) at {self.path}.", + status=MetricsBaselineStatus.TOO_LARGE, + ) + + if preloaded_payload is None: + payload = _load_json_object(self.path) + else: + if not isinstance(preloaded_payload, dict): + raise BaselineValidationError( + f"Metrics baseline payload must be an object at {self.path}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + payload = preloaded_payload + + _validate_top_level_structure(payload, path=self.path) + self.is_embedded_in_clone_baseline = "clones" in payload + + meta_obj = payload.get("meta") + metrics_obj = payload.get("metrics") + if not isinstance(meta_obj, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {self.path}: " + "'meta' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not isinstance(metrics_obj, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {self.path}: " + "'metrics' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + _validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path) + _validate_required_keys(metrics_obj, _METRICS_REQUIRED_KEYS, path=self.path) + _validate_exact_keys( + metrics_obj, + _METRICS_REQUIRED_KEYS | _METRICS_OPTIONAL_KEYS, + path=self.path, + ) + + generator_name, generator_version = _parse_generator(meta_obj, path=self.path) + self.generator_name = generator_name + self.generator_version = generator_version + self.schema_version = _require_str(meta_obj, 
"schema_version", path=self.path) + self.python_tag = _require_str(meta_obj, "python_tag", path=self.path) + self.created_at = _require_str(meta_obj, "created_at", path=self.path) + self.payload_sha256 = _extract_metrics_payload_sha256( + meta_obj, + path=self.path, + ) + self.api_surface_payload_sha256 = _extract_optional_payload_sha256( + meta_obj, + key=_API_SURFACE_PAYLOAD_SHA256_KEY, + ) + self.snapshot = _parse_snapshot(metrics_obj, path=self.path) + self.has_coverage_adoption_snapshot = _has_coverage_adoption_snapshot( + metrics_obj + ) + self.api_surface_snapshot = _parse_api_surface_snapshot( + payload.get("api_surface"), + path=self.path, + root=self.path.parent, + ) + + def save(self) -> None: + if self.snapshot is None: + raise BaselineValidationError( + "Metrics baseline snapshot is missing.", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + + payload = _build_payload( + snapshot=self.snapshot, + schema_version=self.schema_version or METRICS_BASELINE_SCHEMA_VERSION, + python_tag=self.python_tag or current_python_tag(), + generator_name=self.generator_name or METRICS_BASELINE_GENERATOR, + generator_version=self.generator_version or __version__, + created_at=self.created_at or _now_utc_z(), + include_adoption=self.has_coverage_adoption_snapshot, + api_surface_snapshot=self.api_surface_snapshot, + api_surface_root=self.path.parent, + ) + payload_meta = payload.get("meta") + if not isinstance(payload_meta, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {self.path}: " + "'meta' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + payload_metrics_hash = _require_str( + payload_meta, + "payload_sha256", + path=self.path, + ) + payload_api_surface_hash = _optional_require_str( + payload_meta, + _API_SURFACE_PAYLOAD_SHA256_KEY, + path=self.path, + ) + + existing: dict[str, object] | None = None + try: + if self.path.exists(): + loaded = _load_json_object(self.path) + if "clones" in loaded: + existing = loaded + except BaselineValidationError as e: + raise BaselineValidationError( + f"Cannot read existing baseline file at {self.path}: {e}", + status=MetricsBaselineStatus.INVALID_JSON, + ) from e + + if existing is not None: + existing_meta, clones_obj = _require_embedded_clone_baseline_payload( + existing, + path=self.path, + ) + merged_schema_version = _resolve_embedded_schema_version( + existing_meta, + path=self.path, + ) + merged_meta = dict(existing_meta) + merged_meta["schema_version"] = merged_schema_version + merged_meta[_METRICS_PAYLOAD_SHA256_KEY] = payload_metrics_hash + if payload_api_surface_hash is None: + merged_meta.pop(_API_SURFACE_PAYLOAD_SHA256_KEY, None) + else: + merged_meta[_API_SURFACE_PAYLOAD_SHA256_KEY] = payload_api_surface_hash + merged_payload: dict[str, object] = { + "meta": merged_meta, + "clones": clones_obj, + "metrics": payload["metrics"], + } + api_surface_payload = payload.get("api_surface") + if api_surface_payload is not None: + merged_payload["api_surface"] = api_surface_payload + self.path.parent.mkdir(parents=True, exist_ok=True) + _atomic_write_json(self.path, merged_payload) + self.is_embedded_in_clone_baseline = True + self.schema_version = merged_schema_version + self.python_tag = _require_str(merged_meta, "python_tag", path=self.path) + self.created_at = _require_str(merged_meta, "created_at", path=self.path) + self.payload_sha256 = _require_str( + merged_meta, + _METRICS_PAYLOAD_SHA256_KEY, + path=self.path, + ) + self.api_surface_payload_sha256 = _optional_require_str( + merged_meta, + 
_API_SURFACE_PAYLOAD_SHA256_KEY, + path=self.path, + ) + self.generator_name, self.generator_version = _parse_generator( + merged_meta, + path=self.path, + ) + return + + self.path.parent.mkdir(parents=True, exist_ok=True) + _atomic_write_json(self.path, payload) + self.is_embedded_in_clone_baseline = False + self.schema_version = _require_str( + payload_meta, + "schema_version", + path=self.path, + ) + self.python_tag = _require_str( + payload_meta, + "python_tag", + path=self.path, + ) + self.created_at = _require_str( + payload_meta, + "created_at", + path=self.path, + ) + self.payload_sha256 = payload_metrics_hash + self.api_surface_payload_sha256 = payload_api_surface_hash + + def verify_compatibility(self, *, runtime_python_tag: str) -> None: + if self.generator_name != METRICS_BASELINE_GENERATOR: + raise BaselineValidationError( + "Metrics baseline generator mismatch: expected 'codeclone'.", + status=MetricsBaselineStatus.GENERATOR_MISMATCH, + ) + expected_schema = ( + BASELINE_SCHEMA_VERSION + if self.is_embedded_in_clone_baseline + else METRICS_BASELINE_SCHEMA_VERSION + ) + if not _is_compatible_metrics_schema( + baseline_version=self.schema_version, + expected_version=expected_schema, + ): + raise BaselineValidationError( + "Metrics baseline schema version mismatch: " + f"baseline={self.schema_version}, " + f"expected={expected_schema}.", + status=MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION, + ) + if self.python_tag != runtime_python_tag: + raise BaselineValidationError( + "Metrics baseline python tag mismatch: " + f"baseline={self.python_tag}, current={runtime_python_tag}.", + status=MetricsBaselineStatus.MISMATCH_PYTHON_VERSION, + ) + self.verify_integrity() + + def verify_integrity(self) -> None: + if self.snapshot is None: + raise BaselineValidationError( + "Metrics baseline snapshot is missing.", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + if not isinstance(self.payload_sha256, str) or len(self.payload_sha256) != 64: + raise BaselineValidationError( + "Metrics baseline integrity payload hash is missing.", + status=MetricsBaselineStatus.INTEGRITY_MISSING, + ) + + expected = _compute_payload_sha256( + self.snapshot, + include_adoption=self.has_coverage_adoption_snapshot, + ) + if not hmac.compare_digest(self.payload_sha256, expected): + raise BaselineValidationError( + "Metrics baseline integrity check failed: payload_sha256 mismatch.", + status=MetricsBaselineStatus.INTEGRITY_FAILED, + ) + + if self.api_surface_snapshot is None: + return + if ( + not isinstance(self.api_surface_payload_sha256, str) + or len(self.api_surface_payload_sha256) != 64 + ): + raise BaselineValidationError( + "Metrics baseline API surface integrity payload hash is missing.", + status=MetricsBaselineStatus.INTEGRITY_MISSING, + ) + + expected_api = _compute_api_surface_payload_sha256( + self.api_surface_snapshot, + root=self.path.parent, + ) + legacy_absolute_expected_api = _compute_api_surface_payload_sha256( + self.api_surface_snapshot + ) + legacy_expected_api = _compute_legacy_api_surface_payload_sha256( + self.api_surface_snapshot, + root=self.path.parent, + ) + legacy_absolute_qualname_expected_api = ( + _compute_legacy_api_surface_payload_sha256(self.api_surface_snapshot) + ) + if not ( + hmac.compare_digest(self.api_surface_payload_sha256, expected_api) + or hmac.compare_digest( + self.api_surface_payload_sha256, + legacy_absolute_expected_api, + ) + or hmac.compare_digest( + self.api_surface_payload_sha256, + legacy_expected_api, + ) + or hmac.compare_digest( + 
self.api_surface_payload_sha256, + legacy_absolute_qualname_expected_api, + ) + ): + raise BaselineValidationError( + "Metrics baseline integrity check failed: " + "api_surface payload_sha256 mismatch.", + status=MetricsBaselineStatus.INTEGRITY_FAILED, + ) + + @staticmethod + def from_project_metrics( + *, + project_metrics: ProjectMetrics, + path: str | Path, + schema_version: str | None = None, + python_tag: str | None = None, + generator_version: str | None = None, + include_adoption: bool = True, + include_api_surface: bool = True, + ) -> MetricsBaseline: + baseline = MetricsBaseline(path) + baseline.generator_name = METRICS_BASELINE_GENERATOR + baseline.generator_version = generator_version or __version__ + baseline.schema_version = schema_version or METRICS_BASELINE_SCHEMA_VERSION + baseline.python_tag = python_tag or current_python_tag() + baseline.created_at = _now_utc_z() + baseline.snapshot = snapshot_from_project_metrics(project_metrics) + baseline.payload_sha256 = _compute_payload_sha256( + baseline.snapshot, + include_adoption=include_adoption, + ) + baseline.has_coverage_adoption_snapshot = include_adoption + baseline.api_surface_snapshot = ( + project_metrics.api_surface if include_api_surface else None + ) + baseline.api_surface_payload_sha256 = ( + _compute_api_surface_payload_sha256( + baseline.api_surface_snapshot, + root=baseline.path.parent, + ) + if baseline.api_surface_snapshot is not None + else None + ) + return baseline + + def diff(self, current: ProjectMetrics) -> MetricsDiff: + return diff_metrics( + baseline_snapshot=self.snapshot, + current_snapshot=snapshot_from_project_metrics(current), + baseline_api_surface=self.api_surface_snapshot, + current_api_surface=current.api_surface, + ) + + +__all__ = [ + "BASELINE_SCHEMA_VERSION", + "MAX_METRICS_BASELINE_SIZE_BYTES", + "METRICS_BASELINE_GENERATOR", + "METRICS_BASELINE_SCHEMA_VERSION", + "METRICS_BASELINE_UNTRUSTED_STATUSES", + "MetricsBaseline", + "MetricsBaselineSectionProbe", + "MetricsBaselineStatus", + "coerce_metrics_baseline_status", + "current_python_tag", + "probe_metrics_baseline_section", + "snapshot_from_project_metrics", +] diff --git a/codeclone/baseline/trust.py b/codeclone/baseline/trust.py new file mode 100644 index 0000000..fa8179c --- /dev/null +++ b/codeclone/baseline/trust.py @@ -0,0 +1,303 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
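Reviewer note: the new `codeclone/baseline/trust.py` below centralizes the trust layer: status taxonomy, canonical payload hashing, and strict field validators. Its integrity scheme hashes a key-sorted canonical JSON document and compares digests in constant time via `hmac.compare_digest`. A dependency-free sketch (the real code serializes with `orjson.OPT_SORT_KEYS`; `json` with `sort_keys=True` stands in here):

```python
import hashlib
import hmac
import json

def payload_sha256(functions: list[str], blocks: list[str],
                   fingerprint_version: str, python_tag: str) -> str:
    # Canonical form: sorted id lists, sorted keys, compact separators.
    canonical = {
        "blocks": sorted(blocks),
        "fingerprint_version": fingerprint_version,
        "functions": sorted(functions),
        "python_tag": python_tag,
    }
    serialized = json.dumps(canonical, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(serialized.encode()).hexdigest()

digest = payload_sha256([], [], "1", "cp314")
# Constant-time comparison, as in Baseline.verify_integrity:
assert hmac.compare_digest(digest, payload_sha256([], [], "1", "cp314"))
```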
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import hashlib +import re +import sys +from datetime import datetime, timezone +from enum import Enum +from json import JSONDecodeError +from pathlib import Path +from typing import TYPE_CHECKING, Final + +import orjson + +from ..contracts import DEFAULT_MAX_BASELINE_SIZE_MB +from ..contracts.errors import BaselineValidationError +from ..utils.json_io import read_json_object as _read_json_object + +if TYPE_CHECKING: + from collections.abc import Collection + +BASELINE_GENERATOR = "codeclone" +_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR = {1: 0, 2: 1} +MAX_BASELINE_SIZE_BYTES = DEFAULT_MAX_BASELINE_SIZE_MB * 1024 * 1024 +_UTC_ISO8601_Z_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$") + + +class BaselineStatus(str, Enum): + OK = "ok" + MISSING = "missing" + TOO_LARGE = "too_large" + INVALID_JSON = "invalid_json" + INVALID_TYPE = "invalid_type" + MISSING_FIELDS = "missing_fields" + MISMATCH_SCHEMA_VERSION = "mismatch_schema_version" + MISMATCH_FINGERPRINT_VERSION = "mismatch_fingerprint_version" + MISMATCH_PYTHON_VERSION = "mismatch_python_version" + GENERATOR_MISMATCH = "generator_mismatch" + INTEGRITY_MISSING = "integrity_missing" + INTEGRITY_FAILED = "integrity_failed" + + +BASELINE_UNTRUSTED_STATUSES: Final[frozenset[BaselineStatus]] = frozenset( + { + BaselineStatus.MISSING, + BaselineStatus.TOO_LARGE, + BaselineStatus.INVALID_JSON, + BaselineStatus.INVALID_TYPE, + BaselineStatus.MISSING_FIELDS, + BaselineStatus.MISMATCH_SCHEMA_VERSION, + BaselineStatus.MISMATCH_FINGERPRINT_VERSION, + BaselineStatus.MISMATCH_PYTHON_VERSION, + BaselineStatus.GENERATOR_MISMATCH, + BaselineStatus.INTEGRITY_MISSING, + BaselineStatus.INTEGRITY_FAILED, + } +) + + +def coerce_baseline_status( + raw_status: str | BaselineStatus | None, +) -> BaselineStatus: + if isinstance(raw_status, BaselineStatus): + return raw_status + if isinstance(raw_status, str): + try: + return BaselineStatus(raw_status) + except ValueError: + return BaselineStatus.INVALID_TYPE + return BaselineStatus.INVALID_TYPE + + +def _safe_stat_size(path: Path) -> int: + try: + return path.stat().st_size + except OSError as e: + raise BaselineValidationError( + f"Cannot stat baseline file at {path}: {e}", + status=BaselineStatus.INVALID_TYPE, + ) from e + + +def _load_json_object(path: Path) -> dict[str, object]: + try: + return _read_json_object(path) + except OSError as e: + raise BaselineValidationError( + f"Cannot read baseline file at {path}: {e}", + status=BaselineStatus.INVALID_JSON, + ) from e + except JSONDecodeError as e: + raise BaselineValidationError( + f"Corrupted baseline file at {path}: {e}", + status=BaselineStatus.INVALID_JSON, + ) from e + except TypeError: + raise BaselineValidationError( + f"Baseline payload must be an object at {path}", + status=BaselineStatus.INVALID_TYPE, + ) from None + + +def _parse_generator_meta( + meta_obj: dict[str, object], *, path: Path +) -> tuple[str, str | None]: + raw_generator = meta_obj.get("generator") + + if isinstance(raw_generator, str): + generator_version = _optional_str(meta_obj, "generator_version", path=path) + if generator_version is None: + generator_version = _optional_str(meta_obj, "codeclone_version", path=path) + return raw_generator, generator_version + + if isinstance(raw_generator, dict): + allowed_keys = {"name", "version"} + extra = set(raw_generator.keys()) - allowed_keys + if extra: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 
unexpected generator keys: " + f"{', '.join(sorted(extra))}", + status=BaselineStatus.INVALID_TYPE, + ) + generator_name = _require_str(raw_generator, "name", path=path) + generator_version = _optional_str(raw_generator, "version", path=path) + + if generator_version is None: + generator_version = _optional_str(meta_obj, "generator_version", path=path) + if generator_version is None: + generator_version = _optional_str( + meta_obj, "codeclone_version", path=path + ) + + return generator_name, generator_version + + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'generator' must be string or object", + status=BaselineStatus.INVALID_TYPE, + ) + + +def _compute_payload_sha256( + *, + functions: Collection[str], + blocks: Collection[str], + fingerprint_version: str, + python_tag: str, +) -> str: + canonical = { + "blocks": sorted(blocks), + "fingerprint_version": fingerprint_version, + "functions": sorted(functions), + "python_tag": python_tag, + } + serialized = orjson.dumps(canonical, option=orjson.OPT_SORT_KEYS) + return hashlib.sha256(serialized).hexdigest() + + +def current_python_tag() -> str: + """Return the interpreter compatibility tag as an immutable string.""" + impl = sys.implementation.name + major, minor = sys.version_info[:2] + prefix = "cp" if impl == "cpython" else impl[:2] + return f"{prefix}{major}{minor}" + + +def _utc_now_z() -> str: + return ( + datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ") + ) + + +def _require_str(obj: dict[str, object], key: str, *, path: Path) -> str: + value = obj.get(key) + if not isinstance(value, str): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be string", + status=BaselineStatus.INVALID_TYPE, + ) + return value + + +def _optional_str(obj: dict[str, object], key: str, *, path: Path) -> str | None: + value = obj.get(key) + if value is None: + return None + if not isinstance(value, str): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be string", + status=BaselineStatus.INVALID_TYPE, + ) + return value + + +def _require_semver_str(obj: dict[str, object], key: str, *, path: Path) -> str: + value = _require_str(obj, key, path=path) + _parse_semver(value, key=key, path=path) + return value + + +def _parse_semver(value: str, *, key: str, path: Path) -> tuple[int, int, int]: + parts = value.split(".") + if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be semver string", + status=BaselineStatus.INVALID_TYPE, + ) + if len(parts) == 2: + major, minor = int(parts[0]), int(parts[1]) + patch = 0 + else: + major, minor, patch = int(parts[0]), int(parts[1]), int(parts[2]) + return major, minor, patch + + +def _require_python_tag(obj: dict[str, object], key: str, *, path: Path) -> str: + value = _require_str(obj, key, path=path) + if not re.fullmatch(r"[a-z]{2}\d{2,3}", value): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must look like 'cp313'", + status=BaselineStatus.INVALID_TYPE, + ) + return value + + +def _require_utc_iso8601_z(obj: dict[str, object], key: str, *, path: Path) -> str: + value = _require_str(obj, key, path=path) + if not _UTC_ISO8601_Z_RE.fullmatch(value): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z", + status=BaselineStatus.INVALID_TYPE, + ) + try: + datetime( + int(value[0:4]), + int(value[5:7]), + 
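Reviewer note: `_require_utc_iso8601_z` here validates timestamps in two stages: a strict regex rejects anything that is not shaped like `YYYY-MM-DDTHH:MM:SSZ`, then constructing a `datetime` catches values that match the shape but are not real dates (e.g. month 13). A standalone sketch of the same two-stage check:

```python
import re
from datetime import datetime, timezone

UTC_Z = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")

def is_utc_iso8601_z(value: str) -> bool:
    # Stage 1: shape check.
    if not UTC_Z.fullmatch(value):
        return False
    # Stage 2: calendar validity check.
    try:
        datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    except ValueError:
        return False
    return True

assert is_utc_iso8601_z("2026-01-01T00:00:00Z")
assert not is_utc_iso8601_z("2026-13-01T00:00:00Z")  # regex passes, date is invalid
```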
int(value[8:10]), + int(value[11:13]), + int(value[14:16]), + int(value[17:19]), + tzinfo=timezone.utc, + ) + except ValueError as e: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z", + status=BaselineStatus.INVALID_TYPE, + ) from e + return value + + +def _require_sorted_unique_ids( + obj: dict[str, object], key: str, *, pattern: re.Pattern[str], path: Path +) -> list[str]: + value = obj.get(key) + if not isinstance(value, list): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be list[str]", + status=BaselineStatus.INVALID_TYPE, + ) + if not all(isinstance(item, str) for item in value): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be list[str]", + status=BaselineStatus.INVALID_TYPE, + ) + values = list(value) + if values != sorted(values) or len(values) != len(set(values)): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be sorted and unique", + status=BaselineStatus.INVALID_TYPE, + ) + if not all(pattern.fullmatch(item) for item in values): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' has invalid id format", + status=BaselineStatus.INVALID_TYPE, + ) + return values + + +__all__ = [ + "BASELINE_GENERATOR", + "BASELINE_UNTRUSTED_STATUSES", + "MAX_BASELINE_SIZE_BYTES", + "_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR", + "BaselineStatus", + "_compute_payload_sha256", + "_load_json_object", + "_optional_str", + "_parse_generator_meta", + "_parse_semver", + "_require_python_tag", + "_require_semver_str", + "_require_sorted_unique_ids", + "_require_str", + "_require_utc_iso8601_z", + "_safe_stat_size", + "_utc_now_z", + "coerce_baseline_status", + "current_python_tag", +] diff --git a/codeclone/blocks.py b/codeclone/blocks/__init__.py similarity index 95% rename from codeclone/blocks.py rename to codeclone/blocks/__init__.py index 9089ff1..d998021 100644 --- a/codeclone/blocks.py +++ b/codeclone/blocks/__init__.py @@ -8,15 +8,15 @@ from typing import TYPE_CHECKING -from .fingerprint import sha1 -from .models import BlockUnit, SegmentUnit -from .normalize import stmt_hashes +from ..analysis.fingerprint import sha1 +from ..analysis.normalizer import stmt_hashes +from ..models import BlockUnit, SegmentUnit if TYPE_CHECKING: import ast from collections.abc import Sequence - from .normalize import NormalizationConfig + from ..analysis.normalizer import NormalizationConfig __all__ = ["BlockUnit", "SegmentUnit", "extract_blocks", "extract_segments"] diff --git a/codeclone/cache.py b/codeclone/cache.py deleted file mode 100644 index 282cf66..0000000 --- a/codeclone/cache.py +++ /dev/null @@ -1,2803 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
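Reviewer note: `_require_sorted_unique_ids` just above enforces three invariants on id lists: all strings, strictly sorted and duplicate-free, and matching the expected id format. A condensed sketch using the function-id pattern from this diff (40 hex chars, a pipe, then `start-end` or `count+`):

```python
import re

FUNCTION_ID_RE = re.compile(r"^[0-9a-f]{40}\|(?:\d+-\d+|\d+\+)$")

def check_ids(values: list[str], pattern: re.Pattern[str]) -> list[str]:
    if not all(isinstance(item, str) for item in values):
        raise ValueError("ids must be strings")
    if values != sorted(values) or len(values) != len(set(values)):
        raise ValueError("ids must be sorted and unique")
    if not all(pattern.fullmatch(item) for item in values):
        raise ValueError("ids have invalid format")
    return values

check_ids(["a" * 40 + "|10-20", "b" * 40 + "|3+"], FUNCTION_ID_RE)  # passes
```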
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import os
-from collections.abc import Collection
-from enum import Enum
-from json import JSONDecodeError
-from pathlib import Path
-from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, TypeVar, cast
-
-from .baseline import current_python_tag
-from .cache_io import (
-    as_int_or_none as _cache_as_int,
-)
-from .cache_io import (
-    as_object_list as _cache_as_list,
-)
-from .cache_io import (
-    as_str_dict as _cache_as_str_dict,
-)
-from .cache_io import (
-    as_str_or_none as _cache_as_str,
-)
-from .cache_io import (
-    read_json_document,
-    sign_cache_payload,
-    verify_cache_payload_signature,
-    write_json_document_atomically,
-)
-from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime
-from .cache_segments import (
-    SegmentReportProjection as _SegmentReportProjection,
-)
-from .cache_segments import (
-    build_segment_report_projection as _build_segment_report_projection,
-)
-from .cache_segments import (
-    decode_segment_report_projection,
-    encode_segment_report_projection,
-)
-from .contracts import BASELINE_FINGERPRINT_VERSION, CACHE_VERSION
-from .errors import CacheError
-from .models import (
-    BlockGroupItem,
-    BlockUnit,
-    ClassMetrics,
-    DeadCandidate,
-    FileMetrics,
-    FunctionGroupItem,
-    ModuleApiSurface,
-    ModuleDep,
-    ModuleDocstringCoverage,
-    ModuleTypingCoverage,
-    SegmentGroupItem,
-    SegmentUnit,
-    StructuralFindingGroup,
-    StructuralFindingOccurrence,
-    Unit,
-)
-from .structural_findings import normalize_structural_finding_group
-
-if TYPE_CHECKING:
-    from collections.abc import Callable, Mapping, Sequence
-
-SegmentReportProjection = _SegmentReportProjection
-build_segment_report_projection = _build_segment_report_projection
-_as_str = _cache_as_str
-_as_int = _cache_as_int
-_as_list = _cache_as_list
-_as_str_dict = _cache_as_str_dict
-
-MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024
-LEGACY_CACHE_SECRET_FILENAME = ".cache_secret"
-_DEFAULT_WIRE_UNIT_FLOW_PROFILES = (
-    0,
-    "none",
-    False,
-    "fallthrough",
-    "none",
-    "none",
-)
-
-
-class CacheStatus(str, Enum):
-    OK = "ok"
-    MISSING = "missing"
-    TOO_LARGE = "too_large"
-    UNREADABLE = "unreadable"
-    INVALID_JSON = "invalid_json"
-    INVALID_TYPE = "invalid_type"
-    VERSION_MISMATCH = "version_mismatch"
-    PYTHON_TAG_MISMATCH = "python_tag_mismatch"
-    FINGERPRINT_MISMATCH = "mismatch_fingerprint_version"
-    ANALYSIS_PROFILE_MISMATCH = "analysis_profile_mismatch"
-    INTEGRITY_FAILED = "integrity_failed"
-
-
-class FileStat(TypedDict):
-    mtime_ns: int
-    size: int
-
-
-class SourceStatsDict(TypedDict):
-    lines: int
-    functions: int
-    methods: int
-    classes: int
-
-
-UnitDict = FunctionGroupItem
-BlockDict = BlockGroupItem
-SegmentDict = SegmentGroupItem
-
-
-class ClassMetricsDictBase(TypedDict):
-    qualname: str
-    filepath: str
-    start_line: int
-    end_line: int
-    cbo: int
-    lcom4: int
-    method_count: int
-    instance_var_count: int
-    risk_coupling: str
-    risk_cohesion: str
-
-
-class ClassMetricsDict(ClassMetricsDictBase, total=False):
-    coupled_classes: list[str]
-
-
-class ModuleDepDict(TypedDict):
-    source: str
-    target: str
-    import_type: str
-    line: int
-
-
-class DeadCandidateDictBase(TypedDict):
-    qualname: str
-    local_name: str
-    filepath: str
-    start_line: int
-    end_line: int
-    kind: str
-
-
-class DeadCandidateDict(DeadCandidateDictBase, total=False):
-    suppressed_rules: list[str]
-
-
-class ModuleTypingCoverageDict(TypedDict):
-    module: str
-    filepath: str
-    callable_count: int
-    params_total: int
-    params_annotated: int
-    returns_total: int
-    returns_annotated: int
-    any_annotation_count: int
-
-
-class ModuleDocstringCoverageDict(TypedDict):
-    module: str
-    filepath: str
-    public_symbol_total: int
-    public_symbol_documented: int
-
-
-class ApiParamSpecDict(TypedDict):
-    name: str
-    kind: str
-    has_default: bool
-    annotation_hash: str
-
-
-class PublicSymbolDict(TypedDict):
-    qualname: str
-    kind: str
-    start_line: int
-    end_line: int
-    params: list[ApiParamSpecDict]
-    returns_hash: str
-    exported_via: str
-
-
-class ModuleApiSurfaceDict(TypedDict):
-    module: str
-    filepath: str
-    all_declared: list[str]
-    symbols: list[PublicSymbolDict]
-
-
-class StructuralFindingOccurrenceDict(TypedDict):
-    qualname: str
-    start: int
-    end: int
-
-
-class StructuralFindingGroupDict(TypedDict):
-    finding_kind: str
-    finding_key: str
-    signature: dict[str, str]
-    items: list[StructuralFindingOccurrenceDict]
-
-
-class CacheEntryBase(TypedDict):
-    stat: FileStat
-    units: list[UnitDict]
-    blocks: list[BlockDict]
-    segments: list[SegmentDict]
-
-
-class CacheEntry(CacheEntryBase, total=False):
-    source_stats: SourceStatsDict
-    class_metrics: list[ClassMetricsDict]
-    module_deps: list[ModuleDepDict]
-    dead_candidates: list[DeadCandidateDict]
-    referenced_names: list[str]
-    referenced_qualnames: list[str]
-    import_names: list[str]
-    class_names: list[str]
-    typing_coverage: ModuleTypingCoverageDict
-    docstring_coverage: ModuleDocstringCoverageDict
-    api_surface: ModuleApiSurfaceDict
-    structural_findings: list[StructuralFindingGroupDict]
-
-
-class AnalysisProfile(TypedDict):
-    min_loc: int
-    min_stmt: int
-    block_min_loc: int
-    block_min_stmt: int
-    segment_min_loc: int
-    segment_min_stmt: int
-    collect_api_surface: bool
-
-
-class CacheData(TypedDict):
-    version: str
-    python_tag: str
-    fingerprint_version: str
-    analysis_profile: AnalysisProfile
-    files: dict[str, CacheEntry]
-
-
-def _normalize_cached_structural_group(
-    group: StructuralFindingGroupDict,
-    *,
-    filepath: str,
-) -> StructuralFindingGroupDict | None:
-    signature = dict(group["signature"])
-    finding_kind = group["finding_kind"]
-    finding_key = group["finding_key"]
-    normalized = normalize_structural_finding_group(
-        StructuralFindingGroup(
-            finding_kind=finding_kind,
-            finding_key=finding_key,
-            signature=signature,
-            items=tuple(
-                StructuralFindingOccurrence(
-                    finding_kind=finding_kind,
-                    finding_key=finding_key,
-                    file_path=filepath,
-                    qualname=item["qualname"],
-                    start=item["start"],
-                    end=item["end"],
-                    signature=signature,
-                )
-                for item in group["items"]
-            ),
-        )
-    )
-    if normalized is None:
-        return None
-    return StructuralFindingGroupDict(
-        finding_kind=normalized.finding_kind,
-        finding_key=normalized.finding_key,
-        signature=dict(normalized.signature),
-        items=[
-            StructuralFindingOccurrenceDict(
-                qualname=item.qualname,
-                start=item.start,
-                end=item.end,
-            )
-            for item in normalized.items
-        ],
-    )
-
-
-def _normalize_cached_structural_groups(
-    groups: Sequence[StructuralFindingGroupDict],
-    *,
-    filepath: str,
-) -> list[StructuralFindingGroupDict]:
-    normalized = [
-        candidate
-        for candidate in (
-            _normalize_cached_structural_group(group, filepath=filepath)
-            for group in groups
-        )
-        if candidate is not None
-    ]
-    normalized.sort(key=lambda group: (-len(group["items"]), group["finding_key"]))
-    return normalized
-
-
-_DecodedItemT = TypeVar("_DecodedItemT")
-_ValidatedItemT = TypeVar("_ValidatedItemT")
-
-
-class Cache:
-    __slots__ = (
-        "_canonical_runtime_paths",
"_dirty", - "analysis_profile", - "cache_schema_version", - "data", - "fingerprint_version", - "legacy_secret_warning", - "load_status", - "load_warning", - "max_size_bytes", - "path", - "root", - "segment_report_projection", - ) - - _CACHE_VERSION = CACHE_VERSION - - def __init__( - self, - path: str | Path, - *, - root: str | Path | None = None, - max_size_bytes: int | None = None, - min_loc: int = 10, - min_stmt: int = 6, - block_min_loc: int = 20, - block_min_stmt: int = 8, - segment_min_loc: int = 20, - segment_min_stmt: int = 10, - collect_api_surface: bool = False, - ): - self.path = Path(path) - self.root = _resolve_root(root) - self.fingerprint_version = BASELINE_FINGERPRINT_VERSION - self.analysis_profile: AnalysisProfile = { - "min_loc": min_loc, - "min_stmt": min_stmt, - "block_min_loc": block_min_loc, - "block_min_stmt": block_min_stmt, - "segment_min_loc": segment_min_loc, - "segment_min_stmt": segment_min_stmt, - "collect_api_surface": collect_api_surface, - } - self.data: CacheData = _empty_cache_data( - version=self._CACHE_VERSION, - python_tag=current_python_tag(), - fingerprint_version=self.fingerprint_version, - analysis_profile=self.analysis_profile, - ) - self._canonical_runtime_paths: set[str] = set() - self.legacy_secret_warning = self._detect_legacy_secret_warning() - self.cache_schema_version: str | None = None - self.load_status = CacheStatus.MISSING - self.load_warning: str | None = self.legacy_secret_warning - self.max_size_bytes = ( - MAX_CACHE_SIZE_BYTES if max_size_bytes is None else max_size_bytes - ) - self.segment_report_projection: SegmentReportProjection | None = None - self._dirty: bool = True # new cache is dirty until loaded from disk - - def _detect_legacy_secret_warning(self) -> str | None: - secret_path = self.path.parent / LEGACY_CACHE_SECRET_FILENAME - try: - if secret_path.exists(): - return ( - f"Legacy cache secret file detected at {secret_path}; " - "delete this obsolete file." 
-                )
-        except OSError as e:
-            return f"Legacy cache secret check failed: {e}"
-        return None
-
-    def _set_load_warning(self, message: str | None) -> None:
-        warning = message
-        if warning is None:
-            warning = self.legacy_secret_warning
-        elif self.legacy_secret_warning:
-            warning = f"{warning}\n{self.legacy_secret_warning}"
-        self.load_warning = warning
-
-    def _ignore_cache(
-        self,
-        message: str,
-        *,
-        status: CacheStatus,
-        schema_version: str | None = None,
-    ) -> None:
-        self._set_load_warning(message)
-        self.load_status = status
-        self.cache_schema_version = schema_version
-        self.data = _empty_cache_data(
-            version=self._CACHE_VERSION,
-            python_tag=current_python_tag(),
-            fingerprint_version=self.fingerprint_version,
-            analysis_profile=self.analysis_profile,
-        )
-        self._canonical_runtime_paths = set()
-        self.segment_report_projection = None
-
-    def _reject_cache_load(
-        self,
-        message: str,
-        *,
-        status: CacheStatus,
-        schema_version: str | None = None,
-    ) -> CacheData | None:
-        self._ignore_cache(
-            message,
-            status=status,
-            schema_version=schema_version,
-        )
-        return None
-
-    def _reject_invalid_cache_format(
-        self,
-        *,
-        schema_version: str | None = None,
-    ) -> CacheData | None:
-        return self._reject_cache_load(
-            "Cache format invalid; ignoring cache.",
-            status=CacheStatus.INVALID_TYPE,
-            schema_version=schema_version,
-        )
-
-    def _reject_version_mismatch(self, version: str) -> CacheData | None:
-        return self._reject_cache_load(
-            f"Cache version mismatch (found {version}); ignoring cache.",
-            status=CacheStatus.VERSION_MISMATCH,
-            schema_version=version,
-        )
-
-    def load(self) -> None:
-        try:
-            exists = self.path.exists()
-        except OSError as e:
-            self._ignore_cache(
-                f"Cache unreadable; ignoring cache: {e}",
-                status=CacheStatus.UNREADABLE,
-            )
-            return
-
-        if not exists:
-            self._set_load_warning(None)
-            self.load_status = CacheStatus.MISSING
-            self.cache_schema_version = None
-            self._canonical_runtime_paths = set()
-            self.segment_report_projection = None
-            return
-
-        try:
-            size = self.path.stat().st_size
-            if size > self.max_size_bytes:
-                self._ignore_cache(
-                    "Cache file too large "
-                    f"({size} bytes, max {self.max_size_bytes}); ignoring cache.",
-                    status=CacheStatus.TOO_LARGE,
-                )
-                return
-
-            raw_obj = read_json_document(self.path)
-            parsed = self._load_and_validate(raw_obj)
-            if parsed is None:
-                return
-            self.data = parsed
-            self._canonical_runtime_paths = set(parsed["files"].keys())
-            self.load_status = CacheStatus.OK
-            self._set_load_warning(None)
-            self._dirty = False  # freshly loaded — nothing to persist
-
-        except OSError as e:
-            self._ignore_cache(
-                f"Cache unreadable; ignoring cache: {e}",
-                status=CacheStatus.UNREADABLE,
-            )
-        except JSONDecodeError:
-            self._ignore_cache(
-                "Cache corrupted; ignoring cache.",
-                status=CacheStatus.INVALID_JSON,
-            )
-
-    def _load_and_validate(self, raw_obj: object) -> CacheData | None:
-        raw = _as_str_dict(raw_obj)
-        if raw is None:
-            return self._reject_invalid_cache_format()
-
-        # Legacy cache format: top-level {version, files, _signature}.
-        legacy_version = _as_str(raw.get("version"))
-        if legacy_version is not None:
-            return self._reject_version_mismatch(legacy_version)
-
-        version = _as_str(raw.get("v"))
-        if version is None:
-            return self._reject_invalid_cache_format()
-
-        if version != self._CACHE_VERSION:
-            return self._reject_version_mismatch(version)
-
-        sig = _as_str(raw.get("sig"))
-        payload_obj = raw.get("payload")
-        payload = _as_str_dict(payload_obj)
-        if sig is None or payload is None:
-            return self._reject_invalid_cache_format(schema_version=version)
-
-        if not verify_cache_payload_signature(payload, sig):
-            return self._reject_cache_load(
-                "Cache signature mismatch; ignoring cache.",
-                status=CacheStatus.INTEGRITY_FAILED,
-                schema_version=version,
-            )
-
-        runtime_tag = current_python_tag()
-        py_tag = _as_str(payload.get("py"))
-        if py_tag is None:
-            return self._reject_invalid_cache_format(schema_version=version)
-
-        if py_tag != runtime_tag:
-            return self._reject_cache_load(
-                "Cache python tag mismatch "
-                f"(found {py_tag}, expected {runtime_tag}); ignoring cache.",
-                status=CacheStatus.PYTHON_TAG_MISMATCH,
-                schema_version=version,
-            )
-
-        fp_version = _as_str(payload.get("fp"))
-        if fp_version is None:
-            return self._reject_invalid_cache_format(schema_version=version)
-
-        if fp_version != self.fingerprint_version:
-            return self._reject_cache_load(
-                "Cache fingerprint version mismatch "
-                f"(found {fp_version}, expected {self.fingerprint_version}); "
-                "ignoring cache.",
-                status=CacheStatus.FINGERPRINT_MISMATCH,
-                schema_version=version,
-            )
-
-        analysis_profile = _as_analysis_profile(payload.get("ap"))
-        if analysis_profile is None:
-            return self._reject_invalid_cache_format(schema_version=version)
-
-        if analysis_profile != self.analysis_profile:
-            return self._reject_cache_load(
-                "Cache analysis profile mismatch "
-                f"(found min_loc={analysis_profile['min_loc']}, "
-                f"min_stmt={analysis_profile['min_stmt']}, "
-                "collect_api_surface="
-                f"{str(analysis_profile['collect_api_surface']).lower()}; "
-                f"expected min_loc={self.analysis_profile['min_loc']}, "
-                f"min_stmt={self.analysis_profile['min_stmt']}, "
-                "collect_api_surface="
-                f"{str(self.analysis_profile['collect_api_surface']).lower()}); "
-                "ignoring cache.",
-                status=CacheStatus.ANALYSIS_PROFILE_MISMATCH,
-                schema_version=version,
-            )
-
-        files_obj = payload.get("files")
-        files_dict = _as_str_dict(files_obj)
-        if files_dict is None:
-            return self._reject_invalid_cache_format(schema_version=version)
-
-        parsed_files: dict[str, CacheEntry] = {}
-        for wire_path, file_entry_obj in files_dict.items():
-            runtime_path = runtime_filepath_from_wire(wire_path, root=self.root)
-            parsed_entry = self._decode_entry(file_entry_obj, runtime_path)
-            if parsed_entry is None:
-                return self._reject_invalid_cache_format(schema_version=version)
-            parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry)
-        self.segment_report_projection = decode_segment_report_projection(
-            payload.get("sr"),
-            root=self.root,
-        )
-
-        self.cache_schema_version = version
-        return CacheData(
-            version=self._CACHE_VERSION,
-            python_tag=runtime_tag,
-            fingerprint_version=self.fingerprint_version,
-            analysis_profile=self.analysis_profile,
-            files=parsed_files,
-        )
-
-    def save(self) -> None:
-        if not self._dirty:
-            return
-        try:
-            wire_files: dict[str, object] = {}
-            wire_map = {
-                rp: wire_filepath_from_runtime(rp, root=self.root)
-                for rp in self.data["files"]
-            }
-            for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
-                entry = self.get_file_entry(runtime_path)
-                if entry is None:
-                    continue
-                wire_files[wire_map[runtime_path]] = self._encode_entry(entry)
-
-            payload: dict[str, object] = {
-                "py": current_python_tag(),
-                "fp": self.fingerprint_version,
-                "ap": self.analysis_profile,
-                "files": wire_files,
-            }
-            segment_projection = encode_segment_report_projection(
-                self.segment_report_projection,
-                root=self.root,
-            )
-            if segment_projection is not None:
-                payload["sr"] = segment_projection
-            signed_doc = {
-                "v": self._CACHE_VERSION,
-                "payload": payload,
-                "sig": sign_cache_payload(payload),
-            }
-            write_json_document_atomically(self.path, signed_doc)
-            self._dirty = False
-
-            self.data["version"] = self._CACHE_VERSION
-            self.data["python_tag"] = current_python_tag()
-            self.data["fingerprint_version"] = self.fingerprint_version
-            self.data["analysis_profile"] = self.analysis_profile
-
-        except OSError as e:
-            raise CacheError(f"Failed to save cache: {e}") from e
-
-    @staticmethod
-    def _decode_entry(value: object, filepath: str) -> CacheEntry | None:
-        return _decode_wire_file_entry(value, filepath)
-
-    @staticmethod
-    def _encode_entry(entry: CacheEntry) -> dict[str, object]:
-        return _encode_wire_file_entry(entry)
-
-    def _store_canonical_file_entry(
-        self,
-        *,
-        runtime_path: str,
-        canonical_entry: CacheEntry,
-    ) -> CacheEntry:
-        previous_entry = self.data["files"].get(runtime_path)
-        was_canonical = runtime_path in self._canonical_runtime_paths
-        self.data["files"][runtime_path] = canonical_entry
-        self._canonical_runtime_paths.add(runtime_path)
-        if not was_canonical or previous_entry != canonical_entry:
-            self._dirty = True
-        return canonical_entry
-
-    def get_file_entry(self, filepath: str) -> CacheEntry | None:
-        runtime_lookup_key = filepath
-        entry_obj = self.data["files"].get(runtime_lookup_key)
-        if entry_obj is None:
-            wire_key = wire_filepath_from_runtime(filepath, root=self.root)
-            runtime_lookup_key = runtime_filepath_from_wire(wire_key, root=self.root)
-            entry_obj = self.data["files"].get(runtime_lookup_key)
-
-        if entry_obj is None:
-            return None
-
-        if runtime_lookup_key in self._canonical_runtime_paths:
-            if _is_canonical_cache_entry(entry_obj):
-                return entry_obj
-            self._canonical_runtime_paths.discard(runtime_lookup_key)
-
-        if not isinstance(entry_obj, dict):
-            return None
-        entry = entry_obj
-
-        required = {"stat", "units", "blocks", "segments"}
-        if not required.issubset(entry.keys()):
-            return None
-
-        stat = _as_file_stat_dict(entry.get("stat"))
-        units = _as_typed_unit_list(entry.get("units"))
-        blocks = _as_typed_block_list(entry.get("blocks"))
-        segments = _as_typed_segment_list(entry.get("segments"))
-        if stat is None or units is None or blocks is None or segments is None:
-            return None
-
-        optional_sections = _decode_optional_cache_sections(entry)
-        if optional_sections is None:
-            return None
-        (
-            class_metrics_raw,
-            module_deps_raw,
-            dead_candidates_raw,
-            referenced_names_raw,
-            referenced_qualnames_raw,
-            import_names_raw,
-            class_names_raw,
-            typing_coverage_raw,
-            docstring_coverage_raw,
-            api_surface_raw,
-            source_stats,
-            structural_findings,
-        ) = optional_sections
-
-        entry_to_canonicalize: CacheEntry = _attach_optional_cache_sections(
-            CacheEntry(
-                stat=stat,
-                units=units,
-                blocks=blocks,
-                segments=segments,
-                class_metrics=class_metrics_raw,
-                module_deps=module_deps_raw,
-                dead_candidates=dead_candidates_raw,
-                referenced_names=referenced_names_raw,
-                referenced_qualnames=referenced_qualnames_raw,
-                import_names=import_names_raw,
-                class_names=class_names_raw,
-            ),
-            typing_coverage=typing_coverage_raw,
-            docstring_coverage=docstring_coverage_raw,
-            api_surface=api_surface_raw,
-            source_stats=source_stats,
-            structural_findings=structural_findings,
-        )
-        canonical_entry = _canonicalize_cache_entry(entry_to_canonicalize)
-        return self._store_canonical_file_entry(
-            runtime_path=runtime_lookup_key,
-            canonical_entry=canonical_entry,
-        )
-
-    def put_file_entry(
-        self,
-        filepath: str,
-        stat_sig: FileStat,
-        units: list[Unit],
-        blocks: list[BlockUnit],
-        segments: list[SegmentUnit],
-        *,
-        source_stats: SourceStatsDict | None = None,
-        file_metrics: FileMetrics | None = None,
-        structural_findings: list[StructuralFindingGroup] | None = None,
-    ) -> None:
-        runtime_path = runtime_filepath_from_wire(
-            wire_filepath_from_runtime(filepath, root=self.root),
-            root=self.root,
-        )
-
-        unit_rows = [_unit_dict_from_model(unit, runtime_path) for unit in units]
-        block_rows = [_block_dict_from_model(block, runtime_path) for block in blocks]
-        segment_rows = [
-            _segment_dict_from_model(segment, runtime_path) for segment in segments
-        ]
-
-        (
-            class_metrics_rows,
-            module_dep_rows,
-            dead_candidate_rows,
-            referenced_names,
-            referenced_qualnames,
-            import_names,
-            class_names,
-            typing_coverage,
-            docstring_coverage,
-            api_surface,
-        ) = _new_optional_metrics_payload()
-        if file_metrics is not None:
-            class_metrics_rows = [
-                _class_metrics_dict_from_model(metric, runtime_path)
-                for metric in file_metrics.class_metrics
-            ]
-            module_dep_rows = [
-                _module_dep_dict_from_model(dep) for dep in file_metrics.module_deps
-            ]
-            dead_candidate_rows = [
-                _dead_candidate_dict_from_model(candidate, runtime_path)
-                for candidate in file_metrics.dead_candidates
-            ]
-            referenced_names = sorted(set(file_metrics.referenced_names))
-            referenced_qualnames = sorted(set(file_metrics.referenced_qualnames))
-            import_names = sorted(set(file_metrics.import_names))
-            class_names = sorted(set(file_metrics.class_names))
-            typing_coverage = _typing_coverage_dict_from_model(
-                file_metrics.typing_coverage,
-                filepath=runtime_path,
-            )
-            docstring_coverage = _docstring_coverage_dict_from_model(
-                file_metrics.docstring_coverage,
-                filepath=runtime_path,
-            )
-            api_surface = _api_surface_dict_from_model(
-                file_metrics.api_surface,
-                filepath=runtime_path,
-            )
-
-        source_stats_payload = source_stats or SourceStatsDict(
-            lines=0,
-            functions=0,
-            methods=0,
-            classes=0,
-        )
-        entry_dict = CacheEntry(
-            stat=stat_sig,
-            source_stats=source_stats_payload,
-            units=unit_rows,
-            blocks=block_rows,
-            segments=segment_rows,
-            class_metrics=class_metrics_rows,
-            module_deps=module_dep_rows,
-            dead_candidates=dead_candidate_rows,
-            referenced_names=referenced_names,
-            referenced_qualnames=referenced_qualnames,
-            import_names=import_names,
-            class_names=class_names,
-        )
-        if typing_coverage is not None:
-            entry_dict["typing_coverage"] = typing_coverage
-        if docstring_coverage is not None:
-            entry_dict["docstring_coverage"] = docstring_coverage
-        if api_surface is not None:
-            entry_dict["api_surface"] = api_surface
-        if structural_findings is not None:
-            entry_dict["structural_findings"] = _normalize_cached_structural_groups(
-                [
-                    _structural_group_dict_from_model(group)
-                    for group in structural_findings
-                ],
-                filepath=runtime_path,
-            )
-        canonical_entry = _canonicalize_cache_entry(entry_dict)
-        self._store_canonical_file_entry(
-            runtime_path=runtime_path,
-            canonical_entry=canonical_entry,
-        )
-
-
-def file_stat_signature(path: str) -> FileStat:
-    st = os.stat(path)
-    return FileStat(
-        mtime_ns=st.st_mtime_ns,
-        size=st.st_size,
-    )
-
-
-def _empty_cache_data(
-    *,
-    version: str,
-    python_tag: str,
-    fingerprint_version: str,
-    analysis_profile: AnalysisProfile,
-) -> CacheData:
-    return CacheData(
-        version=version,
-        python_tag=python_tag,
-        fingerprint_version=fingerprint_version,
-        analysis_profile=analysis_profile,
-        files={},
-    )
-
-
-def _as_risk_literal(value: object) -> Literal["low", "medium", "high"] | None:
-    match value:
-        case "low":
-            return "low"
-        case "medium":
-            return "medium"
-        case "high":
-            return "high"
-        case _:
-            return None
-
-
-def _new_optional_metrics_payload() -> tuple[
-    list[ClassMetricsDict],
-    list[ModuleDepDict],
-    list[DeadCandidateDict],
-    list[str],
-    list[str],
-    list[str],
-    list[str],
-    ModuleTypingCoverageDict | None,
-    ModuleDocstringCoverageDict | None,
-    ModuleApiSurfaceDict | None,
-]:
-    return [], [], [], [], [], [], [], None, None, None
-
-
-def _unit_dict_from_model(unit: Unit, filepath: str) -> UnitDict:
-    return FunctionGroupItem(
-        qualname=unit.qualname,
-        filepath=filepath,
-        start_line=unit.start_line,
-        end_line=unit.end_line,
-        loc=unit.loc,
-        stmt_count=unit.stmt_count,
-        fingerprint=unit.fingerprint,
-        loc_bucket=unit.loc_bucket,
-        cyclomatic_complexity=unit.cyclomatic_complexity,
-        nesting_depth=unit.nesting_depth,
-        risk=unit.risk,
-        raw_hash=unit.raw_hash,
-        entry_guard_count=unit.entry_guard_count,
-        entry_guard_terminal_profile=unit.entry_guard_terminal_profile,
-        entry_guard_has_side_effect_before=unit.entry_guard_has_side_effect_before,
-        terminal_kind=unit.terminal_kind,
-        try_finally_profile=unit.try_finally_profile,
-        side_effect_order_profile=unit.side_effect_order_profile,
-    )
-
-
-def _block_dict_from_model(block: BlockUnit, filepath: str) -> BlockDict:
-    return BlockGroupItem(
-        block_hash=block.block_hash,
-        filepath=filepath,
-        qualname=block.qualname,
-        start_line=block.start_line,
-        end_line=block.end_line,
-        size=block.size,
-    )
-
-
-def _segment_dict_from_model(segment: SegmentUnit, filepath: str) -> SegmentDict:
-    return SegmentGroupItem(
-        segment_hash=segment.segment_hash,
-        segment_sig=segment.segment_sig,
-        filepath=filepath,
-        qualname=segment.qualname,
-        start_line=segment.start_line,
-        end_line=segment.end_line,
-        size=segment.size,
-    )
-
-
-def _typing_coverage_dict_from_model(
-    coverage: ModuleTypingCoverage | None,
-    *,
-    filepath: str,
-) -> ModuleTypingCoverageDict | None:
-    if coverage is None:
-        return None
-    return ModuleTypingCoverageDict(
-        module=coverage.module,
-        filepath=filepath,
-        callable_count=coverage.callable_count,
-        params_total=coverage.params_total,
-        params_annotated=coverage.params_annotated,
-        returns_total=coverage.returns_total,
-        returns_annotated=coverage.returns_annotated,
-        any_annotation_count=coverage.any_annotation_count,
-    )
-
-
-def _docstring_coverage_dict_from_model(
-    coverage: ModuleDocstringCoverage | None,
-    *,
-    filepath: str,
-) -> ModuleDocstringCoverageDict | None:
-    if coverage is None:
-        return None
-    return ModuleDocstringCoverageDict(
-        module=coverage.module,
-        filepath=filepath,
-        public_symbol_total=coverage.public_symbol_total,
-        public_symbol_documented=coverage.public_symbol_documented,
-    )
-
-
-def _api_surface_dict_from_model(
-    surface: ModuleApiSurface | None,
-    *,
-    filepath: str,
-) -> ModuleApiSurfaceDict | None:
-    if surface is None:
-        return None
-    return ModuleApiSurfaceDict(
-        module=surface.module,
-        filepath=filepath,
-        all_declared=list(surface.all_declared or ()),
-        symbols=[
-            PublicSymbolDict(
-                qualname=symbol.qualname,
-                kind=symbol.kind,
-                start_line=symbol.start_line,
-                end_line=symbol.end_line,
-                params=[
-                    ApiParamSpecDict(
-                        name=param.name,
-                        kind=param.kind,
-                        has_default=param.has_default,
-                        annotation_hash=param.annotation_hash,
-                    )
-                    for param in symbol.params
-                ],
-                returns_hash=symbol.returns_hash,
-                exported_via=symbol.exported_via,
-            )
-            for symbol in surface.symbols
-        ],
-    )
-
-
-def _class_metrics_dict_from_model(
-    metric: ClassMetrics,
-    filepath: str,
-) -> ClassMetricsDict:
-    return ClassMetricsDict(
-        qualname=metric.qualname,
-        filepath=filepath,
-        start_line=metric.start_line,
-        end_line=metric.end_line,
-        cbo=metric.cbo,
-        lcom4=metric.lcom4,
-        method_count=metric.method_count,
-        instance_var_count=metric.instance_var_count,
-        risk_coupling=metric.risk_coupling,
-        risk_cohesion=metric.risk_cohesion,
-        coupled_classes=sorted(set(metric.coupled_classes)),
-    )
-
-
-def _module_dep_dict_from_model(dep: ModuleDep) -> ModuleDepDict:
-    return ModuleDepDict(
-        source=dep.source,
-        target=dep.target,
-        import_type=dep.import_type,
-        line=dep.line,
-    )
-
-
-def _dead_candidate_dict_from_model(
-    candidate: DeadCandidate,
-    filepath: str,
-) -> DeadCandidateDict:
-    result = DeadCandidateDict(
-        qualname=candidate.qualname,
-        local_name=candidate.local_name,
-        filepath=filepath,
-        start_line=candidate.start_line,
-        end_line=candidate.end_line,
-        kind=candidate.kind,
-    )
-    if candidate.suppressed_rules:
-        result["suppressed_rules"] = sorted(set(candidate.suppressed_rules))
-    return result
-
-
-def _structural_occurrence_dict_from_model(
-    occurrence: StructuralFindingOccurrence,
-) -> StructuralFindingOccurrenceDict:
-    return StructuralFindingOccurrenceDict(
-        qualname=occurrence.qualname,
-        start=occurrence.start,
-        end=occurrence.end,
-    )
-
-
-def _structural_group_dict_from_model(
-    group: StructuralFindingGroup,
-) -> StructuralFindingGroupDict:
-    return StructuralFindingGroupDict(
-        finding_kind=group.finding_kind,
-        finding_key=group.finding_key,
-        signature=dict(group.signature),
-        items=[
-            _structural_occurrence_dict_from_model(occurrence)
-            for occurrence in group.items
-        ],
-    )
-
-
-def _as_file_stat_dict(value: object) -> FileStat | None:
-    if not _is_file_stat_dict(value):
-        return None
-    obj = cast("Mapping[str, object]", value)
-    mtime_ns = obj.get("mtime_ns")
-    size = obj.get("size")
-    if not isinstance(mtime_ns, int) or not isinstance(size, int):
-        return None
-    return FileStat(mtime_ns=mtime_ns, size=size)
-
-
-def _as_source_stats_dict(value: object) -> SourceStatsDict | None:
-    if not _is_source_stats_dict(value):
-        return None
-    obj = cast("Mapping[str, object]", value)
-    lines = obj.get("lines")
-    functions = obj.get("functions")
-    methods = obj.get("methods")
-    classes = obj.get("classes")
-    assert isinstance(lines, int)
-    assert isinstance(functions, int)
-    assert isinstance(methods, int)
-    assert isinstance(classes, int)
-    return SourceStatsDict(
-        lines=lines,
-        functions=functions,
-        methods=methods,
-        classes=classes,
-    )
-
-
-def _as_typed_list(
-    value: object,
-    *,
-    predicate: Callable[[object], bool],
-) -> list[_ValidatedItemT] | None:
-    if not isinstance(value, list):
-        return None
-    if not all(predicate(item) for item in value):
-        return None
-    return cast("list[_ValidatedItemT]", value)
-
-
-def _as_typed_unit_list(value: object) -> list[UnitDict] | None:
-    return _as_typed_list(value, predicate=_is_unit_dict)
-
-
-def _as_typed_block_list(value: object) -> list[BlockDict] | None:
-    return _as_typed_list(value, predicate=_is_block_dict)
-
-
-def _as_typed_segment_list(value: object) -> list[SegmentDict] | None:
-    return _as_typed_list(value, predicate=_is_segment_dict)
-
-
-def _as_typed_class_metrics_list(value: object) -> list[ClassMetricsDict] | None:
-    return _as_typed_list(value, predicate=_is_class_metrics_dict)
-
-
-def _as_typed_dead_candidates_list(
-    value: object,
-) -> list[DeadCandidateDict] | None:
-    return _as_typed_list(value, predicate=_is_dead_candidate_dict)
-
-
-def _as_typed_module_deps_list(value: object) -> list[ModuleDepDict] | None:
-    return _as_typed_list(value, predicate=_is_module_dep_dict)
-
-
-def _as_typed_string_list(value: object) -> list[str] | None:
-    return _as_typed_list(value, predicate=lambda item: isinstance(item, str))
-
-
-def _as_module_typing_coverage_dict(
-    value: object,
-) -> ModuleTypingCoverageDict | None:
-    if not _is_module_typing_coverage_dict(value):
-        return None
-    return cast("ModuleTypingCoverageDict", value)
-
-
-def _as_module_docstring_coverage_dict(
-    value: object,
-) -> ModuleDocstringCoverageDict | None:
-    if not _is_module_docstring_coverage_dict(value):
-        return None
-    return cast("ModuleDocstringCoverageDict", value)
-
-
-def _as_module_api_surface_dict(value: object) -> ModuleApiSurfaceDict | None:
-    if not _is_module_api_surface_dict(value):
-        return None
-    return cast("ModuleApiSurfaceDict", value)
-
-
-def _normalized_optional_string_list(value: object) -> list[str] | None:
-    items = _as_typed_string_list(value)
-    if not items:
-        return None
-    return sorted(set(items))
-
-
-def _is_canonical_cache_entry(value: object) -> TypeGuard[CacheEntry]:
-    return isinstance(value, dict) and _has_cache_entry_container_shape(value)
-
-
-def _has_cache_entry_container_shape(entry: Mapping[str, object]) -> bool:
-    required = {"stat", "units", "blocks", "segments"}
-    if not required.issubset(entry.keys()):
-        return False
-    if not isinstance(entry.get("stat"), dict):
-        return False
-    if not isinstance(entry.get("units"), list):
-        return False
-    if not isinstance(entry.get("blocks"), list):
-        return False
-    if not isinstance(entry.get("segments"), list):
-        return False
-    source_stats = entry.get("source_stats")
-    if source_stats is not None and not _is_source_stats_dict(source_stats):
-        return False
-    optional_list_keys = (
-        "class_metrics",
-        "module_deps",
-        "dead_candidates",
-        "referenced_names",
-        "referenced_qualnames",
-        "import_names",
-        "class_names",
-        "structural_findings",
-    )
-    if not all(isinstance(entry.get(key, []), list) for key in optional_list_keys):
-        return False
-    typing_coverage = entry.get("typing_coverage")
-    if typing_coverage is not None and not _is_module_typing_coverage_dict(
-        typing_coverage
-    ):
-        return False
-    docstring_coverage = entry.get("docstring_coverage")
-    if docstring_coverage is not None and not _is_module_docstring_coverage_dict(
-        docstring_coverage
-    ):
-        return False
-    api_surface = entry.get("api_surface")
-    return api_surface is None or _is_module_api_surface_dict(api_surface)
-
-
-def _decode_optional_cache_sections(
-    entry: Mapping[str, object],
-) -> (
-    tuple[
-        list[ClassMetricsDict],
-        list[ModuleDepDict],
-        list[DeadCandidateDict],
-        list[str],
-        list[str],
-        list[str],
-        list[str],
-        ModuleTypingCoverageDict | None,
-        ModuleDocstringCoverageDict | None,
-        ModuleApiSurfaceDict | None,
-        SourceStatsDict | None,
-        list[StructuralFindingGroupDict] | None,
-    ]
-    | None
-):
-    class_metrics_raw = _as_typed_class_metrics_list(entry.get("class_metrics", []))
-    module_deps_raw = _as_typed_module_deps_list(entry.get("module_deps", []))
-    dead_candidates_raw = _as_typed_dead_candidates_list(
-        entry.get("dead_candidates", [])
-    )
-    referenced_names_raw = _as_typed_string_list(entry.get("referenced_names", []))
-    referenced_qualnames_raw = _as_typed_string_list(
-        entry.get("referenced_qualnames", [])
-    )
-    import_names_raw = _as_typed_string_list(entry.get("import_names", []))
-    class_names_raw = _as_typed_string_list(entry.get("class_names", []))
-    if (
-        class_metrics_raw is None
-        or module_deps_raw is None
-        or dead_candidates_raw is None
-        or referenced_names_raw is None
-        or referenced_qualnames_raw is None
-        or import_names_raw is None
-        or class_names_raw is None
-    ):
-        return None
-    typing_coverage_raw = _as_module_typing_coverage_dict(entry.get("typing_coverage"))
-    docstring_coverage_raw = _as_module_docstring_coverage_dict(
-        entry.get("docstring_coverage")
-    )
-    api_surface_raw = _as_module_api_surface_dict(entry.get("api_surface"))
-    source_stats = _as_source_stats_dict(entry.get("source_stats"))
-    structural_findings = entry.get("structural_findings")
-    typed_structural_findings = (
-        structural_findings if isinstance(structural_findings, list) else None
-    )
-    return (
-        class_metrics_raw,
-        module_deps_raw,
-        dead_candidates_raw,
-        referenced_names_raw,
-        referenced_qualnames_raw,
-        import_names_raw,
-        class_names_raw,
-        typing_coverage_raw,
-        docstring_coverage_raw,
-        api_surface_raw,
-        source_stats,
-        typed_structural_findings,
-    )
-
-
-def _attach_optional_cache_sections(
-    entry: CacheEntry,
-    *,
-    typing_coverage: ModuleTypingCoverageDict | None = None,
-    docstring_coverage: ModuleDocstringCoverageDict | None = None,
-    api_surface: ModuleApiSurfaceDict | None = None,
-    source_stats: SourceStatsDict | None = None,
-    structural_findings: list[StructuralFindingGroupDict] | None = None,
-) -> CacheEntry:
-    if typing_coverage is not None:
-        entry["typing_coverage"] = typing_coverage
-    if docstring_coverage is not None:
-        entry["docstring_coverage"] = docstring_coverage
-    if api_surface is not None:
-        entry["api_surface"] = api_surface
-    if source_stats is not None:
-        entry["source_stats"] = source_stats
-    if structural_findings is not None:
-        entry["structural_findings"] = structural_findings
-    return entry
-
-
-def _canonicalize_cache_entry(entry: CacheEntry) -> CacheEntry:
-    class_metrics_sorted = sorted(
-        entry["class_metrics"],
-        key=lambda item: (
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-        ),
-    )
-    for metric in class_metrics_sorted:
-        coupled_classes = metric.get("coupled_classes", [])
-        if coupled_classes:
-            metric["coupled_classes"] = sorted(set(coupled_classes))
-
-    module_deps_sorted = sorted(
-        entry["module_deps"],
-        key=lambda item: (
-            item["source"],
-            item["target"],
-            item["import_type"],
-            item["line"],
-        ),
-    )
-    dead_candidates_normalized: list[DeadCandidateDict] = []
-    for candidate in entry["dead_candidates"]:
-        suppressed_rules = candidate.get("suppressed_rules", [])
-        normalized_candidate = DeadCandidateDict(
-            qualname=candidate["qualname"],
-            local_name=candidate["local_name"],
-            filepath=candidate["filepath"],
-            start_line=candidate["start_line"],
-            end_line=candidate["end_line"],
-            kind=candidate["kind"],
-        )
-        if _is_string_list(suppressed_rules):
-            normalized_rules = sorted(set(suppressed_rules))
-            if normalized_rules:
-                normalized_candidate["suppressed_rules"] = normalized_rules
-        dead_candidates_normalized.append(normalized_candidate)
-
-    dead_candidates_sorted = sorted(
-        dead_candidates_normalized,
-        key=lambda item: (
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-            item["local_name"],
-            item["kind"],
-            tuple(item.get("suppressed_rules", [])),
-        ),
-    )
-
-    result: CacheEntry = {
-        "stat": entry["stat"],
-        "units": entry["units"],
-        "blocks": entry["blocks"],
-        "segments": entry["segments"],
-        "class_metrics": class_metrics_sorted,
-        "module_deps": module_deps_sorted,
-        "dead_candidates": dead_candidates_sorted,
-        "referenced_names": sorted(set(entry["referenced_names"])),
-        "referenced_qualnames": sorted(set(entry.get("referenced_qualnames", []))),
-        "import_names": sorted(set(entry["import_names"])),
-        "class_names": sorted(set(entry["class_names"])),
-    }
-    typing_coverage = entry.get("typing_coverage")
-    if typing_coverage is not None:
-        result["typing_coverage"] = ModuleTypingCoverageDict(
-            module=typing_coverage["module"],
-            filepath=typing_coverage["filepath"],
-            callable_count=typing_coverage["callable_count"],
-            params_total=typing_coverage["params_total"],
-            params_annotated=typing_coverage["params_annotated"],
-            returns_total=typing_coverage["returns_total"],
-            returns_annotated=typing_coverage["returns_annotated"],
-            any_annotation_count=typing_coverage["any_annotation_count"],
-        )
-    docstring_coverage = entry.get("docstring_coverage")
-    if docstring_coverage is not None:
-        result["docstring_coverage"] = ModuleDocstringCoverageDict(
-            module=docstring_coverage["module"],
-            filepath=docstring_coverage["filepath"],
-            public_symbol_total=docstring_coverage["public_symbol_total"],
-            public_symbol_documented=docstring_coverage["public_symbol_documented"],
-        )
-    api_surface = entry.get("api_surface")
-    if api_surface is not None:
-        symbols = sorted(
-            api_surface["symbols"],
-            key=lambda item: (
-                item["qualname"],
-                item["kind"],
-                item["start_line"],
-                item["end_line"],
-            ),
-        )
-        normalized_symbols = [
-            PublicSymbolDict(
-                qualname=symbol["qualname"],
-                kind=symbol["kind"],
-                start_line=symbol["start_line"],
-                end_line=symbol["end_line"],
-                params=[
-                    ApiParamSpecDict(
-                        name=param["name"],
-                        kind=param["kind"],
-                        has_default=param["has_default"],
-                        annotation_hash=param["annotation_hash"],
-                    )
-                    for param in symbol.get("params", [])
-                ],
-                returns_hash=symbol.get("returns_hash", ""),
-                exported_via=symbol.get("exported_via", "name"),
-            )
-            for symbol in symbols
-        ]
-        result["api_surface"] = ModuleApiSurfaceDict(
-            module=api_surface["module"],
-            filepath=api_surface["filepath"],
-            all_declared=sorted(set(api_surface.get("all_declared", []))),
-            symbols=normalized_symbols,
-        )
-    sf = entry.get("structural_findings")
-    if sf is not None:
-        result["structural_findings"] = sf
-    source_stats = entry.get("source_stats")
-    if source_stats is not None:
-        result["source_stats"] = source_stats
-    return result
-
-
-def _decode_wire_qualname_span(
-    row: list[object],
-) -> tuple[str, int, int] | None:
-    qualname = _as_str(row[0])
-    start_line = _as_int(row[1])
-    end_line = _as_int(row[2])
-    if qualname is None or start_line is None or end_line is None:
-        return None
-    return qualname, start_line, end_line
-
-
-def _decode_wire_qualname_span_size(
-    row: list[object],
-) -> tuple[str, int, int, int] | None:
-    qualname_span = _decode_wire_qualname_span(row)
-    if qualname_span is None:
-        return None
-    size = _as_int(row[3])
-    if size is None:
-        return None
-    qualname, start_line, end_line = qualname_span
-    return qualname, start_line, end_line, size
-
-
-def _as_analysis_profile(value: object) -> AnalysisProfile | None:
-    obj = _as_str_dict(value)
-    if obj is None:
-        return None
-
-    _REQUIRED = {
-        "min_loc",
-        "min_stmt",
-        "block_min_loc",
-        "block_min_stmt",
-        "segment_min_loc",
-        "segment_min_stmt",
-    }
-    if set(obj.keys()) < _REQUIRED:
-        return None
-
-    min_loc = _as_int(obj.get("min_loc"))
-    min_stmt = _as_int(obj.get("min_stmt"))
-    block_min_loc = _as_int(obj.get("block_min_loc"))
-    block_min_stmt = _as_int(obj.get("block_min_stmt"))
-    segment_min_loc = _as_int(obj.get("segment_min_loc"))
-    segment_min_stmt = _as_int(obj.get("segment_min_stmt"))
-    collect_api_surface_raw = obj.get("collect_api_surface", False)
-    collect_api_surface = (
-        collect_api_surface_raw if isinstance(collect_api_surface_raw, bool) else None
-    )
-    if (
-        min_loc is None
-        or min_stmt is None
-        or block_min_loc is None
-        or block_min_stmt is None
-        or segment_min_loc is None
-        or segment_min_stmt is None
-        or collect_api_surface is None
-    ):
-        return None
-
-    return AnalysisProfile(
-        min_loc=min_loc,
-        min_stmt=min_stmt,
-        block_min_loc=block_min_loc,
-        block_min_stmt=block_min_stmt,
-        segment_min_loc=segment_min_loc,
-        segment_min_stmt=segment_min_stmt,
-        collect_api_surface=collect_api_surface,
-    )
-
-
-def _decode_wire_stat(obj: dict[str, object]) -> FileStat | None:
-    stat_list = _as_list(obj.get("st"))
-    if stat_list is None or len(stat_list) != 2:
-        return None
-    mtime_ns = _as_int(stat_list[0])
-    size = _as_int(stat_list[1])
-    if mtime_ns is None or size is None:
-        return None
-    return FileStat(mtime_ns=mtime_ns, size=size)
-
-
-def _decode_optional_wire_source_stats(
-    *,
-    obj: dict[str, object],
-) -> SourceStatsDict | None:
-    row = _decode_optional_wire_row(obj=obj, key="ss", expected_len=4)
-    if row is None:
-        return None
-    counts = _decode_wire_int_fields(row, 0, 1, 2, 3)
-    if counts is None:
-        return None
-    lines, functions, methods, classes = counts
-    if any(value < 0 for value in counts):
-        return None
-    return SourceStatsDict(
-        lines=lines,
-        functions=functions,
-        methods=methods,
-        classes=classes,
-    )
-
-
-def _decode_optional_wire_items(
-    *,
-    obj: dict[str, object],
-    key: str,
-    decode_item: Callable[[object], _DecodedItemT | None],
-) -> list[_DecodedItemT] | None:
-    raw_items = obj.get(key)
-    if raw_items is None:
-        return []
-    wire_items = _as_list(raw_items)
-    if wire_items is None:
-        return None
-    decoded_items: list[_DecodedItemT] = []
-    for wire_item in wire_items:
-        decoded = decode_item(wire_item)
-        if decoded is None:
-            return None
-        decoded_items.append(decoded)
-    return decoded_items
-
-
-def _decode_optional_wire_items_for_filepath(
-    *,
-    obj: dict[str, object],
-    key: str,
-    filepath: str,
-    decode_item: Callable[[object, str], _DecodedItemT | None],
-) -> list[_DecodedItemT] | None:
-    raw_items = obj.get(key)
-    if raw_items is None:
-        return []
-    wire_items = _as_list(raw_items)
-    if wire_items is None:
-        return None
-    decoded_items: list[_DecodedItemT] = []
-    for wire_item in wire_items:
-        decoded = decode_item(wire_item, filepath)
-        if decoded is None:
-            return None
-        decoded_items.append(decoded)
-    return decoded_items
-
-
-def _decode_optional_wire_row(
-    *,
-    obj: dict[str, object],
-    key: str,
-    expected_len: int,
-) -> list[object] | None:
-    raw = obj.get(key)
-    if raw is None:
-        return None
-    row = _as_list(raw)
-    if row is None or len(row) != expected_len:
-        return None
-    return row
-
-
-def _decode_optional_wire_names(
-    *,
-    obj: dict[str, object],
-    key: str,
-) -> list[str] | None:
-    raw_names = obj.get(key)
-    if raw_names is None:
-        return []
-    names = _as_list(raw_names)
-    if names is None or not all(isinstance(name, str) for name in names):
-        return None
-    return [str(name) for name in names]
-
-
-def _decode_optional_wire_coupled_classes(
-    *,
-    obj: dict[str, object],
-    key: str,
-) -> dict[str, list[str]] | None:
-    raw = obj.get(key)
-    if raw is None:
-        return {}
-
-    rows = _as_list(raw)
-    if rows is None:
-        return None
-
-    decoded: dict[str, list[str]] = {}
-    for wire_row in rows:
-        row = _as_list(wire_row)
-        if row is None or len(row) != 2:
-            return None
-        qualname = _as_str(row[0])
-        names = _as_list(row[1])
-        if qualname is None or names is None:
-            return None
-        if not all(isinstance(name, str) for name in names):
-            return None
-        decoded[qualname] = sorted({str(name) for name in names if str(name)})
-
-    return decoded
-
-
-def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None:
-    obj = _as_str_dict(value)
-    if obj is None:
-        return None
-
-    stat = _decode_wire_stat(obj)
-    if stat is None:
-        return None
-    source_stats = _decode_optional_wire_source_stats(obj=obj)
-    file_sections = _decode_wire_file_sections(obj=obj, filepath=filepath)
-    if file_sections is None:
-        return None
-    (
-        units,
-        blocks,
-        segments,
-        class_metrics,
-        module_deps,
-        dead_candidates,
-    ) = file_sections
-    name_sections = _decode_wire_name_sections(obj=obj)
-    if name_sections is None:
-        return None
-    (
-        referenced_names,
-        referenced_qualnames,
-        import_names,
-        class_names,
-    ) = name_sections
-    typing_coverage = _decode_optional_wire_typing_coverage(obj=obj, filepath=filepath)
-    docstring_coverage = _decode_optional_wire_docstring_coverage(
-        obj=obj,
-        filepath=filepath,
-    )
-    api_surface = _decode_optional_wire_api_surface(obj=obj, filepath=filepath)
-    coupled_classes_map = _decode_optional_wire_coupled_classes(obj=obj, key="cc")
-    if coupled_classes_map is None:
-        return None
-
-    for metric in class_metrics:
-        names = coupled_classes_map.get(metric["qualname"], [])
-        if names:
-            metric["coupled_classes"] = names
-
-    has_structural_findings = "sf" in obj
-    structural_findings = _decode_wire_structural_findings_optional(obj)
-    if structural_findings is None:
-        return None
-
-    return _attach_optional_cache_sections(
-        CacheEntry(
-            stat=stat,
-            units=units,
-            blocks=blocks,
-            segments=segments,
-            class_metrics=class_metrics,
-            module_deps=module_deps,
-            dead_candidates=dead_candidates,
-            referenced_names=referenced_names,
-            referenced_qualnames=referenced_qualnames,
-            import_names=import_names,
-            class_names=class_names,
-        ),
-        typing_coverage=typing_coverage,
-        docstring_coverage=docstring_coverage,
-        api_surface=api_surface,
-        source_stats=source_stats,
-        structural_findings=(
-            _normalize_cached_structural_groups(structural_findings, filepath=filepath)
-            if has_structural_findings
-            else None
-        ),
-    )
-
-
-def _decode_wire_file_sections(
-    *,
-    obj: dict[str, object],
-    filepath: str,
-) -> (
-    tuple[
-        list[UnitDict],
-        list[BlockDict],
-        list[SegmentDict],
-        list[ClassMetricsDict],
-        list[ModuleDepDict],
-        list[DeadCandidateDict],
-    ]
-    | None
-):
-    units = _decode_optional_wire_items_for_filepath(
-        obj=obj,
-        key="u",
-        filepath=filepath,
-        decode_item=_decode_wire_unit,
-    )
-    blocks = _decode_optional_wire_items_for_filepath(
-        obj=obj,
-        key="b",
-        filepath=filepath,
-        decode_item=_decode_wire_block,
-    )
-    segments = _decode_optional_wire_items_for_filepath(
-        obj=obj,
-        key="s",
-        filepath=filepath,
-        decode_item=_decode_wire_segment,
-    )
-    class_metrics = _decode_optional_wire_items_for_filepath(
-        obj=obj,
-        key="cm",
-        filepath=filepath,
-        decode_item=_decode_wire_class_metric,
-    )
-    module_deps = _decode_optional_wire_items(
-        obj=obj,
-        key="md",
-        decode_item=_decode_wire_module_dep,
-    )
-    dead_candidates = _decode_optional_wire_items_for_filepath(
-        obj=obj,
-        key="dc",
-        filepath=filepath,
-        decode_item=_decode_wire_dead_candidate,
-    )
-    if (
-        units is None
-        or blocks is None
-        or segments is None
-        or class_metrics is None
-        or module_deps is None
-        or dead_candidates is None
-    ):
-        return None
-    return (
-        units,
-        blocks,
-        segments,
-        class_metrics,
-        module_deps,
-        dead_candidates,
-    )
-
-
-def _decode_wire_name_sections(
-    *,
-    obj: dict[str, object],
-) -> tuple[list[str], list[str], list[str], list[str]] | None:
-    referenced_names = _decode_optional_wire_names(obj=obj, key="rn")
-    referenced_qualnames = _decode_optional_wire_names(obj=obj, key="rq")
-    import_names = _decode_optional_wire_names(obj=obj, key="in")
-    class_names = _decode_optional_wire_names(obj=obj, key="cn")
-    if (
-        referenced_names is None
-        or referenced_qualnames is None
-        or import_names is None
-        or class_names is None
-    ):
-        return None
-    return (
-        referenced_names,
-        referenced_qualnames,
-        import_names,
-        class_names,
-    )
-
-
-def _decode_optional_wire_typing_coverage(
-    *,
-    obj: dict[str, object],
-    filepath: str,
-) -> ModuleTypingCoverageDict | None:
-    module_and_ints = _decode_optional_wire_module_ints(
-        obj=obj,
-        key="tc",
-        expected_len=7,
-        int_indexes=(1, 2, 3, 4, 5, 6),
-    )
-    if module_and_ints is None:
-        return None
-    module, ints = module_and_ints
-    (
-        callable_count,
-        params_total,
-        params_annotated,
-        returns_total,
-        returns_annotated,
-        any_annotation_count,
-    ) = ints
-    return ModuleTypingCoverageDict(
-        module=module,
-        filepath=filepath,
-        callable_count=callable_count,
-        params_total=params_total,
-        params_annotated=params_annotated,
-        returns_total=returns_total,
-        returns_annotated=returns_annotated,
-        any_annotation_count=any_annotation_count,
-    )
-
-
-def _decode_optional_wire_docstring_coverage(
-    *,
-    obj: dict[str, object],
-    filepath: str,
-) -> ModuleDocstringCoverageDict | None:
-    module_and_counts = _decode_optional_wire_module_ints(
-        obj=obj,
-        key="dg",
-        expected_len=3,
-        int_indexes=(1, 2),
-    )
-    if module_and_counts is None:
-        return None
-    module, counts = module_and_counts
-    public_symbol_total, public_symbol_documented = counts
-    return ModuleDocstringCoverageDict(
-        module=module,
-        filepath=filepath,
-        public_symbol_total=public_symbol_total,
-        public_symbol_documented=public_symbol_documented,
-    )
-
-
-def _decode_optional_wire_api_surface(
-    *,
-    obj: dict[str, object],
-    filepath: str,
-) -> ModuleApiSurfaceDict | None:
-    row = _decode_optional_wire_row(obj=obj, key="as", expected_len=3)
-    if row is None:
-        return None
-    module = _as_str(row[0])
-    all_declared = _decode_optional_wire_names(obj={"ad": row[1]}, key="ad")
-    symbols_raw = _as_list(row[2])
-    if module is None or all_declared is None or symbols_raw is None:
-        return None
-    symbols: list[PublicSymbolDict] = []
-    for symbol_raw in symbols_raw:
-        decoded_symbol = _decode_wire_api_surface_symbol(symbol_raw)
-        if decoded_symbol is None:
-            return None
-        symbols.append(decoded_symbol)
-    return ModuleApiSurfaceDict(
-        module=module,
-        filepath=filepath,
-        all_declared=sorted(set(all_declared)),
-        symbols=symbols,
-    )
-
-
-def _decode_optional_wire_module_ints(
-    *,
-    obj: dict[str, object],
-    key: str,
-    expected_len: int,
-    int_indexes: tuple[int, ...],
-) -> tuple[str, tuple[int, ...]] | None:
-    row = _decode_optional_wire_row(obj=obj, key=key, expected_len=expected_len)
-    if row is None:
-        return None
-    module = _as_str(row[0])
-    ints = _decode_wire_int_fields(row, *int_indexes)
-    if module is None or ints is None:
-        return None
-    return module, ints
-
-
-def _decode_wire_api_surface_symbol(
-    value: object,
-) -> PublicSymbolDict | None:
-    symbol_row = _decode_wire_row(value, valid_lengths={7})
-    if symbol_row is None:
-        return None
-    str_fields = _decode_wire_str_fields(symbol_row, 0, 1, 4, 5)
-    int_fields = _decode_wire_int_fields(symbol_row, 2, 3)
-    params_raw = _as_list(symbol_row[6])
-    if str_fields is None or int_fields is None or params_raw is None:
-        return None
-    qualname, kind, exported_via, returns_hash = str_fields
-    start_line, end_line = int_fields
-    params: list[ApiParamSpecDict] = []
-    for param_raw in params_raw:
-        decoded_param = _decode_wire_api_param_spec(param_raw)
-        if decoded_param is None:
-            return None
-        params.append(decoded_param)
-    return PublicSymbolDict(
-        qualname=qualname,
-        kind=kind,
-        start_line=start_line,
-        end_line=end_line,
-        params=params,
-        returns_hash=returns_hash,
-        exported_via=exported_via,
-    )
-
-
-def _decode_wire_api_param_spec(
-    value: object,
-) -> ApiParamSpecDict | None:
-    param_row = _decode_wire_row(value, valid_lengths={4})
-    if param_row is None:
-        return None
-    str_fields = _decode_wire_str_fields(param_row, 0, 1, 3)
-    int_fields = _decode_wire_int_fields(param_row, 2)
-    if str_fields is None or int_fields is None:
-        return None
-    name, param_kind, annotation_hash = str_fields
-    (has_default_raw,) = int_fields
-    return ApiParamSpecDict(
-        name=name,
-        kind=param_kind,
-        has_default=bool(has_default_raw),
-        annotation_hash=annotation_hash,
-    )
-
-
-def _decode_wire_structural_findings_optional(
-    obj: dict[str, object],
-) -> list[StructuralFindingGroupDict] | None:
-    """Decode optional 'sf' wire key. Returns [] if absent, None on invalid format."""
-    raw = obj.get("sf")
-    if raw is None:
-        return []
-    groups_raw = _as_list(raw)
-    if groups_raw is None:
-        return None
-    groups: list[StructuralFindingGroupDict] = []
-    for group_raw in groups_raw:
-        group = _decode_wire_structural_group(group_raw)
-        if group is None:
-            return None
-        groups.append(group)
-    return groups
-
-
-def _decode_wire_row(
-    value: object,
-    *,
-    valid_lengths: Collection[int],
-) -> list[object] | None:
-    row = _as_list(value)
-    if row is None or len(row) not in valid_lengths:
-        return None
-    return row
-
-
-def _decode_wire_named_span(
-    value: object,
-    *,
-    valid_lengths: Collection[int],
-) -> tuple[list[object], str, int, int] | None:
-    row = _decode_wire_row(value, valid_lengths=valid_lengths)
-    if row is None:
-        return None
-    span = _decode_wire_qualname_span(row)
-    if span is None:
-        return None
-    qualname, start_line, end_line = span
-    return row, qualname, start_line, end_line
-
-
-def _decode_wire_named_sized_span(
-    value: object,
-    *,
-    valid_lengths: Collection[int],
-) -> tuple[list[object], str, int, int, int] | None:
-    row = _decode_wire_row(value, valid_lengths=valid_lengths)
-    if row is None:
-        return None
-    span = _decode_wire_qualname_span_size(row)
-    if span is None:
-        return None
-    qualname, start_line, end_line, size = span
-    return row, qualname, start_line, end_line, size
-
-
-def _decode_wire_int_fields(
-    row: list[object],
-    *indexes: int,
-) -> tuple[int, ...] | None:
-    values: list[int] = []
-    for index in indexes:
-        value = _as_int(row[index])
-        if value is None:
-            return None
-        values.append(value)
-    return tuple(values)
-
-
-def _decode_wire_str_fields(
-    row: list[object],
-    *indexes: int,
-) -> tuple[str, ...] | None:
-    values: list[str] = []
-    for index in indexes:
-        value = _as_str(row[index])
-        if value is None:
-            return None
-        values.append(value)
-    return tuple(values)
-
-
-def _decode_wire_unit_core_fields(
-    row: list[object],
-) -> tuple[int, int, str, str, int, int, Literal["low", "medium", "high"], str] | None:
-    int_fields = _decode_wire_int_fields(row, 3, 4, 7, 8)
-    str_fields = _decode_wire_str_fields(row, 5, 6, 10)
-    risk = _as_risk_literal(row[9])
-    if int_fields is None or str_fields is None or risk is None:
-        return None
-    loc, stmt_count, cyclomatic_complexity, nesting_depth = int_fields
-    fingerprint, loc_bucket, raw_hash = str_fields
-    return (
-        loc,
-        stmt_count,
-        fingerprint,
-        loc_bucket,
-        cyclomatic_complexity,
-        nesting_depth,
-        risk,
-        raw_hash,
-    )
-
-
-def _decode_wire_unit_flow_profiles(
-    row: list[object],
-) -> tuple[int, str, bool, str, str, str] | None:
-    if len(row) != 17:
-        return _DEFAULT_WIRE_UNIT_FLOW_PROFILES
-
-    parsed_entry_guard_count = _as_int(row[11])
-    parsed_entry_guard_terminal_profile = _as_str(row[12])
-    parsed_entry_guard_has_side_effect_before = _as_int(row[13])
-    parsed_terminal_kind = _as_str(row[14])
-    parsed_try_finally_profile = _as_str(row[15])
-    parsed_side_effect_order_profile = _as_str(row[16])
-    if (
-        parsed_entry_guard_count is None
-        or parsed_entry_guard_terminal_profile is None
-        or parsed_entry_guard_has_side_effect_before is None
-        or parsed_terminal_kind is None
-        or parsed_try_finally_profile is None
-        or parsed_side_effect_order_profile is None
-    ):
-        return None
-    return (
-        max(0, parsed_entry_guard_count),
-        parsed_entry_guard_terminal_profile or "none",
-        parsed_entry_guard_has_side_effect_before != 0,
-        parsed_terminal_kind or "fallthrough",
-        parsed_try_finally_profile or "none",
-        parsed_side_effect_order_profile or "none",
-    )
-
-
-def _decode_wire_class_metric_fields(
-    row: list[object],
-) -> tuple[int, int, int, int, str, str] | None:
-    int_fields = _decode_wire_int_fields(row, 3, 4, 5, 6)
-    str_fields = _decode_wire_str_fields(row, 7, 8)
-    if int_fields is None or str_fields is None:
-        return None
-    cbo, lcom4, method_count, instance_var_count = int_fields
-    risk_coupling, risk_cohesion = str_fields
-    return (
-        cbo,
-        lcom4,
-        method_count,
-        instance_var_count,
-        risk_coupling,
-        risk_cohesion,
-    )
-
-
-def _decode_wire_structural_group(value: object) -> StructuralFindingGroupDict | None:
-    group_row = _decode_wire_row(value, valid_lengths={4})
-    if group_row is None:
-        return None
-    str_fields = _decode_wire_str_fields(group_row, 0, 1)
-    items_raw = _as_list(group_row[3])
-    signature = _decode_wire_structural_signature(group_row[2])
-    if str_fields is None or items_raw is None or signature is None:
-        return None
-    finding_kind, finding_key = str_fields
-    items: list[StructuralFindingOccurrenceDict] = []
-    for item_raw in items_raw:
-        item = _decode_wire_structural_occurrence(item_raw)
-        if item is None:
-            return None
-        items.append(item)
-    return StructuralFindingGroupDict(
-        finding_kind=finding_kind,
-        finding_key=finding_key,
-        signature=signature,
-        items=items,
-    )
-
-
-def _decode_wire_structural_signature(value: object) -> dict[str, str] | None:
-    sig_raw = _as_list(value)
-    if sig_raw is None:
-        return None
-    signature: dict[str, str] = {}
for pair in sig_raw: - pair_list = _as_list(pair) - if pair_list is None or len(pair_list) != 2: - return None - key = _as_str(pair_list[0]) - val = _as_str(pair_list[1]) - if key is None or val is None: - return None - signature[key] = val - return signature - - -def _decode_wire_structural_occurrence( - value: object, -) -> StructuralFindingOccurrenceDict | None: - item_list = _as_list(value) - if item_list is None or len(item_list) != 3: - return None - qualname = _as_str(item_list[0]) - start = _as_int(item_list[1]) - end = _as_int(item_list[2]) - if qualname is None or start is None or end is None: - return None - return StructuralFindingOccurrenceDict( - qualname=qualname, - start=start, - end=end, - ) - - -def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: - decoded = _decode_wire_named_span(value, valid_lengths={11, 17}) - if decoded is None: - return None - row, qualname, start_line, end_line = decoded - core_fields = _decode_wire_unit_core_fields(row) - flow_profiles = _decode_wire_unit_flow_profiles(row) - if core_fields is None or flow_profiles is None: - return None - ( - loc, - stmt_count, - fingerprint, - loc_bucket, - cyclomatic_complexity, - nesting_depth, - risk, - raw_hash, - ) = core_fields - ( - entry_guard_count, - entry_guard_terminal_profile, - entry_guard_has_side_effect_before, - terminal_kind, - try_finally_profile, - side_effect_order_profile, - ) = flow_profiles - return FunctionGroupItem( - qualname=qualname, - filepath=filepath, - start_line=start_line, - end_line=end_line, - loc=loc, - stmt_count=stmt_count, - fingerprint=fingerprint, - loc_bucket=loc_bucket, - cyclomatic_complexity=cyclomatic_complexity, - nesting_depth=nesting_depth, - risk=risk, - raw_hash=raw_hash, - entry_guard_count=entry_guard_count, - entry_guard_terminal_profile=entry_guard_terminal_profile, - entry_guard_has_side_effect_before=entry_guard_has_side_effect_before, - terminal_kind=terminal_kind, - try_finally_profile=try_finally_profile, - side_effect_order_profile=side_effect_order_profile, - ) - - -def _decode_wire_block(value: object, filepath: str) -> BlockDict | None: - decoded = _decode_wire_named_sized_span(value, valid_lengths={5}) - if decoded is None: - return None - row, qualname, start_line, end_line, size = decoded - block_hash = _as_str(row[4]) - if block_hash is None: - return None - - return BlockGroupItem( - block_hash=block_hash, - filepath=filepath, - qualname=qualname, - start_line=start_line, - end_line=end_line, - size=size, - ) - - -def _decode_wire_segment(value: object, filepath: str) -> SegmentDict | None: - decoded = _decode_wire_named_sized_span(value, valid_lengths={6}) - if decoded is None: - return None - row, qualname, start_line, end_line, size = decoded - segment_hash = _as_str(row[4]) - segment_sig = _as_str(row[5]) - if segment_hash is None or segment_sig is None: - return None - - return SegmentGroupItem( - segment_hash=segment_hash, - segment_sig=segment_sig, - filepath=filepath, - qualname=qualname, - start_line=start_line, - end_line=end_line, - size=size, - ) - - -def _decode_wire_class_metric( - value: object, - filepath: str, -) -> ClassMetricsDict | None: - decoded = _decode_wire_named_span(value, valid_lengths={9}) - if decoded is None: - return None - row, qualname, start_line, end_line = decoded - metric_fields = _decode_wire_class_metric_fields(row) - if metric_fields is None: - return None - cbo, lcom4, method_count, instance_var_count, risk_coupling, risk_cohesion = ( - metric_fields - ) - return ClassMetricsDict( 
- qualname=qualname, - filepath=filepath, - start_line=start_line, - end_line=end_line, - cbo=cbo, - lcom4=lcom4, - method_count=method_count, - instance_var_count=instance_var_count, - risk_coupling=risk_coupling, - risk_cohesion=risk_cohesion, - ) - - -def _decode_wire_module_dep(value: object) -> ModuleDepDict | None: - row = _as_list(value) - if row is None or len(row) != 4: - return None - source = _as_str(row[0]) - target = _as_str(row[1]) - import_type = _as_str(row[2]) - line = _as_int(row[3]) - if source is None or target is None or import_type is None or line is None: - return None - return ModuleDepDict( - source=source, - target=target, - import_type=import_type, - line=line, - ) - - -def _decode_wire_dead_candidate( - value: object, - filepath: str, -) -> DeadCandidateDict | None: - row = _decode_wire_row(value, valid_lengths={5, 6}) - if row is None: - return None - str_fields = _decode_wire_str_fields(row, 0, 1, 4) - int_fields = _decode_wire_int_fields(row, 2, 3) - suppressed_rules: list[str] | None = [] - if len(row) == 6: - raw_rules = _as_list(row[5]) - if raw_rules is None or not all(isinstance(rule, str) for rule in raw_rules): - return None - suppressed_rules = sorted({str(rule) for rule in raw_rules if str(rule)}) - if str_fields is None or int_fields is None: - return None - qualname, local_name, kind = str_fields - start_line, end_line = int_fields - decoded = DeadCandidateDict( - qualname=qualname, - local_name=local_name, - filepath=filepath, - start_line=start_line, - end_line=end_line, - kind=kind, - ) - if suppressed_rules: - decoded["suppressed_rules"] = suppressed_rules - return decoded - - -def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: - wire: dict[str, object] = { - "st": [entry["stat"]["mtime_ns"], entry["stat"]["size"]], - } - source_stats = entry.get("source_stats") - if source_stats is not None: - wire["ss"] = [ - source_stats["lines"], - source_stats["functions"], - source_stats["methods"], - source_stats["classes"], - ] - - units = sorted( - entry["units"], - key=lambda unit: ( - unit["qualname"], - unit["start_line"], - unit["end_line"], - unit["fingerprint"], - ), - ) - if units: - wire["u"] = [ - [ - unit["qualname"], - unit["start_line"], - unit["end_line"], - unit["loc"], - unit["stmt_count"], - unit["fingerprint"], - unit["loc_bucket"], - unit.get("cyclomatic_complexity", 1), - unit.get("nesting_depth", 0), - unit.get("risk", "low"), - unit.get("raw_hash", ""), - unit.get("entry_guard_count", 0), - unit.get("entry_guard_terminal_profile", "none"), - 1 if unit.get("entry_guard_has_side_effect_before", False) else 0, - unit.get("terminal_kind", "fallthrough"), - unit.get("try_finally_profile", "none"), - unit.get("side_effect_order_profile", "none"), - ] - for unit in units - ] - - blocks = sorted( - entry["blocks"], - key=lambda block: ( - block["qualname"], - block["start_line"], - block["end_line"], - block["block_hash"], - ), - ) - if blocks: - wire["b"] = [ - [ - block["qualname"], - block["start_line"], - block["end_line"], - block["size"], - block["block_hash"], - ] - for block in blocks - ] - - segments = sorted( - entry["segments"], - key=lambda segment: ( - segment["qualname"], - segment["start_line"], - segment["end_line"], - segment["segment_hash"], - ), - ) - if segments: - wire["s"] = [ - [ - segment["qualname"], - segment["start_line"], - segment["end_line"], - segment["size"], - segment["segment_hash"], - segment["segment_sig"], - ] - for segment in segments - ] - - class_metrics = sorted( - 
entry["class_metrics"], - key=lambda metric: ( - metric["start_line"], - metric["end_line"], - metric["qualname"], - ), - ) - if class_metrics: - coupled_classes_rows: list[list[object]] = [] - - def _append_coupled_classes_row(metric: ClassMetricsDict) -> None: - coupled_classes = _normalized_optional_string_list( - metric.get("coupled_classes", []) - ) - if coupled_classes: - coupled_classes_rows.append([metric["qualname"], coupled_classes]) - - wire["cm"] = [ - [ - metric["qualname"], - metric["start_line"], - metric["end_line"], - metric["cbo"], - metric["lcom4"], - metric["method_count"], - metric["instance_var_count"], - metric["risk_coupling"], - metric["risk_cohesion"], - ] - for metric in class_metrics - ] - for metric in class_metrics: - _append_coupled_classes_row(metric) - if coupled_classes_rows: - wire["cc"] = coupled_classes_rows - - module_deps = sorted( - entry["module_deps"], - key=lambda dep: (dep["source"], dep["target"], dep["import_type"], dep["line"]), - ) - if module_deps: - wire["md"] = [ - [ - dep["source"], - dep["target"], - dep["import_type"], - dep["line"], - ] - for dep in module_deps - ] - - dead_candidates = sorted( - entry["dead_candidates"], - key=lambda candidate: ( - candidate["start_line"], - candidate["end_line"], - candidate["qualname"], - candidate["local_name"], - candidate["kind"], - ), - ) - if dead_candidates: - # Dead candidates are stored inside a per-file cache entry, so the - # filepath is implicit and does not need to be repeated in every row. - encoded_dead_candidates: list[list[object]] = [] - for candidate in dead_candidates: - encoded = [ - candidate["qualname"], - candidate["local_name"], - candidate["start_line"], - candidate["end_line"], - candidate["kind"], - ] - suppressed_rules = candidate.get("suppressed_rules", []) - normalized_rules = _normalized_optional_string_list(suppressed_rules) - if normalized_rules: - encoded.append(normalized_rules) - encoded_dead_candidates.append(encoded) - wire["dc"] = encoded_dead_candidates - - if entry["referenced_names"]: - wire["rn"] = sorted(set(entry["referenced_names"])) - if entry.get("referenced_qualnames"): - wire["rq"] = sorted(set(entry["referenced_qualnames"])) - if entry["import_names"]: - wire["in"] = sorted(set(entry["import_names"])) - if entry["class_names"]: - wire["cn"] = sorted(set(entry["class_names"])) - typing_coverage = entry.get("typing_coverage") - if typing_coverage is not None: - wire["tc"] = [ - typing_coverage["module"], - typing_coverage["callable_count"], - typing_coverage["params_total"], - typing_coverage["params_annotated"], - typing_coverage["returns_total"], - typing_coverage["returns_annotated"], - typing_coverage["any_annotation_count"], - ] - docstring_coverage = entry.get("docstring_coverage") - if docstring_coverage is not None: - wire["dg"] = [ - docstring_coverage["module"], - docstring_coverage["public_symbol_total"], - docstring_coverage["public_symbol_documented"], - ] - api_surface = entry.get("api_surface") - if api_surface is not None: - wire["as"] = [ - api_surface["module"], - sorted(set(api_surface.get("all_declared", []))), - [ - [ - symbol["qualname"], - symbol["kind"], - symbol["start_line"], - symbol["end_line"], - symbol.get("exported_via", "name"), - symbol.get("returns_hash", ""), - [ - [ - param["name"], - param["kind"], - 1 if param["has_default"] else 0, - param.get("annotation_hash", ""), - ] - for param in symbol.get("params", []) - ], - ] - for symbol in api_surface["symbols"] - ], - ] - - if "structural_findings" in entry: - sf = 
entry.get("structural_findings", []) - wire["sf"] = [ - [ - group["finding_kind"], - group["finding_key"], - sorted(group["signature"].items()), - [ - [item["qualname"], item["start"], item["end"]] - for item in group["items"] - ], - ] - for group in sf - ] - - return wire - - -def _resolve_root(root: str | Path | None) -> Path | None: - if root is None: - return None - try: - return Path(root).resolve(strict=False) - except OSError: - return None - - -def _is_file_stat_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - return isinstance(value.get("mtime_ns"), int) and isinstance(value.get("size"), int) - - -def _is_source_stats_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - lines = value.get("lines") - functions = value.get("functions") - methods = value.get("methods") - classes = value.get("classes") - return ( - isinstance(lines, int) - and lines >= 0 - and isinstance(functions, int) - and functions >= 0 - and isinstance(methods, int) - and methods >= 0 - and isinstance(classes, int) - and classes >= 0 - ) - - -def _is_unit_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - string_keys = ("qualname", "filepath", "fingerprint", "loc_bucket") - int_keys = ("start_line", "end_line", "loc", "stmt_count") - if not _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys): - return False - cyclomatic_complexity = value.get("cyclomatic_complexity", 1) - nesting_depth = value.get("nesting_depth", 0) - risk = value.get("risk", "low") - raw_hash = value.get("raw_hash", "") - return ( - isinstance(cyclomatic_complexity, int) - and isinstance(nesting_depth, int) - and isinstance(risk, str) - and risk in {"low", "medium", "high"} - and isinstance(raw_hash, str) - and isinstance(value.get("entry_guard_count", 0), int) - and isinstance(value.get("entry_guard_terminal_profile", "none"), str) - and isinstance(value.get("entry_guard_has_side_effect_before", False), bool) - and isinstance(value.get("terminal_kind", "fallthrough"), str) - and isinstance(value.get("try_finally_profile", "none"), str) - and isinstance(value.get("side_effect_order_profile", "none"), str) - ) - - -def _is_block_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - string_keys = ("block_hash", "filepath", "qualname") - int_keys = ("start_line", "end_line", "size") - return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) - - -def _is_segment_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - string_keys = ("segment_hash", "segment_sig", "filepath", "qualname") - int_keys = ("start_line", "end_line", "size") - return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) - - -def _is_module_typing_coverage_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - string_keys = ("module", "filepath") - int_keys = ( - "callable_count", - "params_total", - "params_annotated", - "returns_total", - "returns_annotated", - "any_annotation_count", - ) - return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) - - -def _is_module_docstring_coverage_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - string_keys = ("module", "filepath") - int_keys = ("public_symbol_total", "public_symbol_documented") - return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) - - -def _is_api_param_spec_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - 
return ( - isinstance(value.get("name"), str) - and isinstance(value.get("kind"), str) - and isinstance(value.get("has_default"), bool) - and isinstance(value.get("annotation_hash", ""), str) - ) - - -def _is_public_symbol_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - if not _has_typed_fields( - value, - string_keys=("qualname", "kind", "exported_via"), - int_keys=("start_line", "end_line"), - ): - return False - params = value.get("params", []) - return ( - isinstance(value.get("returns_hash", ""), str) - and isinstance( - params, - list, - ) - and all(_is_api_param_spec_dict(item) for item in params) - ) - - -def _is_module_api_surface_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - all_declared = value.get("all_declared", []) - symbols = value.get("symbols", []) - return ( - isinstance(value.get("module"), str) - and isinstance(value.get("filepath"), str) - and _is_string_list(all_declared) - and isinstance(symbols, list) - and all(_is_public_symbol_dict(item) for item in symbols) - ) - - -def _is_class_metrics_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - if not _has_typed_fields( - value, - string_keys=( - "qualname", - "filepath", - "risk_coupling", - "risk_cohesion", - ), - int_keys=( - "start_line", - "end_line", - "cbo", - "lcom4", - "method_count", - "instance_var_count", - ), - ): - return False - - coupled_classes = value.get("coupled_classes") - if coupled_classes is None: - return True - return _is_string_list(coupled_classes) - - -def _is_module_dep_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - return _has_typed_fields( - value, - string_keys=("source", "target", "import_type"), - int_keys=("line",), - ) - - -def _is_dead_candidate_dict(value: object) -> bool: - if not isinstance(value, dict): - return False - if not _has_typed_fields( - value, - string_keys=("qualname", "local_name", "filepath", "kind"), - int_keys=("start_line", "end_line"), - ): - return False - suppressed_rules = value.get("suppressed_rules") - if suppressed_rules is None: - return True - return _is_string_list(suppressed_rules) - - -def _is_string_list(value: object) -> bool: - return isinstance(value, list) and all(isinstance(item, str) for item in value) - - -def _has_typed_fields( - value: Mapping[str, object], - *, - string_keys: Sequence[str], - int_keys: Sequence[str], -) -> bool: - return all(isinstance(value.get(key), str) for key in string_keys) and all( - isinstance(value.get(key), int) for key in int_keys - ) diff --git a/codeclone/_html_report/_sections/__init__.py b/codeclone/cache/__init__.py similarity index 100% rename from codeclone/_html_report/_sections/__init__.py rename to codeclone/cache/__init__.py diff --git a/codeclone/cache/_canonicalize.py b/codeclone/cache/_canonicalize.py new file mode 100644 index 0000000..b3d903e --- /dev/null +++ b/codeclone/cache/_canonicalize.py @@ -0,0 +1,457 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Callable, Mapping +from typing import TypeGuard, TypeVar + +from ._validators import ( + _is_block_dict, + _is_class_metrics_dict, + _is_dead_candidate_dict, + _is_file_stat_dict, + _is_module_api_surface_dict, + _is_module_dep_dict, + _is_module_docstring_coverage_dict, + _is_module_typing_coverage_dict, + _is_security_surface_dict, + _is_segment_dict, + _is_source_stats_dict, + _is_string_list, + _is_unit_dict, +) +from .entries import ( + ApiParamSpecDict, + BlockDict, + CacheEntry, + ClassMetricsDict, + DeadCandidateDict, + FileStat, + ModuleApiSurfaceDict, + ModuleDepDict, + ModuleDocstringCoverageDict, + ModuleTypingCoverageDict, + PublicSymbolDict, + SecuritySurfaceDict, + SegmentDict, + SourceStatsDict, + StructuralFindingGroupDict, + UnitDict, +) + +_ValidatedItemT = TypeVar("_ValidatedItemT") + + +def _is_str_item(value: object) -> TypeGuard[str]: + return isinstance(value, str) + + +def _as_file_stat_dict(value: object) -> FileStat | None: + if not _is_file_stat_dict(value): + return None + mtime_ns = value.get("mtime_ns") + size = value.get("size") + if not isinstance(mtime_ns, int) or not isinstance(size, int): + return None + return FileStat(mtime_ns=mtime_ns, size=size) + + +def _as_source_stats_dict(value: object) -> SourceStatsDict | None: + if not _is_source_stats_dict(value): + return None + return SourceStatsDict( + lines=value["lines"], + functions=value["functions"], + methods=value["methods"], + classes=value["classes"], + ) + + +def _as_typed_list( + value: object, + *, + predicate: Callable[[object], TypeGuard[_ValidatedItemT]], +) -> list[_ValidatedItemT] | None: + if not isinstance(value, list): + return None + items: list[_ValidatedItemT] = [] + for item in value: + if not predicate(item): + return None + items.append(item) + return items + + +def _as_typed_unit_list(value: object) -> list[UnitDict] | None: + return _as_typed_list(value, predicate=_is_unit_dict) + + +def _as_typed_block_list(value: object) -> list[BlockDict] | None: + return _as_typed_list(value, predicate=_is_block_dict) + + +def _as_typed_segment_list(value: object) -> list[SegmentDict] | None: + return _as_typed_list(value, predicate=_is_segment_dict) + + +def _as_typed_class_metrics_list(value: object) -> list[ClassMetricsDict] | None: + return _as_typed_list(value, predicate=_is_class_metrics_dict) + + +def _as_typed_dead_candidates_list( + value: object, +) -> list[DeadCandidateDict] | None: + return _as_typed_list(value, predicate=_is_dead_candidate_dict) + + +def _as_typed_module_deps_list(value: object) -> list[ModuleDepDict] | None: + return _as_typed_list(value, predicate=_is_module_dep_dict) + + +def _as_typed_security_surfaces_list(value: object) -> list[SecuritySurfaceDict] | None: + return _as_typed_list(value, predicate=_is_security_surface_dict) + + +def _as_typed_string_list(value: object) -> list[str] | None: + return _as_typed_list(value, predicate=_is_str_item) + + +def _as_module_typing_coverage_dict( + value: object, +) -> ModuleTypingCoverageDict | None: + if not _is_module_typing_coverage_dict(value): + return None + return value + + +def _as_module_docstring_coverage_dict( + value: object, +) -> ModuleDocstringCoverageDict | None: + if not _is_module_docstring_coverage_dict(value): + return None + return value + + +def _as_module_api_surface_dict(value: object) -> ModuleApiSurfaceDict | None: + if not 
_is_module_api_surface_dict(value): + return None + return value + + +def _normalized_optional_string_list(value: object) -> list[str] | None: + items = _as_typed_string_list(value) + if not items: + return None + return sorted(set(items)) + + +def _is_canonical_cache_entry(value: object) -> TypeGuard[CacheEntry]: + return isinstance(value, dict) and _has_cache_entry_container_shape(value) + + +def _has_cache_entry_container_shape(entry: Mapping[str, object]) -> bool: + required = {"stat", "units", "blocks", "segments"} + if not required.issubset(entry.keys()): + return False + if not isinstance(entry.get("stat"), dict): + return False + if not isinstance(entry.get("units"), list): + return False + if not isinstance(entry.get("blocks"), list): + return False + if not isinstance(entry.get("segments"), list): + return False + source_stats = entry.get("source_stats") + if source_stats is not None and not _is_source_stats_dict(source_stats): + return False + optional_list_keys = ( + "class_metrics", + "module_deps", + "dead_candidates", + "referenced_names", + "referenced_qualnames", + "import_names", + "class_names", + "security_surfaces", + "structural_findings", + ) + if not all(isinstance(entry.get(key, []), list) for key in optional_list_keys): + return False + typing_coverage = entry.get("typing_coverage") + if typing_coverage is not None and not _is_module_typing_coverage_dict( + typing_coverage + ): + return False + docstring_coverage = entry.get("docstring_coverage") + if docstring_coverage is not None and not _is_module_docstring_coverage_dict( + docstring_coverage + ): + return False + api_surface = entry.get("api_surface") + return api_surface is None or _is_module_api_surface_dict(api_surface) + + +def _decode_optional_cache_sections( + entry: Mapping[str, object], +) -> ( + tuple[ + list[ClassMetricsDict], + list[ModuleDepDict], + list[DeadCandidateDict], + list[str], + list[str], + list[str], + list[str], + list[SecuritySurfaceDict], + ModuleTypingCoverageDict | None, + ModuleDocstringCoverageDict | None, + ModuleApiSurfaceDict | None, + SourceStatsDict | None, + list[StructuralFindingGroupDict] | None, + ] + | None +): + class_metrics_raw = _as_typed_class_metrics_list(entry.get("class_metrics", [])) + module_deps_raw = _as_typed_module_deps_list(entry.get("module_deps", [])) + dead_candidates_raw = _as_typed_dead_candidates_list( + entry.get("dead_candidates", []) + ) + referenced_names_raw = _as_typed_string_list(entry.get("referenced_names", [])) + referenced_qualnames_raw = _as_typed_string_list( + entry.get("referenced_qualnames", []) + ) + import_names_raw = _as_typed_string_list(entry.get("import_names", [])) + class_names_raw = _as_typed_string_list(entry.get("class_names", [])) + security_surfaces_raw = _as_typed_security_surfaces_list( + entry.get("security_surfaces", []) + ) + if ( + class_metrics_raw is None + or module_deps_raw is None + or dead_candidates_raw is None + or referenced_names_raw is None + or referenced_qualnames_raw is None + or import_names_raw is None + or class_names_raw is None + or security_surfaces_raw is None + ): + return None + typing_coverage_raw = _as_module_typing_coverage_dict(entry.get("typing_coverage")) + docstring_coverage_raw = _as_module_docstring_coverage_dict( + entry.get("docstring_coverage") + ) + api_surface_raw = _as_module_api_surface_dict(entry.get("api_surface")) + source_stats = _as_source_stats_dict(entry.get("source_stats")) + structural_findings = entry.get("structural_findings") + typed_structural_findings = ( + 
structural_findings if isinstance(structural_findings, list) else None + ) + return ( + class_metrics_raw, + module_deps_raw, + dead_candidates_raw, + referenced_names_raw, + referenced_qualnames_raw, + import_names_raw, + class_names_raw, + security_surfaces_raw, + typing_coverage_raw, + docstring_coverage_raw, + api_surface_raw, + source_stats, + typed_structural_findings, + ) + + +def _attach_optional_cache_sections( + entry: CacheEntry, + *, + typing_coverage: ModuleTypingCoverageDict | None = None, + docstring_coverage: ModuleDocstringCoverageDict | None = None, + api_surface: ModuleApiSurfaceDict | None = None, + security_surfaces: list[SecuritySurfaceDict] | None = None, + source_stats: SourceStatsDict | None = None, + structural_findings: list[StructuralFindingGroupDict] | None = None, +) -> CacheEntry: + if typing_coverage is not None: + entry["typing_coverage"] = typing_coverage + if docstring_coverage is not None: + entry["docstring_coverage"] = docstring_coverage + if api_surface is not None: + entry["api_surface"] = api_surface + if security_surfaces is not None: + entry["security_surfaces"] = security_surfaces + if source_stats is not None: + entry["source_stats"] = source_stats + if structural_findings is not None: + entry["structural_findings"] = structural_findings + return entry + + +def _canonicalize_cache_entry(entry: CacheEntry) -> CacheEntry: + class_metrics_sorted = sorted( + entry["class_metrics"], + key=lambda item: ( + item["start_line"], + item["end_line"], + item["qualname"], + ), + ) + for metric in class_metrics_sorted: + coupled_classes = metric.get("coupled_classes", []) + if coupled_classes: + metric["coupled_classes"] = sorted(set(coupled_classes)) + + module_deps_sorted = sorted( + entry["module_deps"], + key=lambda item: ( + item["source"], + item["target"], + item["import_type"], + item["line"], + ), + ) + dead_candidates_normalized: list[DeadCandidateDict] = [] + for candidate in entry["dead_candidates"]: + suppressed_rules = candidate.get("suppressed_rules", []) + normalized_candidate = DeadCandidateDict( + qualname=candidate["qualname"], + local_name=candidate["local_name"], + filepath=candidate["filepath"], + start_line=candidate["start_line"], + end_line=candidate["end_line"], + kind=candidate["kind"], + ) + if _is_string_list(suppressed_rules): + normalized_rules = sorted(set(suppressed_rules)) + if normalized_rules: + normalized_candidate["suppressed_rules"] = normalized_rules + dead_candidates_normalized.append(normalized_candidate) + + dead_candidates_sorted = sorted( + dead_candidates_normalized, + key=lambda item: ( + item["start_line"], + item["end_line"], + item["qualname"], + item["local_name"], + item["kind"], + tuple(item.get("suppressed_rules", [])), + ), + ) + + result: CacheEntry = { + "stat": entry["stat"], + "units": entry["units"], + "blocks": entry["blocks"], + "segments": entry["segments"], + "class_metrics": class_metrics_sorted, + "module_deps": module_deps_sorted, + "dead_candidates": dead_candidates_sorted, + "referenced_names": sorted(set(entry["referenced_names"])), + "referenced_qualnames": sorted(set(entry.get("referenced_qualnames", []))), + "import_names": sorted(set(entry["import_names"])), + "class_names": sorted(set(entry["class_names"])), + "security_surfaces": sorted( + entry.get("security_surfaces", []), + key=lambda item: ( + item["start_line"], + item["end_line"], + item["qualname"], + item["category"], + item["capability"], + item["evidence_symbol"], + ), + ), + } + typing_coverage = 
entry.get("typing_coverage") + if typing_coverage is not None: + result["typing_coverage"] = ModuleTypingCoverageDict( + module=typing_coverage["module"], + filepath=typing_coverage["filepath"], + callable_count=typing_coverage["callable_count"], + params_total=typing_coverage["params_total"], + params_annotated=typing_coverage["params_annotated"], + returns_total=typing_coverage["returns_total"], + returns_annotated=typing_coverage["returns_annotated"], + any_annotation_count=typing_coverage["any_annotation_count"], + ) + docstring_coverage = entry.get("docstring_coverage") + if docstring_coverage is not None: + result["docstring_coverage"] = ModuleDocstringCoverageDict( + module=docstring_coverage["module"], + filepath=docstring_coverage["filepath"], + public_symbol_total=docstring_coverage["public_symbol_total"], + public_symbol_documented=docstring_coverage["public_symbol_documented"], + ) + api_surface = entry.get("api_surface") + if api_surface is not None: + symbols = sorted( + api_surface["symbols"], + key=lambda item: ( + item["qualname"], + item["kind"], + item["start_line"], + item["end_line"], + ), + ) + normalized_symbols = [ + PublicSymbolDict( + qualname=symbol["qualname"], + kind=symbol["kind"], + start_line=symbol["start_line"], + end_line=symbol["end_line"], + params=[ + ApiParamSpecDict( + name=param["name"], + kind=param["kind"], + has_default=param["has_default"], + annotation_hash=param["annotation_hash"], + ) + for param in symbol.get("params", []) + ], + returns_hash=symbol.get("returns_hash", ""), + exported_via=symbol.get("exported_via", "name"), + ) + for symbol in symbols + ] + result["api_surface"] = ModuleApiSurfaceDict( + module=api_surface["module"], + filepath=api_surface["filepath"], + all_declared=sorted(set(api_surface.get("all_declared", []))), + symbols=normalized_symbols, + ) + structural_findings = entry.get("structural_findings") + if structural_findings is not None: + result["structural_findings"] = structural_findings + source_stats = entry.get("source_stats") + if source_stats is not None: + result["source_stats"] = source_stats + return result + + +__all__ = [ + "_as_file_stat_dict", + "_as_module_api_surface_dict", + "_as_module_docstring_coverage_dict", + "_as_module_typing_coverage_dict", + "_as_source_stats_dict", + "_as_typed_block_list", + "_as_typed_class_metrics_list", + "_as_typed_dead_candidates_list", + "_as_typed_module_deps_list", + "_as_typed_security_surfaces_list", + "_as_typed_segment_list", + "_as_typed_string_list", + "_as_typed_unit_list", + "_attach_optional_cache_sections", + "_canonicalize_cache_entry", + "_decode_optional_cache_sections", + "_has_cache_entry_container_shape", + "_is_canonical_cache_entry", + "_normalized_optional_string_list", +] diff --git a/codeclone/cache/_validators.py b/codeclone/cache/_validators.py new file mode 100644 index 0000000..c289720 --- /dev/null +++ b/codeclone/cache/_validators.py @@ -0,0 +1,271 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from typing import TypeGuard + +from .entries import ( + ApiParamSpecDict, + BlockDict, + ClassMetricsDict, + DeadCandidateDict, + FileStat, + ModuleApiSurfaceDict, + ModuleDepDict, + ModuleDocstringCoverageDict, + ModuleTypingCoverageDict, + PublicSymbolDict, + SecuritySurfaceDict, + SegmentDict, + SourceStatsDict, + UnitDict, +) + + +def _is_file_stat_dict(value: object) -> TypeGuard[FileStat]: + if not isinstance(value, dict): + return False + return isinstance(value.get("mtime_ns"), int) and isinstance(value.get("size"), int) + + +def _is_source_stats_dict(value: object) -> TypeGuard[SourceStatsDict]: + if not isinstance(value, dict): + return False + lines = value.get("lines") + functions = value.get("functions") + methods = value.get("methods") + classes = value.get("classes") + return ( + isinstance(lines, int) + and lines >= 0 + and isinstance(functions, int) + and functions >= 0 + and isinstance(methods, int) + and methods >= 0 + and isinstance(classes, int) + and classes >= 0 + ) + + +def _is_unit_dict(value: object) -> TypeGuard[UnitDict]: + if not isinstance(value, dict): + return False + string_keys = ("qualname", "filepath", "fingerprint", "loc_bucket") + int_keys = ("start_line", "end_line", "loc", "stmt_count") + if not _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys): + return False + cyclomatic_complexity = value.get("cyclomatic_complexity", 1) + nesting_depth = value.get("nesting_depth", 0) + risk = value.get("risk", "low") + raw_hash = value.get("raw_hash", "") + return ( + isinstance(cyclomatic_complexity, int) + and isinstance(nesting_depth, int) + and isinstance(risk, str) + and risk in {"low", "medium", "high"} + and isinstance(raw_hash, str) + and isinstance(value.get("entry_guard_count", 0), int) + and isinstance(value.get("entry_guard_terminal_profile", "none"), str) + and isinstance(value.get("entry_guard_has_side_effect_before", False), bool) + and isinstance(value.get("terminal_kind", "fallthrough"), str) + and isinstance(value.get("try_finally_profile", "none"), str) + and isinstance(value.get("side_effect_order_profile", "none"), str) + ) + + +def _is_block_dict(value: object) -> TypeGuard[BlockDict]: + if not isinstance(value, dict): + return False + string_keys = ("block_hash", "filepath", "qualname") + int_keys = ("start_line", "end_line", "size") + return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) + + +def _is_segment_dict(value: object) -> TypeGuard[SegmentDict]: + if not isinstance(value, dict): + return False + string_keys = ("segment_hash", "segment_sig", "filepath", "qualname") + int_keys = ("start_line", "end_line", "size") + return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) + + +def _is_module_typing_coverage_dict( + value: object, +) -> TypeGuard[ModuleTypingCoverageDict]: + if not isinstance(value, dict): + return False + string_keys = ("module", "filepath") + int_keys = ( + "callable_count", + "params_total", + "params_annotated", + "returns_total", + "returns_annotated", + "any_annotation_count", + ) + return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) + + +def _is_module_docstring_coverage_dict( + value: object, +) -> TypeGuard[ModuleDocstringCoverageDict]: + if not isinstance(value, dict): + return False + string_keys = ("module", "filepath") + int_keys = ("public_symbol_total", 
"public_symbol_documented") + return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) + + +def _is_api_param_spec_dict(value: object) -> TypeGuard[ApiParamSpecDict]: + if not isinstance(value, dict): + return False + return ( + isinstance(value.get("name"), str) + and isinstance(value.get("kind"), str) + and isinstance(value.get("has_default"), bool) + and isinstance(value.get("annotation_hash", ""), str) + ) + + +def _is_public_symbol_dict(value: object) -> TypeGuard[PublicSymbolDict]: + if not isinstance(value, dict): + return False + if not _has_typed_fields( + value, + string_keys=("qualname", "kind", "exported_via"), + int_keys=("start_line", "end_line"), + ): + return False + params = value.get("params", []) + return ( + isinstance(value.get("returns_hash", ""), str) + and isinstance(params, list) + and all(_is_api_param_spec_dict(item) for item in params) + ) + + +def _is_module_api_surface_dict(value: object) -> TypeGuard[ModuleApiSurfaceDict]: + if not isinstance(value, dict): + return False + all_declared = value.get("all_declared", []) + symbols = value.get("symbols", []) + return ( + isinstance(value.get("module"), str) + and isinstance(value.get("filepath"), str) + and _is_string_list(all_declared) + and isinstance(symbols, list) + and all(_is_public_symbol_dict(item) for item in symbols) + ) + + +def _is_class_metrics_dict(value: object) -> TypeGuard[ClassMetricsDict]: + if not isinstance(value, dict): + return False + if not _has_typed_fields( + value, + string_keys=( + "qualname", + "filepath", + "risk_coupling", + "risk_cohesion", + ), + int_keys=( + "start_line", + "end_line", + "cbo", + "lcom4", + "method_count", + "instance_var_count", + ), + ): + return False + + coupled_classes = value.get("coupled_classes") + if coupled_classes is None: + return True + return _is_string_list(coupled_classes) + + +def _is_module_dep_dict(value: object) -> TypeGuard[ModuleDepDict]: + if not isinstance(value, dict): + return False + return _has_typed_fields( + value, + string_keys=("source", "target", "import_type"), + int_keys=("line",), + ) + + +def _is_dead_candidate_dict(value: object) -> TypeGuard[DeadCandidateDict]: + if not isinstance(value, dict): + return False + if not _has_typed_fields( + value, + string_keys=("qualname", "local_name", "filepath", "kind"), + int_keys=("start_line", "end_line"), + ): + return False + suppressed_rules = value.get("suppressed_rules") + if suppressed_rules is None: + return True + return _is_string_list(suppressed_rules) + + +def _is_security_surface_dict(value: object) -> TypeGuard[SecuritySurfaceDict]: + if not isinstance(value, dict): + return False + return _has_typed_fields( + value, + string_keys=( + "category", + "capability", + "module", + "filepath", + "qualname", + "location_scope", + "classification_mode", + "evidence_kind", + "evidence_symbol", + ), + int_keys=("start_line", "end_line"), + ) + + +def _is_string_list(value: object) -> TypeGuard[list[str]]: + return isinstance(value, list) and all(isinstance(item, str) for item in value) + + +def _has_typed_fields( + value: Mapping[str, object], + *, + string_keys: Sequence[str], + int_keys: Sequence[str], +) -> bool: + return all(isinstance(value.get(key), str) for key in string_keys) and all( + isinstance(value.get(key), int) for key in int_keys + ) + + +__all__ = [ + "_has_typed_fields", + "_is_api_param_spec_dict", + "_is_block_dict", + "_is_class_metrics_dict", + "_is_dead_candidate_dict", + "_is_file_stat_dict", + "_is_module_api_surface_dict", + 
"_is_module_dep_dict", + "_is_module_docstring_coverage_dict", + "_is_module_typing_coverage_dict", + "_is_public_symbol_dict", + "_is_security_surface_dict", + "_is_segment_dict", + "_is_source_stats_dict", + "_is_string_list", + "_is_unit_dict", +] diff --git a/codeclone/cache/_wire_decode.py b/codeclone/cache/_wire_decode.py new file mode 100644 index 0000000..55d5aeb --- /dev/null +++ b/codeclone/cache/_wire_decode.py @@ -0,0 +1,762 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from ..models import BlockGroupItem, FunctionGroupItem, SegmentGroupItem +from ._canonicalize import _attach_optional_cache_sections +from ._wire_helpers import ( + _decode_optional_wire_coupled_classes, + _decode_optional_wire_items, + _decode_optional_wire_items_for_filepath, + _decode_optional_wire_names, + _decode_optional_wire_row, + _decode_wire_class_metric_fields, + _decode_wire_int_fields, + _decode_wire_named_sized_span, + _decode_wire_named_span, + _decode_wire_qualname_span, + _decode_wire_qualname_span_size, + _decode_wire_row, + _decode_wire_str_fields, + _decode_wire_unit_core_fields, + _decode_wire_unit_flow_profiles, +) +from .entries import ( + ApiParamSpecDict, + BlockDict, + CacheEntry, + ClassMetricsDict, + DeadCandidateDict, + FileStat, + ModuleApiSurfaceDict, + ModuleDepDict, + ModuleDocstringCoverageDict, + ModuleTypingCoverageDict, + PublicSymbolDict, + SecuritySurfaceDict, + SegmentDict, + SourceStatsDict, + StructuralFindingGroupDict, + StructuralFindingOccurrenceDict, + UnitDict, + _as_security_surface_category, + _as_security_surface_classification_mode, + _as_security_surface_evidence_kind, + _as_security_surface_location_scope, + _normalize_cached_structural_groups, +) +from .integrity import ( + as_int_or_none as _as_int, +) +from .integrity import ( + as_object_list as _as_list, +) +from .integrity import ( + as_str_dict as _as_str_dict, +) +from .integrity import ( + as_str_or_none as _as_str, +) + + +def _decode_wire_stat(obj: dict[str, object]) -> FileStat | None: + stat_list = _as_list(obj.get("st")) + if stat_list is None or len(stat_list) != 2: + return None + mtime_ns = _as_int(stat_list[0]) + size = _as_int(stat_list[1]) + if mtime_ns is None or size is None: + return None + return FileStat(mtime_ns=mtime_ns, size=size) + + +def _decode_optional_wire_source_stats( + *, + obj: dict[str, object], +) -> SourceStatsDict | None: + row = _decode_optional_wire_row(obj=obj, key="ss", expected_len=4) + if row is None: + return None + counts = _decode_wire_int_fields(row, 0, 1, 2, 3) + if counts is None: + return None + lines, functions, methods, classes = counts + if any(value < 0 for value in counts): + return None + return SourceStatsDict( + lines=lines, + functions=functions, + methods=methods, + classes=classes, + ) + + +def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None: + obj = _as_str_dict(value) + if obj is None: + return None + + stat = _decode_wire_stat(obj) + if stat is None: + return None + source_stats = _decode_optional_wire_source_stats(obj=obj) + file_sections = _decode_wire_file_sections(obj=obj, filepath=filepath) + if file_sections is None: + return None + ( + units, + blocks, + segments, + class_metrics, + module_deps, + dead_candidates, + ) = file_sections + 
name_sections = _decode_wire_name_sections(obj=obj) + if name_sections is None: + return None + ( + referenced_names, + referenced_qualnames, + import_names, + class_names, + ) = name_sections + typing_coverage = _decode_optional_wire_typing_coverage(obj=obj, filepath=filepath) + docstring_coverage = _decode_optional_wire_docstring_coverage( + obj=obj, + filepath=filepath, + ) + api_surface = _decode_optional_wire_api_surface(obj=obj, filepath=filepath) + security_surfaces = _decode_optional_wire_security_surfaces( + obj=obj, + filepath=filepath, + ) + coupled_classes_map = _decode_optional_wire_coupled_classes(obj=obj, key="cc") + if coupled_classes_map is None: + return None + if security_surfaces is None: + return None + + for metric in class_metrics: + names = coupled_classes_map.get(metric["qualname"], []) + if names: + metric["coupled_classes"] = names + + has_structural_findings = "sf" in obj + structural_findings = _decode_wire_structural_findings_optional(obj) + if structural_findings is None: + return None + + return _attach_optional_cache_sections( + CacheEntry( + stat=stat, + units=units, + blocks=blocks, + segments=segments, + class_metrics=class_metrics, + module_deps=module_deps, + dead_candidates=dead_candidates, + referenced_names=referenced_names, + referenced_qualnames=referenced_qualnames, + import_names=import_names, + class_names=class_names, + ), + typing_coverage=typing_coverage, + docstring_coverage=docstring_coverage, + api_surface=api_surface, + security_surfaces=security_surfaces, + source_stats=source_stats, + structural_findings=( + _normalize_cached_structural_groups(structural_findings, filepath=filepath) + if has_structural_findings + else None + ), + ) + + +def _decode_wire_file_sections( + *, + obj: dict[str, object], + filepath: str, +) -> ( + tuple[ + list[UnitDict], + list[BlockDict], + list[SegmentDict], + list[ClassMetricsDict], + list[ModuleDepDict], + list[DeadCandidateDict], + ] + | None +): + units = _decode_optional_wire_items_for_filepath( + obj=obj, + key="u", + filepath=filepath, + decode_item=_decode_wire_unit, + ) + blocks = _decode_optional_wire_items_for_filepath( + obj=obj, + key="b", + filepath=filepath, + decode_item=_decode_wire_block, + ) + segments = _decode_optional_wire_items_for_filepath( + obj=obj, + key="s", + filepath=filepath, + decode_item=_decode_wire_segment, + ) + class_metrics = _decode_optional_wire_items_for_filepath( + obj=obj, + key="cm", + filepath=filepath, + decode_item=_decode_wire_class_metric, + ) + module_deps = _decode_optional_wire_items( + obj=obj, + key="md", + decode_item=_decode_wire_module_dep, + ) + dead_candidates = _decode_optional_wire_items_for_filepath( + obj=obj, + key="dc", + filepath=filepath, + decode_item=_decode_wire_dead_candidate, + ) + if ( + units is None + or blocks is None + or segments is None + or class_metrics is None + or module_deps is None + or dead_candidates is None + ): + return None + return ( + units, + blocks, + segments, + class_metrics, + module_deps, + dead_candidates, + ) + + +def _decode_wire_name_sections( + *, + obj: dict[str, object], +) -> tuple[list[str], list[str], list[str], list[str]] | None: + referenced_names = _decode_optional_wire_names(obj=obj, key="rn") + referenced_qualnames = _decode_optional_wire_names(obj=obj, key="rq") + import_names = _decode_optional_wire_names(obj=obj, key="in") + class_names = _decode_optional_wire_names(obj=obj, key="cn") + if ( + referenced_names is None + or referenced_qualnames is None + or import_names is None + or 
class_names is None + ): + return None + return ( + referenced_names, + referenced_qualnames, + import_names, + class_names, + ) + + +def _decode_optional_wire_typing_coverage( + *, + obj: dict[str, object], + filepath: str, +) -> ModuleTypingCoverageDict | None: + module_and_ints = _decode_optional_wire_module_ints( + obj=obj, + key="tc", + expected_len=7, + int_indexes=(1, 2, 3, 4, 5, 6), + ) + if module_and_ints is None: + return None + module, ints = module_and_ints + ( + callable_count, + params_total, + params_annotated, + returns_total, + returns_annotated, + any_annotation_count, + ) = ints + return ModuleTypingCoverageDict( + module=module, + filepath=filepath, + callable_count=callable_count, + params_total=params_total, + params_annotated=params_annotated, + returns_total=returns_total, + returns_annotated=returns_annotated, + any_annotation_count=any_annotation_count, + ) + + +def _decode_optional_wire_docstring_coverage( + *, + obj: dict[str, object], + filepath: str, +) -> ModuleDocstringCoverageDict | None: + module_and_counts = _decode_optional_wire_module_ints( + obj=obj, + key="dg", + expected_len=3, + int_indexes=(1, 2), + ) + if module_and_counts is None: + return None + module, counts = module_and_counts + public_symbol_total, public_symbol_documented = counts + return ModuleDocstringCoverageDict( + module=module, + filepath=filepath, + public_symbol_total=public_symbol_total, + public_symbol_documented=public_symbol_documented, + ) + + +def _decode_optional_wire_api_surface( + *, + obj: dict[str, object], + filepath: str, +) -> ModuleApiSurfaceDict | None: + row = _decode_optional_wire_row(obj=obj, key="as", expected_len=3) + if row is None: + return None + module = _as_str(row[0]) + all_declared = _decode_optional_wire_names(obj={"ad": row[1]}, key="ad") + symbols_raw = _as_list(row[2]) + if module is None or all_declared is None or symbols_raw is None: + return None + symbols: list[PublicSymbolDict] = [] + for symbol_raw in symbols_raw: + decoded_symbol = _decode_wire_api_surface_symbol(symbol_raw) + if decoded_symbol is None: + return None + symbols.append(decoded_symbol) + return ModuleApiSurfaceDict( + module=module, + filepath=filepath, + all_declared=sorted(set(all_declared)), + symbols=symbols, + ) + + +def _decode_optional_wire_security_surfaces( + *, + obj: dict[str, object], + filepath: str, +) -> list[SecuritySurfaceDict] | None: + rows = _decode_optional_wire_items_for_filepath( + obj=obj, + key="sc", + filepath=filepath, + decode_item=_decode_wire_security_surface, + ) + return rows + + +def _decode_wire_security_surface( + row_raw: object, + filepath: str, +) -> SecuritySurfaceDict | None: + row = _decode_wire_row(row_raw, valid_lengths={10}) + if row is None: + return None + category = _as_security_surface_category(_as_str(row[0])) + capability = _as_str(row[1]) + module = _as_str(row[2]) + qualname = _as_str(row[3]) + lines = _decode_wire_int_fields(row, 4, 5) + location_scope = _as_security_surface_location_scope(_as_str(row[6])) + classification_mode = _as_security_surface_classification_mode(_as_str(row[7])) + evidence_kind = _as_security_surface_evidence_kind(_as_str(row[8])) + evidence_symbol = _as_str(row[9]) + if ( + category is None + or capability is None + or module is None + or qualname is None + or lines is None + or location_scope is None + or classification_mode is None + or evidence_kind is None + or evidence_symbol is None + ): + return None + start_line, end_line = lines + return SecuritySurfaceDict( + category=category, + 
capability=capability, + module=module, + filepath=filepath, + qualname=qualname, + start_line=start_line, + end_line=end_line, + location_scope=location_scope, + classification_mode=classification_mode, + evidence_kind=evidence_kind, + evidence_symbol=evidence_symbol, + ) + + +def _decode_optional_wire_module_ints( + *, + obj: dict[str, object], + key: str, + expected_len: int, + int_indexes: tuple[int, ...], +) -> tuple[str, tuple[int, ...]] | None: + row = _decode_optional_wire_row(obj=obj, key=key, expected_len=expected_len) + if row is None: + return None + module = _as_str(row[0]) + ints = _decode_wire_int_fields(row, *int_indexes) + if module is None or ints is None: + return None + return module, ints + + +def _decode_wire_api_surface_symbol( + value: object, +) -> PublicSymbolDict | None: + symbol_row = _decode_wire_row(value, valid_lengths={7}) + if symbol_row is None: + return None + str_fields = _decode_wire_str_fields(symbol_row, 0, 1, 4, 5) + int_fields = _decode_wire_int_fields(symbol_row, 2, 3) + params_raw = _as_list(symbol_row[6]) + if str_fields is None or int_fields is None or params_raw is None: + return None + qualname, kind, exported_via, returns_hash = str_fields + start_line, end_line = int_fields + params: list[ApiParamSpecDict] = [] + for param_raw in params_raw: + decoded_param = _decode_wire_api_param_spec(param_raw) + if decoded_param is None: + return None + params.append(decoded_param) + return PublicSymbolDict( + qualname=qualname, + kind=kind, + start_line=start_line, + end_line=end_line, + params=params, + returns_hash=returns_hash, + exported_via=exported_via, + ) + + +def _decode_wire_api_param_spec( + value: object, +) -> ApiParamSpecDict | None: + param_row = _decode_wire_row(value, valid_lengths={4}) + if param_row is None: + return None + str_fields = _decode_wire_str_fields(param_row, 0, 1, 3) + int_fields = _decode_wire_int_fields(param_row, 2) + if str_fields is None or int_fields is None: + return None + name, param_kind, annotation_hash = str_fields + (has_default_raw,) = int_fields + return ApiParamSpecDict( + name=name, + kind=param_kind, + has_default=bool(has_default_raw), + annotation_hash=annotation_hash, + ) + + +def _decode_wire_structural_findings_optional( + obj: dict[str, object], +) -> list[StructuralFindingGroupDict] | None: + raw = obj.get("sf") + if raw is None: + return [] + groups_raw = _as_list(raw) + if groups_raw is None: + return None + groups: list[StructuralFindingGroupDict] = [] + for group_raw in groups_raw: + group = _decode_wire_structural_group(group_raw) + if group is None: + return None + groups.append(group) + return groups + + +def _decode_wire_structural_group(value: object) -> StructuralFindingGroupDict | None: + group_row = _decode_wire_row(value, valid_lengths={4}) + if group_row is None: + return None + str_fields = _decode_wire_str_fields(group_row, 0, 1) + items_raw = _as_list(group_row[3]) + signature = _decode_wire_structural_signature(group_row[2]) + if str_fields is None or items_raw is None or signature is None: + return None + finding_kind, finding_key = str_fields + items: list[StructuralFindingOccurrenceDict] = [] + for item_raw in items_raw: + item = _decode_wire_structural_occurrence(item_raw) + if item is None: + return None + items.append(item) + return StructuralFindingGroupDict( + finding_kind=finding_kind, + finding_key=finding_key, + signature=signature, + items=items, + ) + + +def _decode_wire_structural_signature(value: object) -> dict[str, str] | None: + sig_raw = _as_list(value) + if 
sig_raw is None: + return None + signature: dict[str, str] = {} + for pair in sig_raw: + pair_list = _as_list(pair) + if pair_list is None or len(pair_list) != 2: + return None + key = _as_str(pair_list[0]) + val = _as_str(pair_list[1]) + if key is None or val is None: + return None + signature[key] = val + return signature + + +def _decode_wire_structural_occurrence( + value: object, +) -> StructuralFindingOccurrenceDict | None: + item_list = _as_list(value) + if item_list is None or len(item_list) != 3: + return None + qualname = _as_str(item_list[0]) + start = _as_int(item_list[1]) + end = _as_int(item_list[2]) + if qualname is None or start is None or end is None: + return None + return StructuralFindingOccurrenceDict( + qualname=qualname, + start=start, + end=end, + ) + + +def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: + decoded = _decode_wire_named_span(value, valid_lengths={11, 17}) + if decoded is None: + return None + row, qualname, start_line, end_line = decoded + core_fields = _decode_wire_unit_core_fields(row) + flow_profiles = _decode_wire_unit_flow_profiles(row) + if core_fields is None or flow_profiles is None: + return None + ( + loc, + stmt_count, + fingerprint, + loc_bucket, + cyclomatic_complexity, + nesting_depth, + risk, + raw_hash, + ) = core_fields + ( + entry_guard_count, + entry_guard_terminal_profile, + entry_guard_has_side_effect_before, + terminal_kind, + try_finally_profile, + side_effect_order_profile, + ) = flow_profiles + return FunctionGroupItem( + qualname=qualname, + filepath=filepath, + start_line=start_line, + end_line=end_line, + loc=loc, + stmt_count=stmt_count, + fingerprint=fingerprint, + loc_bucket=loc_bucket, + cyclomatic_complexity=cyclomatic_complexity, + nesting_depth=nesting_depth, + risk=risk, + raw_hash=raw_hash, + entry_guard_count=entry_guard_count, + entry_guard_terminal_profile=entry_guard_terminal_profile, + entry_guard_has_side_effect_before=entry_guard_has_side_effect_before, + terminal_kind=terminal_kind, + try_finally_profile=try_finally_profile, + side_effect_order_profile=side_effect_order_profile, + ) + + +def _decode_wire_block(value: object, filepath: str) -> BlockDict | None: + decoded = _decode_wire_named_sized_span(value, valid_lengths={5}) + if decoded is None: + return None + row, qualname, start_line, end_line, size = decoded + block_hash = _as_str(row[4]) + if block_hash is None: + return None + + return BlockGroupItem( + block_hash=block_hash, + filepath=filepath, + qualname=qualname, + start_line=start_line, + end_line=end_line, + size=size, + ) + + +def _decode_wire_segment(value: object, filepath: str) -> SegmentDict | None: + decoded = _decode_wire_named_sized_span(value, valid_lengths={6}) + if decoded is None: + return None + row, qualname, start_line, end_line, size = decoded + segment_hash = _as_str(row[4]) + segment_sig = _as_str(row[5]) + if segment_hash is None or segment_sig is None: + return None + + return SegmentGroupItem( + segment_hash=segment_hash, + segment_sig=segment_sig, + filepath=filepath, + qualname=qualname, + start_line=start_line, + end_line=end_line, + size=size, + ) + + +def _decode_wire_class_metric( + value: object, + filepath: str, +) -> ClassMetricsDict | None: + decoded = _decode_wire_named_span(value, valid_lengths={9}) + if decoded is None: + return None + row, qualname, start_line, end_line = decoded + metric_fields = _decode_wire_class_metric_fields(row) + if metric_fields is None: + return None + cbo, lcom4, method_count, instance_var_count, risk_coupling, 
risk_cohesion = ( + metric_fields + ) + return ClassMetricsDict( + qualname=qualname, + filepath=filepath, + start_line=start_line, + end_line=end_line, + cbo=cbo, + lcom4=lcom4, + method_count=method_count, + instance_var_count=instance_var_count, + risk_coupling=risk_coupling, + risk_cohesion=risk_cohesion, + ) + + +def _decode_wire_module_dep(value: object) -> ModuleDepDict | None: + row = _as_list(value) + if row is None or len(row) != 4: + return None + source = _as_str(row[0]) + target = _as_str(row[1]) + import_type = _as_str(row[2]) + line = _as_int(row[3]) + if source is None or target is None or import_type is None or line is None: + return None + return ModuleDepDict( + source=source, + target=target, + import_type=import_type, + line=line, + ) + + +def _decode_wire_dead_candidate( + value: object, + filepath: str, +) -> DeadCandidateDict | None: + row = _decode_wire_row(value, valid_lengths={5, 6}) + if row is None: + return None + str_fields = _decode_wire_str_fields(row, 0, 1, 4) + int_fields = _decode_wire_int_fields(row, 2, 3) + suppressed_rules: list[str] | None = [] + if len(row) == 6: + raw_rules = _as_list(row[5]) + if raw_rules is None or not all(isinstance(rule, str) for rule in raw_rules): + return None + suppressed_rules = sorted({str(rule) for rule in raw_rules if str(rule)}) + if str_fields is None or int_fields is None: + return None + qualname, local_name, kind = str_fields + start_line, end_line = int_fields + decoded = DeadCandidateDict( + qualname=qualname, + local_name=local_name, + filepath=filepath, + start_line=start_line, + end_line=end_line, + kind=kind, + ) + if suppressed_rules: + decoded["suppressed_rules"] = suppressed_rules + return decoded + + +__all__ = [ + "_decode_optional_wire_api_surface", + "_decode_optional_wire_coupled_classes", + "_decode_optional_wire_docstring_coverage", + "_decode_optional_wire_items", + "_decode_optional_wire_items_for_filepath", + "_decode_optional_wire_module_ints", + "_decode_optional_wire_names", + "_decode_optional_wire_row", + "_decode_optional_wire_source_stats", + "_decode_optional_wire_typing_coverage", + "_decode_wire_api_param_spec", + "_decode_wire_api_surface_symbol", + "_decode_wire_block", + "_decode_wire_class_metric", + "_decode_wire_class_metric_fields", + "_decode_wire_dead_candidate", + "_decode_wire_file_entry", + "_decode_wire_file_sections", + "_decode_wire_int_fields", + "_decode_wire_module_dep", + "_decode_wire_name_sections", + "_decode_wire_named_sized_span", + "_decode_wire_named_span", + "_decode_wire_qualname_span", + "_decode_wire_qualname_span_size", + "_decode_wire_row", + "_decode_wire_segment", + "_decode_wire_stat", + "_decode_wire_str_fields", + "_decode_wire_structural_findings_optional", + "_decode_wire_structural_group", + "_decode_wire_structural_occurrence", + "_decode_wire_structural_signature", + "_decode_wire_unit", + "_decode_wire_unit_core_fields", + "_decode_wire_unit_flow_profiles", +] diff --git a/codeclone/cache/_wire_encode.py b/codeclone/cache/_wire_encode.py new file mode 100644 index 0000000..f7ea38a --- /dev/null +++ b/codeclone/cache/_wire_encode.py @@ -0,0 +1,320 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from ._canonicalize import _normalized_optional_string_list
+from .entries import CacheEntry, ClassMetricsDict
+
+
+def _encode_source_stats(entry: CacheEntry, wire: dict[str, object]) -> None:
+    source_stats = entry.get("source_stats")
+    if source_stats is not None:
+        wire["ss"] = [
+            source_stats["lines"],
+            source_stats["functions"],
+            source_stats["methods"],
+            source_stats["classes"],
+        ]
+
+
+def _encode_units(entry: CacheEntry, wire: dict[str, object]) -> None:
+    units = sorted(
+        entry["units"],
+        key=lambda unit: (
+            unit["qualname"],
+            unit["start_line"],
+            unit["end_line"],
+            unit["fingerprint"],
+        ),
+    )
+    if units:
+        wire["u"] = [
+            [
+                unit["qualname"],
+                unit["start_line"],
+                unit["end_line"],
+                unit["loc"],
+                unit["stmt_count"],
+                unit["fingerprint"],
+                unit["loc_bucket"],
+                unit.get("cyclomatic_complexity", 1),
+                unit.get("nesting_depth", 0),
+                unit.get("risk", "low"),
+                unit.get("raw_hash", ""),
+                unit.get("entry_guard_count", 0),
+                unit.get("entry_guard_terminal_profile", "none"),
+                1 if unit.get("entry_guard_has_side_effect_before", False) else 0,
+                unit.get("terminal_kind", "fallthrough"),
+                unit.get("try_finally_profile", "none"),
+                unit.get("side_effect_order_profile", "none"),
+            ]
+            for unit in units
+        ]
+
+
+def _encode_blocks(entry: CacheEntry, wire: dict[str, object]) -> None:
+    blocks = sorted(
+        entry["blocks"],
+        key=lambda block: (
+            block["qualname"],
+            block["start_line"],
+            block["end_line"],
+            block["block_hash"],
+        ),
+    )
+    if blocks:
+        wire["b"] = [
+            [
+                block["qualname"],
+                block["start_line"],
+                block["end_line"],
+                block["size"],
+                block["block_hash"],
+            ]
+            for block in blocks
+        ]
+
+
+def _encode_segments(entry: CacheEntry, wire: dict[str, object]) -> None:
+    segments = sorted(
+        entry["segments"],
+        key=lambda segment: (
+            segment["qualname"],
+            segment["start_line"],
+            segment["end_line"],
+            segment["segment_hash"],
+        ),
+    )
+    if segments:
+        wire["s"] = [
+            [
+                segment["qualname"],
+                segment["start_line"],
+                segment["end_line"],
+                segment["size"],
+                segment["segment_hash"],
+                segment["segment_sig"],
+            ]
+            for segment in segments
+        ]
+
+
+def _append_coupled_classes_row(
+    metric: ClassMetricsDict,
+    *,
+    rows: list[list[object]],
+) -> None:
+    coupled_classes = _normalized_optional_string_list(
+        metric.get("coupled_classes", [])
+    )
+    if coupled_classes:
+        rows.append([metric["qualname"], coupled_classes])
+
+
+def _encode_class_metrics(entry: CacheEntry, wire: dict[str, object]) -> None:
+    class_metrics = sorted(
+        entry["class_metrics"],
+        key=lambda metric: (
+            metric["start_line"],
+            metric["end_line"],
+            metric["qualname"],
+        ),
+    )
+    if class_metrics:
+        coupled_classes_rows: list[list[object]] = []
+        wire["cm"] = [
+            [
+                metric["qualname"],
+                metric["start_line"],
+                metric["end_line"],
+                metric["cbo"],
+                metric["lcom4"],
+                metric["method_count"],
+                metric["instance_var_count"],
+                metric["risk_coupling"],
+                metric["risk_cohesion"],
+            ]
+            for metric in class_metrics
+        ]
+        for metric in class_metrics:
+            _append_coupled_classes_row(metric, rows=coupled_classes_rows)
+        if coupled_classes_rows:
+            wire["cc"] = coupled_classes_rows
+
+
+def _encode_module_deps(entry: CacheEntry, wire: dict[str, object]) -> None:
+    module_deps = sorted(
+        entry["module_deps"],
+        key=lambda dep: (dep["source"], dep["target"], dep["import_type"], dep["line"]),
+    )
+    if module_deps:
+        wire["md"] = [
+            [
+                dep["source"],
+                dep["target"],
dep["import_type"], + dep["line"], + ] + for dep in module_deps + ] + + +def _encode_dead_candidates(entry: CacheEntry, wire: dict[str, object]) -> None: + dead_candidates = sorted( + entry["dead_candidates"], + key=lambda candidate: ( + candidate["start_line"], + candidate["end_line"], + candidate["qualname"], + candidate["local_name"], + candidate["kind"], + ), + ) + if dead_candidates: + encoded_dead_candidates: list[list[object]] = [] + for candidate in dead_candidates: + encoded = [ + candidate["qualname"], + candidate["local_name"], + candidate["start_line"], + candidate["end_line"], + candidate["kind"], + ] + suppressed_rules = candidate.get("suppressed_rules", []) + normalized_rules = _normalized_optional_string_list(suppressed_rules) + if normalized_rules: + encoded.append(normalized_rules) + encoded_dead_candidates.append(encoded) + wire["dc"] = encoded_dead_candidates + + +def _encode_name_lists(entry: CacheEntry, wire: dict[str, object]) -> None: + if entry["referenced_names"]: + wire["rn"] = sorted(set(entry["referenced_names"])) + if entry.get("referenced_qualnames"): + wire["rq"] = sorted(set(entry["referenced_qualnames"])) + if entry["import_names"]: + wire["in"] = sorted(set(entry["import_names"])) + if entry["class_names"]: + wire["cn"] = sorted(set(entry["class_names"])) + + +def _encode_security_surfaces(entry: CacheEntry, wire: dict[str, object]) -> None: + security_surfaces = sorted( + entry.get("security_surfaces", []), + key=lambda item: ( + item["start_line"], + item["end_line"], + item["qualname"], + item["category"], + item["capability"], + item["evidence_symbol"], + ), + ) + if security_surfaces: + wire["sc"] = [ + [ + item["category"], + item["capability"], + item["module"], + item["qualname"], + item["start_line"], + item["end_line"], + item["location_scope"], + item["classification_mode"], + item["evidence_kind"], + item["evidence_symbol"], + ] + for item in security_surfaces + ] + + +def _encode_optional_metrics_sections( + entry: CacheEntry, wire: dict[str, object] +) -> None: + typing_coverage = entry.get("typing_coverage") + if typing_coverage is not None: + wire["tc"] = [ + typing_coverage["module"], + typing_coverage["callable_count"], + typing_coverage["params_total"], + typing_coverage["params_annotated"], + typing_coverage["returns_total"], + typing_coverage["returns_annotated"], + typing_coverage["any_annotation_count"], + ] + docstring_coverage = entry.get("docstring_coverage") + if docstring_coverage is not None: + wire["dg"] = [ + docstring_coverage["module"], + docstring_coverage["public_symbol_total"], + docstring_coverage["public_symbol_documented"], + ] + api_surface = entry.get("api_surface") + if api_surface is not None: + wire["as"] = [ + api_surface["module"], + sorted(set(api_surface.get("all_declared", []))), + [ + [ + symbol["qualname"], + symbol["kind"], + symbol["start_line"], + symbol["end_line"], + symbol.get("exported_via", "name"), + symbol.get("returns_hash", ""), + [ + [ + param["name"], + param["kind"], + 1 if param["has_default"] else 0, + param.get("annotation_hash", ""), + ] + for param in symbol.get("params", []) + ], + ] + for symbol in api_surface["symbols"] + ], + ] + + +def _encode_structural_findings(entry: CacheEntry, wire: dict[str, object]) -> None: + if "structural_findings" in entry: + structural_findings = entry.get("structural_findings", []) + wire["sf"] = [ + [ + group["finding_kind"], + group["finding_key"], + sorted(group["signature"].items()), + [ + [item["qualname"], item["start"], item["end"]] + for item 
+
+
+def _encode_structural_findings(entry: CacheEntry, wire: dict[str, object]) -> None:
+    if "structural_findings" in entry:
+        structural_findings = entry.get("structural_findings", [])
+        wire["sf"] = [
+            [
+                group["finding_kind"],
+                group["finding_key"],
+                sorted(group["signature"].items()),
+                [
+                    [item["qualname"], item["start"], item["end"]]
+                    for item in group["items"]
+                ],
+            ]
+            for group in structural_findings
+        ]
+
+
+def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]:
+    wire: dict[str, object] = {
+        "st": [entry["stat"]["mtime_ns"], entry["stat"]["size"]],
+    }
+    _encode_source_stats(entry, wire)
+    _encode_units(entry, wire)
+    _encode_blocks(entry, wire)
+    _encode_segments(entry, wire)
+    _encode_class_metrics(entry, wire)
+    _encode_module_deps(entry, wire)
+    _encode_dead_candidates(entry, wire)
+    _encode_name_lists(entry, wire)
+    _encode_security_surfaces(entry, wire)
+    _encode_optional_metrics_sections(entry, wire)
+    _encode_structural_findings(entry, wire)
+    return wire
+
+
+__all__ = ["_encode_wire_file_entry"]
diff --git a/codeclone/cache/_wire_helpers.py b/codeclone/cache/_wire_helpers.py
new file mode 100644
index 0000000..3e987f7
--- /dev/null
+++ b/codeclone/cache/_wire_helpers.py
@@ -0,0 +1,307 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Callable, Collection
+from typing import Literal, TypeVar
+
+from .entries import _as_risk_literal
+from .integrity import (
+    as_int_or_none as _as_int,
+)
+from .integrity import (
+    as_object_list as _as_list,
+)
+from .integrity import (
+    as_str_or_none as _as_str,
+)
+from .versioning import _DEFAULT_WIRE_UNIT_FLOW_PROFILES
+
+_DecodedItemT = TypeVar("_DecodedItemT")
+
+
+def _decode_wire_qualname_span(
+    row: list[object],
+) -> tuple[str, int, int] | None:
+    qualname = _as_str(row[0])
+    start_line = _as_int(row[1])
+    end_line = _as_int(row[2])
+    if qualname is None or start_line is None or end_line is None:
+        return None
+    return qualname, start_line, end_line
+
+
+def _decode_wire_qualname_span_size(
+    row: list[object],
+) -> tuple[str, int, int, int] | None:
+    qualname_span = _decode_wire_qualname_span(row)
+    if qualname_span is None:
+        return None
+    size = _as_int(row[3])
+    if size is None:
+        return None
+    qualname, start_line, end_line = qualname_span
+    return qualname, start_line, end_line, size
+
+
+def _decode_optional_wire_items(
+    *,
+    obj: dict[str, object],
+    key: str,
+    decode_item: Callable[[object], _DecodedItemT | None],
+) -> list[_DecodedItemT] | None:
+    raw_items = obj.get(key)
+    if raw_items is None:
+        return []
+    wire_items = _as_list(raw_items)
+    if wire_items is None:
+        return None
+    decoded_items: list[_DecodedItemT] = []
+    for wire_item in wire_items:
+        decoded = decode_item(wire_item)
+        if decoded is None:
+            return None
+        decoded_items.append(decoded)
+    return decoded_items
+
+
+def _decode_optional_wire_items_for_filepath(
+    *,
+    obj: dict[str, object],
+    key: str,
+    filepath: str,
+    decode_item: Callable[[object, str], _DecodedItemT | None],
+) -> list[_DecodedItemT] | None:
+    raw_items = obj.get(key)
+    if raw_items is None:
+        return []
+    wire_items = _as_list(raw_items)
+    if wire_items is None:
+        return None
+    decoded_items: list[_DecodedItemT] = []
+    for wire_item in wire_items:
+        decoded = decode_item(wire_item, filepath)
+        if decoded is None:
+            return None
+        decoded_items.append(decoded)
+    return decoded_items
+
+
+def _decode_optional_wire_row(
+    *,
+    obj: dict[str, object],
+    key: str,
+    expected_len: int,
+) -> list[object] | None:
+    raw = obj.get(key)
+    if raw is None:
+        return None
+    row = _as_list(raw)
+    if row is None or len(row) != expected_len:
+        return None
+    return row
+
+
+def _decode_optional_wire_names(
+    *,
+    obj: dict[str, object],
+    key: str,
+) -> list[str] | None:
+    raw_names = obj.get(key)
+    if raw_names is None:
+        return []
+    names = _as_list(raw_names)
+    if names is None or not all(isinstance(name, str) for name in names):
+        return None
+    return [str(name) for name in names]
+
+
+def _decode_optional_wire_coupled_classes(
+    *,
+    obj: dict[str, object],
+    key: str,
+) -> dict[str, list[str]] | None:
+    raw = obj.get(key)
+    if raw is None:
+        return {}
+
+    rows = _as_list(raw)
+    if rows is None:
+        return None
+
+    decoded: dict[str, list[str]] = {}
+    for wire_row in rows:
+        row = _as_list(wire_row)
+        if row is None or len(row) != 2:
+            return None
+        qualname = _as_str(row[0])
+        names = _as_list(row[1])
+        if qualname is None or names is None:
+            return None
+        if not all(isinstance(name, str) for name in names):
+            return None
+        decoded[qualname] = sorted({str(name) for name in names if str(name)})
+
+    return decoded
+
+
+def _decode_wire_row(
+    value: object,
+    *,
+    valid_lengths: Collection[int],
+) -> list[object] | None:
+    row = _as_list(value)
+    if row is None or len(row) not in valid_lengths:
+        return None
+    return row
+
+
+def _decode_wire_named_span(
+    value: object,
+    *,
+    valid_lengths: Collection[int],
+) -> tuple[list[object], str, int, int] | None:
+    row = _decode_wire_row(value, valid_lengths=valid_lengths)
+    if row is None:
+        return None
+    span = _decode_wire_qualname_span(row)
+    if span is None:
+        return None
+    qualname, start_line, end_line = span
+    return row, qualname, start_line, end_line
+
+
+def _decode_wire_named_sized_span(
+    value: object,
+    *,
+    valid_lengths: Collection[int],
+) -> tuple[list[object], str, int, int, int] | None:
+    row = _decode_wire_row(value, valid_lengths=valid_lengths)
+    if row is None:
+        return None
+    span = _decode_wire_qualname_span_size(row)
+    if span is None:
+        return None
+    qualname, start_line, end_line, size = span
+    return row, qualname, start_line, end_line, size
+
+
+def _decode_wire_int_fields(
+    row: list[object],
+    *indexes: int,
+) -> tuple[int, ...] | None:
+    values: list[int] = []
+    for index in indexes:
+        value = _as_int(row[index])
+        if value is None:
+            return None
+        values.append(value)
+    return tuple(values)
+
+
+def _decode_wire_str_fields(
+    row: list[object],
+    *indexes: int,
+) -> tuple[str, ...] | None:
+    values: list[str] = []
+    for index in indexes:
+        value = _as_str(row[index])
+        if value is None:
+            return None
+        values.append(value)
+    return tuple(values)
+
+
+def _decode_wire_unit_core_fields(
+    row: list[object],
+) -> tuple[int, int, str, str, int, int, Literal["low", "medium", "high"], str] | None:
+    int_fields = _decode_wire_int_fields(row, 3, 4, 7, 8)
+    str_fields = _decode_wire_str_fields(row, 5, 6, 10)
+    risk = _as_risk_literal(row[9])
+    if int_fields is None or str_fields is None or risk is None:
+        return None
+    loc, stmt_count, cyclomatic_complexity, nesting_depth = int_fields
+    fingerprint, loc_bucket, raw_hash = str_fields
+    return (
+        loc,
+        stmt_count,
+        fingerprint,
+        loc_bucket,
+        cyclomatic_complexity,
+        nesting_depth,
+        risk,
+        raw_hash,
+    )
+
+
+def _decode_wire_unit_flow_profiles(
+    row: list[object],
+) -> tuple[int, str, bool, str, str, str] | None:
+    if len(row) != 17:
+        return _DEFAULT_WIRE_UNIT_FLOW_PROFILES
+
+    parsed_entry_guard_count = _as_int(row[11])
+    parsed_entry_guard_terminal_profile = _as_str(row[12])
+    parsed_entry_guard_has_side_effect_before = _as_int(row[13])
+    parsed_terminal_kind = _as_str(row[14])
+    parsed_try_finally_profile = _as_str(row[15])
+    parsed_side_effect_order_profile = _as_str(row[16])
+    if (
+        parsed_entry_guard_count is None
+        or parsed_entry_guard_terminal_profile is None
+        or parsed_entry_guard_has_side_effect_before is None
+        or parsed_terminal_kind is None
+        or parsed_try_finally_profile is None
+        or parsed_side_effect_order_profile is None
+    ):
+        return None
+    return (
+        max(0, parsed_entry_guard_count),
+        parsed_entry_guard_terminal_profile or "none",
+        parsed_entry_guard_has_side_effect_before != 0,
+        parsed_terminal_kind or "fallthrough",
+        parsed_try_finally_profile or "none",
+        parsed_side_effect_order_profile or "none",
+    )
+
+
+def _decode_wire_class_metric_fields(
+    row: list[object],
+) -> tuple[int, int, int, int, str, str] | None:
+    int_fields = _decode_wire_int_fields(row, 3, 4, 5, 6)
+    str_fields = _decode_wire_str_fields(row, 7, 8)
+    if int_fields is None or str_fields is None:
+        return None
+    cbo, lcom4, method_count, instance_var_count = int_fields
+    risk_coupling, risk_cohesion = str_fields
+    return (
+        cbo,
+        lcom4,
+        method_count,
+        instance_var_count,
+        risk_coupling,
+        risk_cohesion,
+    )
+
+
+__all__ = [
+    "_decode_optional_wire_coupled_classes",
+    "_decode_optional_wire_items",
+    "_decode_optional_wire_items_for_filepath",
+    "_decode_optional_wire_names",
+    "_decode_optional_wire_row",
+    "_decode_wire_class_metric_fields",
+    "_decode_wire_int_fields",
+    "_decode_wire_named_sized_span",
+    "_decode_wire_named_span",
+    "_decode_wire_qualname_span",
+    "_decode_wire_qualname_span_size",
+    "_decode_wire_row",
+    "_decode_wire_str_fields",
+    "_decode_wire_unit_core_fields",
+    "_decode_wire_unit_flow_profiles",
+]
diff --git a/codeclone/cache/entries.py b/codeclone/cache/entries.py
new file mode 100644
index 0000000..9dcd1ee
--- /dev/null
+++ b/codeclone/cache/entries.py
@@ -0,0 +1,559 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import Literal, TypedDict
+
+from ..findings.structural.detectors import normalize_structural_finding_group
+from ..models import (
+    BlockGroupItem,
+    BlockUnit,
+    ClassMetrics,
+    DeadCandidate,
+    FunctionGroupItem,
+    ModuleApiSurface,
+    ModuleDep,
+    ModuleDocstringCoverage,
+    ModuleTypingCoverage,
+    SecuritySurface,
+    SegmentGroupItem,
+    SegmentUnit,
+    StructuralFindingGroup,
+    StructuralFindingOccurrence,
+    Unit,
+)
+
+
+class FileStat(TypedDict):
+    mtime_ns: int
+    size: int
+
+
+class SourceStatsDict(TypedDict):
+    lines: int
+    functions: int
+    methods: int
+    classes: int
+
+
+UnitDict = FunctionGroupItem
+BlockDict = BlockGroupItem
+SegmentDict = SegmentGroupItem
+
+
+class ClassMetricsDictBase(TypedDict):
+    qualname: str
+    filepath: str
+    start_line: int
+    end_line: int
+    cbo: int
+    lcom4: int
+    method_count: int
+    instance_var_count: int
+    risk_coupling: str
+    risk_cohesion: str
+
+
+class ClassMetricsDict(ClassMetricsDictBase, total=False):
+    coupled_classes: list[str]
+
+
+class ModuleDepDict(TypedDict):
+    source: str
+    target: str
+    import_type: str
+    line: int
+
+
+class DeadCandidateDictBase(TypedDict):
+    qualname: str
+    local_name: str
+    filepath: str
+    start_line: int
+    end_line: int
+    kind: str
+
+
+class DeadCandidateDict(DeadCandidateDictBase, total=False):
+    suppressed_rules: list[str]
+
+
+class SecuritySurfaceDictBase(TypedDict):
+    category: str
+    capability: str
+    module: str
+    filepath: str
+    qualname: str
+    start_line: int
+    end_line: int
+    location_scope: str
+    classification_mode: str
+    evidence_kind: str
+    evidence_symbol: str
+
+
+class SecuritySurfaceDict(SecuritySurfaceDictBase):
+    pass
+
+
+class ModuleTypingCoverageDict(TypedDict):
+    module: str
+    filepath: str
+    callable_count: int
+    params_total: int
+    params_annotated: int
+    returns_total: int
+    returns_annotated: int
+    any_annotation_count: int
+
+
+class ModuleDocstringCoverageDict(TypedDict):
+    module: str
+    filepath: str
+    public_symbol_total: int
+    public_symbol_documented: int
+
+
+class ApiParamSpecDict(TypedDict):
+    name: str
+    kind: str
+    has_default: bool
+    annotation_hash: str
+
+
+class PublicSymbolDict(TypedDict):
+    qualname: str
+    kind: str
+    start_line: int
+    end_line: int
+    params: list[ApiParamSpecDict]
+    returns_hash: str
+    exported_via: str
+
+
+class ModuleApiSurfaceDict(TypedDict):
+    module: str
+    filepath: str
+    all_declared: list[str]
+    symbols: list[PublicSymbolDict]
+
+
+class StructuralFindingOccurrenceDict(TypedDict):
+    qualname: str
+    start: int
+    end: int
+
+
+class StructuralFindingGroupDict(TypedDict):
+    finding_kind: str
+    finding_key: str
+    signature: dict[str, str]
+    items: list[StructuralFindingOccurrenceDict]
+
+
+class _FileEntryBase(TypedDict):
+    stat: FileStat
+    units: list[UnitDict]
+    blocks: list[BlockDict]
+    segments: list[SegmentDict]
+
+
+class _FileEntryV26(_FileEntryBase, total=False):
+    source_stats: SourceStatsDict
+    class_metrics: list[ClassMetricsDict]
+    module_deps: list[ModuleDepDict]
+    dead_candidates: list[DeadCandidateDict]
+    referenced_names: list[str]
+    referenced_qualnames: list[str]
+    import_names: list[str]
+    class_names: list[str]
+    security_surfaces: list[SecuritySurfaceDict]
+    typing_coverage: ModuleTypingCoverageDict
+    docstring_coverage: ModuleDocstringCoverageDict
+    api_surface: ModuleApiSurfaceDict
+    structural_findings: list[StructuralFindingGroupDict]
+
+
+CacheEntryBase = _FileEntryBase
+CacheEntry = _FileEntryV26
+
+
+def _normalize_cached_structural_group(
+    group: StructuralFindingGroupDict,
+    *,
+    filepath: str,
+) -> StructuralFindingGroupDict | None:
+    signature = dict(group["signature"])
+    finding_kind = group["finding_kind"]
+    finding_key = group["finding_key"]
+    normalized = normalize_structural_finding_group(
+        StructuralFindingGroup(
+            finding_kind=finding_kind,
+            finding_key=finding_key,
+            signature=signature,
+            items=tuple(
+                StructuralFindingOccurrence(
+                    finding_kind=finding_kind,
+                    finding_key=finding_key,
+                    file_path=filepath,
+                    qualname=item["qualname"],
+                    start=item["start"],
+                    end=item["end"],
+                    signature=signature,
+                )
+                for item in group["items"]
+            ),
+        )
+    )
+    if normalized is None:
+        return None
+    return StructuralFindingGroupDict(
+        finding_kind=normalized.finding_kind,
+        finding_key=normalized.finding_key,
+        signature=dict(normalized.signature),
+        items=[
+            StructuralFindingOccurrenceDict(
+                qualname=item.qualname,
+                start=item.start,
+                end=item.end,
+            )
+            for item in normalized.items
+        ],
+    )
+
+
+def _normalize_cached_structural_groups(
+    groups: Sequence[StructuralFindingGroupDict],
+    *,
+    filepath: str,
+) -> list[StructuralFindingGroupDict]:
+    normalized = [
+        candidate
+        for candidate in (
+            _normalize_cached_structural_group(group, filepath=filepath)
+            for group in groups
+        )
+        if candidate is not None
+    ]
+    normalized.sort(key=lambda group: (-len(group["items"]), group["finding_key"]))
+    return normalized
+
+
+def _as_risk_literal(value: object) -> Literal["low", "medium", "high"] | None:
+    match value:
+        case "low":
+            return "low"
+        case "medium":
+            return "medium"
+        case "high":
+            return "high"
+        case _:
+            return None
+
+
+def _as_security_surface_category(value: object) -> str | None:
+    match value:
+        case (
+            "archive_extraction"
+            | "crypto_transport"
+            | "database_boundary"
+            | "deserialization"
+            | "dynamic_execution"
+            | "dynamic_loading"
+            | "filesystem_mutation"
+            | "identity_token"
+            | "network_boundary"
+            | "process_boundary"
+        ):
+            return value
+        case _:
+            return None
+
+
+def _as_security_surface_location_scope(value: object) -> str | None:
+    match value:
+        case "module" | "class" | "callable":
+            return value
+        case _:
+            return None
+
+
+def _as_security_surface_classification_mode(value: object) -> str | None:
+    match value:
+        case "exact_builtin" | "exact_call" | "exact_import":
+            return value
+        case _:
+            return None
+
+
+def _as_security_surface_evidence_kind(value: object) -> str | None:
+    match value:
+        case "builtin" | "call" | "import":
+            return value
+        case _:
+            return None
+
+
+def _new_optional_metrics_payload() -> tuple[
+    list[ClassMetricsDict],
+    list[ModuleDepDict],
+    list[DeadCandidateDict],
+    list[str],
+    list[str],
+    list[str],
+    list[str],
+    list[SecuritySurfaceDict],
+    ModuleTypingCoverageDict | None,
+    ModuleDocstringCoverageDict | None,
+    ModuleApiSurfaceDict | None,
+]:
+    return [], [], [], [], [], [], [], [], None, None, None
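+
+
+# The empty payload above unpacks, in order, to: class_metrics, module_deps,
+# dead_candidates, referenced_names, referenced_qualnames, import_names,
+# class_names, security_surfaces, typing_coverage, docstring_coverage, and
+# api_surface, mirroring the unpacking in Cache.put_file_entry.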
+
+
+def _unit_dict_from_model(unit: Unit, filepath: str) -> UnitDict:
+    return FunctionGroupItem(
+        qualname=unit.qualname,
+        filepath=filepath,
+        start_line=unit.start_line,
+        end_line=unit.end_line,
+        loc=unit.loc,
+        stmt_count=unit.stmt_count,
+        fingerprint=unit.fingerprint,
+        loc_bucket=unit.loc_bucket,
+        cyclomatic_complexity=unit.cyclomatic_complexity,
+        nesting_depth=unit.nesting_depth,
+        risk=unit.risk,
+        raw_hash=unit.raw_hash,
+        entry_guard_count=unit.entry_guard_count,
+        entry_guard_terminal_profile=unit.entry_guard_terminal_profile,
+        entry_guard_has_side_effect_before=unit.entry_guard_has_side_effect_before,
+        terminal_kind=unit.terminal_kind,
+        try_finally_profile=unit.try_finally_profile,
+        side_effect_order_profile=unit.side_effect_order_profile,
+    )
+
+
+def _block_dict_from_model(block: BlockUnit, filepath: str) -> BlockDict:
+    return BlockGroupItem(
+        block_hash=block.block_hash,
+        filepath=filepath,
+        qualname=block.qualname,
+        start_line=block.start_line,
+        end_line=block.end_line,
+        size=block.size,
+    )
+
+
+def _segment_dict_from_model(segment: SegmentUnit, filepath: str) -> SegmentDict:
+    return SegmentGroupItem(
+        segment_hash=segment.segment_hash,
+        segment_sig=segment.segment_sig,
+        filepath=filepath,
+        qualname=segment.qualname,
+        start_line=segment.start_line,
+        end_line=segment.end_line,
+        size=segment.size,
+    )
+
+
+def _typing_coverage_dict_from_model(
+    coverage: ModuleTypingCoverage | None,
+    *,
+    filepath: str,
+) -> ModuleTypingCoverageDict | None:
+    if coverage is None:
+        return None
+    return ModuleTypingCoverageDict(
+        module=coverage.module,
+        filepath=filepath,
+        callable_count=coverage.callable_count,
+        params_total=coverage.params_total,
+        params_annotated=coverage.params_annotated,
+        returns_total=coverage.returns_total,
+        returns_annotated=coverage.returns_annotated,
+        any_annotation_count=coverage.any_annotation_count,
+    )
+
+
+def _docstring_coverage_dict_from_model(
+    coverage: ModuleDocstringCoverage | None,
+    *,
+    filepath: str,
+) -> ModuleDocstringCoverageDict | None:
+    if coverage is None:
+        return None
+    return ModuleDocstringCoverageDict(
+        module=coverage.module,
+        filepath=filepath,
+        public_symbol_total=coverage.public_symbol_total,
+        public_symbol_documented=coverage.public_symbol_documented,
+    )
+
+
+def _api_surface_dict_from_model(
+    surface: ModuleApiSurface | None,
+    *,
+    filepath: str,
+) -> ModuleApiSurfaceDict | None:
+    if surface is None:
+        return None
+    return ModuleApiSurfaceDict(
+        module=surface.module,
+        filepath=filepath,
+        all_declared=list(surface.all_declared or ()),
+        symbols=[
+            PublicSymbolDict(
+                qualname=symbol.qualname,
+                kind=symbol.kind,
+                start_line=symbol.start_line,
+                end_line=symbol.end_line,
+                params=[
+                    ApiParamSpecDict(
+                        name=param.name,
+                        kind=param.kind,
+                        has_default=param.has_default,
+                        annotation_hash=param.annotation_hash,
+                    )
+                    for param in symbol.params
+                ],
+                returns_hash=symbol.returns_hash,
+                exported_via=symbol.exported_via,
+            )
+            for symbol in surface.symbols
+        ],
+    )
+
+
+def _class_metrics_dict_from_model(
+    metric: ClassMetrics,
+    filepath: str,
+) -> ClassMetricsDict:
+    return ClassMetricsDict(
+        qualname=metric.qualname,
+        filepath=filepath,
+        start_line=metric.start_line,
+        end_line=metric.end_line,
+        cbo=metric.cbo,
+        lcom4=metric.lcom4,
+        method_count=metric.method_count,
+        instance_var_count=metric.instance_var_count,
+        risk_coupling=metric.risk_coupling,
+        risk_cohesion=metric.risk_cohesion,
+        coupled_classes=sorted(set(metric.coupled_classes)),
+    )
+
+
+def _module_dep_dict_from_model(dep: ModuleDep) -> ModuleDepDict:
+    return ModuleDepDict(
+        source=dep.source,
+        target=dep.target,
+        import_type=dep.import_type,
+        line=dep.line,
+    )
+
+
+def _dead_candidate_dict_from_model(
+    candidate: DeadCandidate,
+    filepath: str,
+) -> DeadCandidateDict:
+    result = DeadCandidateDict(
+        qualname=candidate.qualname,
+        local_name=candidate.local_name,
+        filepath=filepath,
+        start_line=candidate.start_line,
+        end_line=candidate.end_line,
+        kind=candidate.kind,
+    )
+    if candidate.suppressed_rules:
+        result["suppressed_rules"] = sorted(set(candidate.suppressed_rules))
+    return result
+
+
+def _security_surface_dict_from_model(
+    surface: SecuritySurface,
+    filepath: str,
+) -> SecuritySurfaceDict:
+    return SecuritySurfaceDict(
+        category=surface.category,
+        capability=surface.capability,
+        module=surface.module,
+        filepath=filepath,
+        qualname=surface.qualname,
+        start_line=surface.start_line,
+        end_line=surface.end_line,
+        location_scope=surface.location_scope,
+        classification_mode=surface.classification_mode,
+        evidence_kind=surface.evidence_kind,
+        evidence_symbol=surface.evidence_symbol,
+    )
+
+
+def _structural_occurrence_dict_from_model(
+    occurrence: StructuralFindingOccurrence,
+) -> StructuralFindingOccurrenceDict:
+    return StructuralFindingOccurrenceDict(
+        qualname=occurrence.qualname,
+        start=occurrence.start,
+        end=occurrence.end,
+    )
+
+
+def _structural_group_dict_from_model(
+    group: StructuralFindingGroup,
+) -> StructuralFindingGroupDict:
+    return StructuralFindingGroupDict(
+        finding_kind=group.finding_kind,
+        finding_key=group.finding_key,
+        signature=dict(group.signature),
+        items=[
+            _structural_occurrence_dict_from_model(occurrence)
+            for occurrence in group.items
+        ],
+    )
+
+
+__all__ = [
+    "ApiParamSpecDict",
+    "BlockDict",
+    "CacheEntry",
+    "CacheEntryBase",
+    "ClassMetricsDict",
+    "DeadCandidateDict",
+    "FileStat",
+    "ModuleApiSurfaceDict",
+    "ModuleDepDict",
+    "ModuleDocstringCoverageDict",
+    "ModuleTypingCoverageDict",
+    "PublicSymbolDict",
+    "SecuritySurfaceDict",
+    "SegmentDict",
+    "SourceStatsDict",
+    "StructuralFindingGroupDict",
+    "StructuralFindingOccurrenceDict",
+    "UnitDict",
+    "_api_surface_dict_from_model",
+    "_as_risk_literal",
+    "_as_security_surface_category",
+    "_as_security_surface_classification_mode",
+    "_as_security_surface_evidence_kind",
+    "_as_security_surface_location_scope",
+    "_block_dict_from_model",
+    "_class_metrics_dict_from_model",
+    "_dead_candidate_dict_from_model",
+    "_docstring_coverage_dict_from_model",
+    "_module_dep_dict_from_model",
+    "_new_optional_metrics_payload",
+    "_normalize_cached_structural_group",
+    "_normalize_cached_structural_groups",
+    "_security_surface_dict_from_model",
+    "_segment_dict_from_model",
+    "_structural_group_dict_from_model",
+    "_structural_occurrence_dict_from_model",
+    "_typing_coverage_dict_from_model",
+    "_unit_dict_from_model",
+]
diff --git a/codeclone/cache_io.py b/codeclone/cache/integrity.py
similarity index 80%
rename from codeclone/cache_io.py
rename to codeclone/cache/integrity.py
index c077cc8..12086b1 100644
--- a/codeclone/cache_io.py
+++ b/codeclone/cache/integrity.py
@@ -11,13 +11,9 @@
 from collections.abc import Mapping
 from pathlib import Path
 
-from ._json_io import (
-    json_text as _json_text,
-)
-from ._json_io import (
-    read_json_document as _read_json_document,
-)
-from ._json_io import (
+from ..utils.json_io import json_text as _json_text
+from ..utils.json_io import read_json_document as _read_json_document
+from ..utils.json_io import (
     write_json_document_atomically as _write_json_document_atomically,
 )
 
@@ -64,3 +60,16 @@ def read_json_document(path: Path) -> object:
 
 def write_json_document_atomically(path: Path, document: object) -> None:
     _write_json_document_atomically(path, document, sort_keys=True)
+
+
+__all__ = [
+    "as_int_or_none",
+    "as_object_list",
+    "as_str_dict",
+    "as_str_or_none",
+    "canonical_json",
+    "read_json_document",
+    "sign_cache_payload",
+    "verify_cache_payload_signature",
+    "write_json_document_atomically",
+]
diff --git a/codeclone/cache_segments.py b/codeclone/cache/projection.py
similarity index 82%
rename from codeclone/cache_segments.py
rename to codeclone/cache/projection.py
index a771e51..49b2db3 100644
--- a/codeclone/cache_segments.py
+++ b/codeclone/cache/projection.py
@@ -10,18 +10,57 @@
 from pathlib import Path
 from typing import TypedDict
 
-from .cache_io import (
+from ..models import SegmentGroupItem
+from .integrity import (
     as_int_or_none,
     as_object_list,
     as_str_dict,
     as_str_or_none,
 )
-from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime
-from .models import SegmentGroupItem
 
 SegmentDict = SegmentGroupItem
 
 
+def wire_filepath_from_runtime(
+    runtime_filepath: str,
+    *,
+    root: Path | None,
+) -> str:
+    runtime_path = Path(runtime_filepath)
+    if root is None:
+        return runtime_path.as_posix()
+
+    try:
+        relative = runtime_path.relative_to(root)
+        return relative.as_posix()
+    except ValueError:
+        pass
+
+    try:
+        relative = runtime_path.resolve().relative_to(root.resolve())
+        return relative.as_posix()
+    except OSError:
+        return runtime_path.as_posix()
+    except ValueError:
+        return runtime_path.as_posix()
+
+
+def runtime_filepath_from_wire(
+    wire_filepath: str,
+    *,
+    root: Path | None,
+) -> str:
+    wire_path = Path(wire_filepath)
+    if root is None or wire_path.is_absolute():
+        return str(wire_path)
+
+    combined = root / wire_path
+    try:
+        return str(combined.resolve(strict=False))
+    except OSError:
+        return str(combined)
+
+
 class SegmentReportProjection(TypedDict):
     digest: str
     suppressed: int
@@ -182,3 +221,14 @@ def encode_segment_report_projection(
         "s": max(0, int(projection["suppressed"])),
         "g": groups_rows,
     }
+
+
+__all__ = [
+    "SegmentDict",
+    "SegmentReportProjection",
+    "build_segment_report_projection",
+    "decode_segment_report_projection",
+    "encode_segment_report_projection",
+    "runtime_filepath_from_wire",
+    "wire_filepath_from_runtime",
+]
diff --git a/codeclone/cache/store.py b/codeclone/cache/store.py
new file mode 100644
index 0000000..6ac9884
--- /dev/null
+++ b/codeclone/cache/store.py
@@ -0,0 +1,681 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import os
+from collections.abc import Collection
+from json import JSONDecodeError
+from pathlib import Path
+from typing import Protocol
+
+from ..baseline.trust import current_python_tag
+from ..contracts import (
+    BASELINE_FINGERPRINT_VERSION,
+    CACHE_VERSION,
+    DEFAULT_BLOCK_MIN_LOC,
+    DEFAULT_BLOCK_MIN_STMT,
+    DEFAULT_MIN_LOC,
+    DEFAULT_MIN_STMT,
+    DEFAULT_SEGMENT_MIN_LOC,
+    DEFAULT_SEGMENT_MIN_STMT,
+)
+from ..contracts.errors import CacheError
+from ..models import BlockUnit, FileMetrics, SegmentUnit, StructuralFindingGroup, Unit
+from ._canonicalize import (
+    _as_file_stat_dict,
+    _as_typed_block_list,
+    _as_typed_segment_list,
+    _as_typed_unit_list,
+    _attach_optional_cache_sections,
+    _canonicalize_cache_entry,
+    _decode_optional_cache_sections,
+    _is_canonical_cache_entry,
+)
+from ._wire_decode import _decode_wire_file_entry
+from ._wire_encode import _encode_wire_file_entry
+from .entries import (
+    CacheEntry,
+    FileStat,
+    SourceStatsDict,
+    _api_surface_dict_from_model,
+    _block_dict_from_model,
+    _class_metrics_dict_from_model,
+    _dead_candidate_dict_from_model,
+    _docstring_coverage_dict_from_model,
+    _module_dep_dict_from_model,
+    _new_optional_metrics_payload,
+    _normalize_cached_structural_groups,
+    _security_surface_dict_from_model,
+    _segment_dict_from_model,
+    _structural_group_dict_from_model,
+    _typing_coverage_dict_from_model,
+    _unit_dict_from_model,
+)
+from .integrity import (
+    as_str_dict as _as_str_dict,
+)
+from .integrity import (
+    as_str_or_none as _as_str,
+)
+from .integrity import (
+    read_json_document,
+    sign_cache_payload,
+    verify_cache_payload_signature,
+    write_json_document_atomically,
+)
+from .projection import (
+    SegmentReportProjection,
+    decode_segment_report_projection,
+    encode_segment_report_projection,
+    runtime_filepath_from_wire,
+    wire_filepath_from_runtime,
+)
+from .versioning import (
+    LEGACY_CACHE_SECRET_FILENAME,
+    MAX_CACHE_SIZE_BYTES,
+    AnalysisProfile,
+    CacheData,
+    CacheStatus,
+    _as_analysis_profile,
+    _empty_cache_data,
+    _resolve_root,
+)
+
+
+class _CacheStatusLike(Protocol):
+    @property
+    def load_status(self) -> CacheStatus | str | None: ...
+
+    @property
+    def load_warning(self) -> str | None: ...
+
+    @property
+    def cache_schema_version(self) -> str | None: ...
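+
+
+# A minimal sketch of how resolve_cache_status tolerates loosely typed
+# callers: any object exposing the three attributes satisfies
+# _CacheStatusLike, so a hypothetical test stub works as well as a real
+# Cache instance.
+#
+#     class _StubCache:
+#         load_status = "version_mismatch"
+#         load_warning = "Cache version mismatch (found 1); ignoring cache."
+#         cache_schema_version = "1"
+#
+#     status, schema = resolve_cache_status(_StubCache())
+#     # status is CacheStatus.VERSION_MISMATCH, schema == "1"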
+
+
+def resolve_cache_status(cache: _CacheStatusLike) -> tuple[CacheStatus, str | None]:
+    raw_cache_status = getattr(cache, "load_status", None)
+    load_warning = getattr(cache, "load_warning", None)
+    if isinstance(raw_cache_status, CacheStatus):
+        cache_status = raw_cache_status
+    elif isinstance(raw_cache_status, str):
+        try:
+            cache_status = CacheStatus(raw_cache_status)
+        except ValueError:
+            cache_status = (
+                CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE
+            )
+    else:
+        cache_status = (
+            CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE
+        )
+
+    raw_cache_schema_version = getattr(cache, "cache_schema_version", None)
+    cache_schema_version = (
+        raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None
+    )
+    return cache_status, cache_schema_version
+
+
+class Cache:
+    __slots__ = (
+        "_canonical_runtime_paths",
+        "_dirty",
+        "analysis_profile",
+        "cache_schema_version",
+        "data",
+        "fingerprint_version",
+        "legacy_secret_warning",
+        "load_status",
+        "load_warning",
+        "max_size_bytes",
+        "path",
+        "root",
+        "segment_report_projection",
+    )
+
+    _CACHE_VERSION = CACHE_VERSION
+
+    def __init__(
+        self,
+        path: str | Path,
+        *,
+        root: str | Path | None = None,
+        max_size_bytes: int | None = None,
+        min_loc: int = DEFAULT_MIN_LOC,
+        min_stmt: int = DEFAULT_MIN_STMT,
+        block_min_loc: int = DEFAULT_BLOCK_MIN_LOC,
+        block_min_stmt: int = DEFAULT_BLOCK_MIN_STMT,
+        segment_min_loc: int = DEFAULT_SEGMENT_MIN_LOC,
+        segment_min_stmt: int = DEFAULT_SEGMENT_MIN_STMT,
+        collect_api_surface: bool = False,
+    ):
+        self.path = Path(path)
+        self.root = _resolve_root(root)
+        self.fingerprint_version = BASELINE_FINGERPRINT_VERSION
+        self.analysis_profile: AnalysisProfile = {
+            "min_loc": min_loc,
+            "min_stmt": min_stmt,
+            "block_min_loc": block_min_loc,
+            "block_min_stmt": block_min_stmt,
+            "segment_min_loc": segment_min_loc,
+            "segment_min_stmt": segment_min_stmt,
+            "collect_api_surface": collect_api_surface,
+        }
+        self.data: CacheData = _empty_cache_data(
+            version=self._CACHE_VERSION,
+            python_tag=current_python_tag(),
+            fingerprint_version=self.fingerprint_version,
+            analysis_profile=self.analysis_profile,
+        )
+        self._canonical_runtime_paths: set[str] = set()
+        self.legacy_secret_warning = self._detect_legacy_secret_warning()
+        self.cache_schema_version: str | None = None
+        self.load_status = CacheStatus.MISSING
+        self.load_warning: str | None = self.legacy_secret_warning
+        self.max_size_bytes = (
+            MAX_CACHE_SIZE_BYTES if max_size_bytes is None else max_size_bytes
+        )
+        self.segment_report_projection: SegmentReportProjection | None = None
+        self._dirty: bool = True
+
+    def _detect_legacy_secret_warning(self) -> str | None:
+        secret_path = self.path.parent / LEGACY_CACHE_SECRET_FILENAME
+        try:
+            if secret_path.exists():
+                return (
+                    f"Legacy cache secret file detected at {secret_path}; "
+                    "delete this obsolete file."
+                )
+        except OSError as exc:
+            return f"Legacy cache secret check failed: {exc}"
+        return None
+
+    def _set_load_warning(self, message: str | None) -> None:
+        warning = message
+        if warning is None:
+            warning = self.legacy_secret_warning
+        elif self.legacy_secret_warning:
+            warning = f"{warning}\n{self.legacy_secret_warning}"
+        self.load_warning = warning
+
+    def _ignore_cache(
+        self,
+        message: str,
+        *,
+        status: CacheStatus,
+        schema_version: str | None = None,
+    ) -> None:
+        self._set_load_warning(message)
+        self.load_status = status
+        self.cache_schema_version = schema_version
+        self.data = _empty_cache_data(
+            version=self._CACHE_VERSION,
+            python_tag=current_python_tag(),
+            fingerprint_version=self.fingerprint_version,
+            analysis_profile=self.analysis_profile,
+        )
+        self._canonical_runtime_paths = set()
+        self.segment_report_projection = None
+
+    def _reject_cache_load(
+        self,
+        message: str,
+        *,
+        status: CacheStatus,
+        schema_version: str | None = None,
+    ) -> CacheData | None:
+        self._ignore_cache(
+            message,
+            status=status,
+            schema_version=schema_version,
+        )
+        return None
+
+    def _reject_invalid_cache_format(
+        self,
+        *,
+        schema_version: str | None = None,
+    ) -> CacheData | None:
+        return self._reject_cache_load(
+            "Cache format invalid; ignoring cache.",
+            status=CacheStatus.INVALID_TYPE,
+            schema_version=schema_version,
+        )
+
+    def _reject_version_mismatch(self, version: str) -> CacheData | None:
+        return self._reject_cache_load(
+            f"Cache version mismatch (found {version}); ignoring cache.",
+            status=CacheStatus.VERSION_MISMATCH,
+            schema_version=version,
+        )
+
+    def load(self) -> None:
+        try:
+            exists = self.path.exists()
+        except OSError as exc:
+            self._ignore_cache(
+                f"Cache unreadable; ignoring cache: {exc}",
+                status=CacheStatus.UNREADABLE,
+            )
+            return
+
+        if not exists:
+            self._set_load_warning(None)
+            self.load_status = CacheStatus.MISSING
+            self.cache_schema_version = None
+            self._canonical_runtime_paths = set()
+            self.segment_report_projection = None
+            return
+
+        try:
+            size = self.path.stat().st_size
+            if size > self.max_size_bytes:
+                self._ignore_cache(
+                    "Cache file too large "
+                    f"({size} bytes, max {self.max_size_bytes}); ignoring cache.",
+                    status=CacheStatus.TOO_LARGE,
+                )
+                return
+
+            raw_obj = read_json_document(self.path)
+            parsed = self._load_and_validate(raw_obj)
+            if parsed is None:
+                return
+            self.data = parsed
+            self._canonical_runtime_paths = set(parsed["files"].keys())
+            self.load_status = CacheStatus.OK
+            self._set_load_warning(None)
+            self._dirty = False
+        except OSError as exc:
+            self._ignore_cache(
+                f"Cache unreadable; ignoring cache: {exc}",
+                status=CacheStatus.UNREADABLE,
+            )
+        except JSONDecodeError:
+            self._ignore_cache(
+                "Cache corrupted; ignoring cache.",
+                status=CacheStatus.INVALID_JSON,
+            )
+
+    def _load_and_validate(self, raw_obj: object) -> CacheData | None:
+        raw = _as_str_dict(raw_obj)
+        if raw is None:
+            return self._reject_invalid_cache_format()
+
+        legacy_version = _as_str(raw.get("version"))
+        if legacy_version is not None:
+            return self._reject_version_mismatch(legacy_version)
+
+        version = _as_str(raw.get("v"))
+        if version is None:
+            return self._reject_invalid_cache_format()
+
+        if version != self._CACHE_VERSION:
+            return self._reject_version_mismatch(version)
+
+        sig = _as_str(raw.get("sig"))
+        payload = _as_str_dict(raw.get("payload"))
+        if sig is None or payload is None:
+            return self._reject_invalid_cache_format(schema_version=version)
+
+        if not verify_cache_payload_signature(payload, sig):
+            return self._reject_cache_load(
+                "Cache signature mismatch; ignoring cache.",
+                status=CacheStatus.INTEGRITY_FAILED,
+                schema_version=version,
+            )
+
+        runtime_tag = current_python_tag()
+        py_tag = _as_str(payload.get("py"))
+        if py_tag is None:
+            return self._reject_invalid_cache_format(schema_version=version)
+
+        if py_tag != runtime_tag:
+            return self._reject_cache_load(
+                "Cache python tag mismatch "
+                f"(found {py_tag}, expected {runtime_tag}); ignoring cache.",
+                status=CacheStatus.PYTHON_TAG_MISMATCH,
+                schema_version=version,
+            )
+
+        fp_version = _as_str(payload.get("fp"))
+        if fp_version is None:
+            return self._reject_invalid_cache_format(schema_version=version)
+
+        if fp_version != self.fingerprint_version:
+            return self._reject_cache_load(
+                "Cache fingerprint version mismatch "
+                f"(found {fp_version}, expected {self.fingerprint_version}); "
+                "ignoring cache.",
+                status=CacheStatus.FINGERPRINT_MISMATCH,
+                schema_version=version,
+            )
+
+        analysis_profile = _as_analysis_profile(payload.get("ap"))
+        if analysis_profile is None:
+            return self._reject_invalid_cache_format(schema_version=version)
+
+        if analysis_profile != self.analysis_profile:
+            return self._reject_cache_load(
+                "Cache analysis profile mismatch "
+                f"(found min_loc={analysis_profile['min_loc']}, "
+                f"min_stmt={analysis_profile['min_stmt']}, "
+                "collect_api_surface="
+                f"{str(analysis_profile['collect_api_surface']).lower()}; "
+                f"expected min_loc={self.analysis_profile['min_loc']}, "
+                f"min_stmt={self.analysis_profile['min_stmt']}, "
+                "collect_api_surface="
+                f"{str(self.analysis_profile['collect_api_surface']).lower()}); "
+                "ignoring cache.",
+                status=CacheStatus.ANALYSIS_PROFILE_MISMATCH,
+                schema_version=version,
+            )
+
+        files_dict = _as_str_dict(payload.get("files"))
+        if files_dict is None:
+            return self._reject_invalid_cache_format(schema_version=version)
+
+        parsed_files: dict[str, CacheEntry] = {}
+        for wire_path, file_entry_obj in files_dict.items():
+            runtime_path = runtime_filepath_from_wire(wire_path, root=self.root)
+            parsed_entry = self._decode_entry(file_entry_obj, runtime_path)
+            if parsed_entry is None:
+                return self._reject_invalid_cache_format(schema_version=version)
+            parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry)
+        self.segment_report_projection = decode_segment_report_projection(
+            payload.get("sr"),
+            root=self.root,
+        )
+
+        self.cache_schema_version = version
+        return CacheData(
+            version=self._CACHE_VERSION,
+            python_tag=runtime_tag,
+            fingerprint_version=self.fingerprint_version,
+            analysis_profile=self.analysis_profile,
+            files=parsed_files,
+        )
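+
+    # Shape of the signed document written by save() and verified above
+    # (field values are illustrative, not normative):
+    #
+    #     {
+    #         "v": self._CACHE_VERSION,
+    #         "payload": {"py": ..., "fp": ..., "ap": ..., "files": ..., "sr": ...},
+    #         "sig": sign_cache_payload(payload),
+    #     }
+    #
+    # "sr" is only present when a segment report projection exists.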
+
+    def save(self) -> None:
+        if not self._dirty:
+            return
+        try:
+            wire_files: dict[str, object] = {}
+            wire_map = {
+                runtime_path: wire_filepath_from_runtime(runtime_path, root=self.root)
+                for runtime_path in self.data["files"]
+            }
+            for runtime_path in sorted(self.data["files"], key=wire_map.__getitem__):
+                entry = self.get_file_entry(runtime_path)
+                if entry is None:
+                    continue
+                wire_files[wire_map[runtime_path]] = self._encode_entry(entry)
+
+            payload: dict[str, object] = {
+                "py": current_python_tag(),
+                "fp": self.fingerprint_version,
+                "ap": self.analysis_profile,
+                "files": wire_files,
+            }
+            segment_projection = encode_segment_report_projection(
+                self.segment_report_projection,
+                root=self.root,
+            )
+            if segment_projection is not None:
+                payload["sr"] = segment_projection
+            signed_doc = {
+                "v": self._CACHE_VERSION,
+                "payload": payload,
+                "sig": sign_cache_payload(payload),
+            }
+            write_json_document_atomically(self.path, signed_doc)
+            self._dirty = False
+
+            self.data["version"] = self._CACHE_VERSION
+            self.data["python_tag"] = current_python_tag()
+            self.data["fingerprint_version"] = self.fingerprint_version
+            self.data["analysis_profile"] = self.analysis_profile
+        except OSError as exc:
+            raise CacheError(f"Failed to save cache: {exc}") from exc
+
+    @staticmethod
+    def _decode_entry(value: object, filepath: str) -> CacheEntry | None:
+        return _decode_wire_file_entry(value, filepath)
+
+    @staticmethod
+    def _encode_entry(entry: CacheEntry) -> dict[str, object]:
+        return _encode_wire_file_entry(entry)
+
+    def _store_canonical_file_entry(
+        self,
+        *,
+        runtime_path: str,
+        canonical_entry: CacheEntry,
+    ) -> CacheEntry:
+        previous_entry = self.data["files"].get(runtime_path)
+        was_canonical = runtime_path in self._canonical_runtime_paths
+        self.data["files"][runtime_path] = canonical_entry
+        self._canonical_runtime_paths.add(runtime_path)
+        if not was_canonical or previous_entry != canonical_entry:
+            self._dirty = True
+        return canonical_entry
+
+    def get_file_entry(self, filepath: str) -> CacheEntry | None:
+        runtime_lookup_key = filepath
+        entry_obj = self.data["files"].get(runtime_lookup_key)
+        if entry_obj is None:
+            wire_key = wire_filepath_from_runtime(filepath, root=self.root)
+            runtime_lookup_key = runtime_filepath_from_wire(wire_key, root=self.root)
+            entry_obj = self.data["files"].get(runtime_lookup_key)
+
+        if entry_obj is None:
+            return None
+
+        if runtime_lookup_key in self._canonical_runtime_paths:
+            if _is_canonical_cache_entry(entry_obj):
+                return entry_obj
+            self._canonical_runtime_paths.discard(runtime_lookup_key)
+
+        if not isinstance(entry_obj, dict):
+            return None
+
+        stat = _as_file_stat_dict(entry_obj.get("stat"))
+        units = _as_typed_unit_list(entry_obj.get("units"))
+        blocks = _as_typed_block_list(entry_obj.get("blocks"))
+        segments = _as_typed_segment_list(entry_obj.get("segments"))
+        if stat is None or units is None or blocks is None or segments is None:
+            return None
+
+        optional_sections = _decode_optional_cache_sections(entry_obj)
+        if optional_sections is None:
+            return None
+        (
+            class_metrics_raw,
+            module_deps_raw,
+            dead_candidates_raw,
+            referenced_names_raw,
+            referenced_qualnames_raw,
+            import_names_raw,
+            class_names_raw,
+            security_surfaces_raw,
+            typing_coverage_raw,
+            docstring_coverage_raw,
+            api_surface_raw,
+            source_stats,
+            structural_findings,
+        ) = optional_sections
+
+        entry_to_canonicalize: CacheEntry = _attach_optional_cache_sections(
+            CacheEntry(
+                stat=stat,
+                units=units,
+                blocks=blocks,
+                segments=segments,
+                class_metrics=class_metrics_raw,
+                module_deps=module_deps_raw,
+                dead_candidates=dead_candidates_raw,
+                referenced_names=referenced_names_raw,
+                referenced_qualnames=referenced_qualnames_raw,
+                import_names=import_names_raw,
+                class_names=class_names_raw,
+                security_surfaces=security_surfaces_raw,
+            ),
+            typing_coverage=typing_coverage_raw,
+            docstring_coverage=docstring_coverage_raw,
+            api_surface=api_surface_raw,
+            security_surfaces=security_surfaces_raw,
+            source_stats=source_stats,
+            structural_findings=structural_findings,
+        )
+        canonical_entry = _canonicalize_cache_entry(entry_to_canonicalize)
+        return self._store_canonical_file_entry(
+            runtime_path=runtime_lookup_key,
+            canonical_entry=canonical_entry,
+        )
+
+    def put_file_entry(
+        self,
+        filepath: str,
+        stat_sig: FileStat,
+        units: list[Unit],
+        blocks: list[BlockUnit],
+        segments: list[SegmentUnit],
+        *,
+        source_stats: SourceStatsDict | None = None,
+        file_metrics: FileMetrics | None = None,
+        structural_findings: list[StructuralFindingGroup] | None = None,
+    ) -> None:
+        runtime_path = runtime_filepath_from_wire(
+            wire_filepath_from_runtime(filepath, root=self.root),
+            root=self.root,
+        )
+
+        unit_rows = [_unit_dict_from_model(unit, runtime_path) for unit in units]
+        block_rows = [_block_dict_from_model(block, runtime_path) for block in blocks]
+        segment_rows = [
+            _segment_dict_from_model(segment, runtime_path) for segment in segments
+        ]
+
+        (
+            class_metrics_rows,
+            module_dep_rows,
+            dead_candidate_rows,
+            referenced_names,
+            referenced_qualnames,
+            import_names,
+            class_names,
+            security_surfaces,
+            typing_coverage,
+            docstring_coverage,
+            api_surface,
+        ) = _new_optional_metrics_payload()
+        if file_metrics is not None:
+            class_metrics_rows = [
+                _class_metrics_dict_from_model(metric, runtime_path)
+                for metric in file_metrics.class_metrics
+            ]
+            module_dep_rows = [
+                _module_dep_dict_from_model(dep) for dep in file_metrics.module_deps
+            ]
+            dead_candidate_rows = [
+                _dead_candidate_dict_from_model(candidate, runtime_path)
+                for candidate in file_metrics.dead_candidates
+            ]
+            referenced_names = sorted(set(file_metrics.referenced_names))
+            referenced_qualnames = sorted(set(file_metrics.referenced_qualnames))
+            import_names = sorted(set(file_metrics.import_names))
+            class_names = sorted(set(file_metrics.class_names))
+            security_surfaces = [
+                _security_surface_dict_from_model(surface, runtime_path)
+                for surface in file_metrics.security_surfaces
+            ]
+            typing_coverage = _typing_coverage_dict_from_model(
+                file_metrics.typing_coverage,
+                filepath=runtime_path,
+            )
+            docstring_coverage = _docstring_coverage_dict_from_model(
+                file_metrics.docstring_coverage,
+                filepath=runtime_path,
+            )
+            api_surface = _api_surface_dict_from_model(
+                file_metrics.api_surface,
+                filepath=runtime_path,
+            )
+
+        source_stats_payload = source_stats or SourceStatsDict(
+            lines=0,
+            functions=0,
+            methods=0,
+            classes=0,
+        )
+        entry_dict = CacheEntry(
+            stat=stat_sig,
+            source_stats=source_stats_payload,
+            units=unit_rows,
+            blocks=block_rows,
+            segments=segment_rows,
+            class_metrics=class_metrics_rows,
+            module_deps=module_dep_rows,
+            dead_candidates=dead_candidate_rows,
+            referenced_names=referenced_names,
+            referenced_qualnames=referenced_qualnames,
+            import_names=import_names,
+            class_names=class_names,
+            security_surfaces=security_surfaces,
+        )
+        if typing_coverage is not None:
+            entry_dict["typing_coverage"] = typing_coverage
+        if docstring_coverage is not None:
+            entry_dict["docstring_coverage"] = docstring_coverage
+        if api_surface is not None:
+            entry_dict["api_surface"] = api_surface
+        if structural_findings is not None:
+            entry_dict["structural_findings"] = _normalize_cached_structural_groups(
+                [
+                    _structural_group_dict_from_model(group)
+                    for group in structural_findings
+                ],
+                filepath=runtime_path,
+            )
+        canonical_entry = _canonicalize_cache_entry(entry_dict)
+        self._store_canonical_file_entry(
+            runtime_path=runtime_path,
+            canonical_entry=canonical_entry,
+        )
+
+    def prune_file_entries(self, existing_filepaths: Collection[str]) -> int:
+        keep_runtime_paths = {
+            runtime_filepath_from_wire(
+                wire_filepath_from_runtime(filepath, root=self.root),
+                root=self.root,
+            )
+            for filepath in existing_filepaths
+        }
+        stale_runtime_paths = sorted(
+            runtime_path
+            for runtime_path in self.data["files"]
+            if runtime_path not in keep_runtime_paths
+        )
+        if not stale_runtime_paths:
+            return 0
+        for runtime_path in stale_runtime_paths:
self.data["files"].pop(runtime_path, None) + self._canonical_runtime_paths.discard(runtime_path) + self._dirty = True + return len(stale_runtime_paths) + + +def file_stat_signature(path: str) -> FileStat: + stat_result = os.stat(path) + return FileStat( + mtime_ns=stat_result.st_mtime_ns, + size=stat_result.st_size, + ) + + +__all__ = ["Cache", "file_stat_signature"] diff --git a/codeclone/cache/versioning.py b/codeclone/cache/versioning.py new file mode 100644 index 0000000..2081242 --- /dev/null +++ b/codeclone/cache/versioning.py @@ -0,0 +1,136 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from enum import Enum +from pathlib import Path +from typing import TypedDict + +from ..contracts import CACHE_VERSION, DEFAULT_MAX_CACHE_SIZE_MB +from ..contracts.schemas import AnalysisProfile +from .entries import CacheEntry +from .integrity import as_int_or_none, as_str_dict + +MAX_CACHE_SIZE_BYTES = DEFAULT_MAX_CACHE_SIZE_MB * 1024 * 1024 +LEGACY_CACHE_SECRET_FILENAME = ".cache_secret" +_DEFAULT_WIRE_UNIT_FLOW_PROFILES = ( + 0, + "none", + False, + "fallthrough", + "none", + "none", +) + + +class CacheStatus(str, Enum): + OK = "ok" + MISSING = "missing" + TOO_LARGE = "too_large" + UNREADABLE = "unreadable" + INVALID_JSON = "invalid_json" + INVALID_TYPE = "invalid_type" + VERSION_MISMATCH = "version_mismatch" + PYTHON_TAG_MISMATCH = "python_tag_mismatch" + FINGERPRINT_MISMATCH = "mismatch_fingerprint_version" + ANALYSIS_PROFILE_MISMATCH = "analysis_profile_mismatch" + INTEGRITY_FAILED = "integrity_failed" + + +class CacheData(TypedDict): + version: str + python_tag: str + fingerprint_version: str + analysis_profile: AnalysisProfile + files: dict[str, CacheEntry] + + +def _empty_cache_data( + *, + version: str = CACHE_VERSION, + python_tag: str, + fingerprint_version: str, + analysis_profile: AnalysisProfile, +) -> CacheData: + return CacheData( + version=version, + python_tag=python_tag, + fingerprint_version=fingerprint_version, + analysis_profile=analysis_profile, + files={}, + ) + + +def _as_analysis_profile(value: object) -> AnalysisProfile | None: + obj = as_str_dict(value) + if obj is None: + return None + + required = { + "min_loc", + "min_stmt", + "block_min_loc", + "block_min_stmt", + "segment_min_loc", + "segment_min_stmt", + } + if set(obj.keys()) < required: + return None + + min_loc = as_int_or_none(obj.get("min_loc")) + min_stmt = as_int_or_none(obj.get("min_stmt")) + block_min_loc = as_int_or_none(obj.get("block_min_loc")) + block_min_stmt = as_int_or_none(obj.get("block_min_stmt")) + segment_min_loc = as_int_or_none(obj.get("segment_min_loc")) + segment_min_stmt = as_int_or_none(obj.get("segment_min_stmt")) + collect_api_surface_raw = obj.get("collect_api_surface", False) + collect_api_surface = ( + collect_api_surface_raw if isinstance(collect_api_surface_raw, bool) else None + ) + if ( + min_loc is None + or min_stmt is None + or block_min_loc is None + or block_min_stmt is None + or segment_min_loc is None + or segment_min_stmt is None + or collect_api_surface is None + ): + return None + + return AnalysisProfile( + min_loc=min_loc, + min_stmt=min_stmt, + block_min_loc=block_min_loc, + block_min_stmt=block_min_stmt, + segment_min_loc=segment_min_loc, + segment_min_stmt=segment_min_stmt, + 
+
+
+def _as_analysis_profile(value: object) -> AnalysisProfile | None:
+    obj = as_str_dict(value)
+    if obj is None:
+        return None
+
+    required = {
+        "min_loc",
+        "min_stmt",
+        "block_min_loc",
+        "block_min_stmt",
+        "segment_min_loc",
+        "segment_min_stmt",
+    }
+    # Reject the payload when any required key is missing; extra keys are
+    # tolerated.
+    if not required.issubset(obj):
+        return None
+
+    min_loc = as_int_or_none(obj.get("min_loc"))
+    min_stmt = as_int_or_none(obj.get("min_stmt"))
+    block_min_loc = as_int_or_none(obj.get("block_min_loc"))
+    block_min_stmt = as_int_or_none(obj.get("block_min_stmt"))
+    segment_min_loc = as_int_or_none(obj.get("segment_min_loc"))
+    segment_min_stmt = as_int_or_none(obj.get("segment_min_stmt"))
+    collect_api_surface_raw = obj.get("collect_api_surface", False)
+    collect_api_surface = (
+        collect_api_surface_raw if isinstance(collect_api_surface_raw, bool) else None
+    )
+    if (
+        min_loc is None
+        or min_stmt is None
+        or block_min_loc is None
+        or block_min_stmt is None
+        or segment_min_loc is None
+        or segment_min_stmt is None
+        or collect_api_surface is None
+    ):
+        return None
+
+    return AnalysisProfile(
+        min_loc=min_loc,
+        min_stmt=min_stmt,
+        block_min_loc=block_min_loc,
+        block_min_stmt=block_min_stmt,
+        segment_min_loc=segment_min_loc,
+        segment_min_stmt=segment_min_stmt,
+        collect_api_surface=collect_api_surface,
+    )
+
+
+def _resolve_root(root: str | Path | None) -> Path | None:
+    if root is None:
+        return None
+    try:
+        return Path(root).resolve(strict=False)
+    except OSError:
+        return None
+
+
+__all__ = [
+    "CACHE_VERSION",
+    "LEGACY_CACHE_SECRET_FILENAME",
+    "MAX_CACHE_SIZE_BYTES",
+    "_DEFAULT_WIRE_UNIT_FLOW_PROFILES",
+    "AnalysisProfile",
+    "CacheData",
+    "CacheStatus",
+    "_as_analysis_profile",
+    "_empty_cache_data",
+    "_resolve_root",
+]
diff --git a/codeclone/cache_paths.py b/codeclone/cache_paths.py
deleted file mode 100644
index 8de7c63..0000000
--- a/codeclone/cache_paths.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-from pathlib import Path
-
-
-def wire_filepath_from_runtime(
-    runtime_filepath: str,
-    *,
-    root: Path | None,
-) -> str:
-    runtime_path = Path(runtime_filepath)
-    if root is None:
-        return runtime_path.as_posix()
-
-    try:
-        relative = runtime_path.relative_to(root)
-        return relative.as_posix()
-    except ValueError:
-        pass
-
-    try:
-        relative = runtime_path.resolve().relative_to(root.resolve())
-        return relative.as_posix()
-    except OSError:
-        return runtime_path.as_posix()
-    except ValueError:
-        return runtime_path.as_posix()
-
-
-def runtime_filepath_from_wire(
-    wire_filepath: str,
-    *,
-    root: Path | None,
-) -> str:
-    wire_path = Path(wire_filepath)
-    if root is None or wire_path.is_absolute():
-        return str(wire_path)
-
-    combined = root / wire_path
-    try:
-        return str(combined.resolve(strict=False))
-    except OSError:
-        return str(combined)
diff --git a/codeclone/cli.py b/codeclone/cli.py
deleted file mode 100644
index 09ac8c5..0000000
--- a/codeclone/cli.py
+++ /dev/null
@@ -1,1741 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-import os
-import subprocess
-import sys
-import time
-from collections.abc import Mapping, Sequence
-from dataclasses import dataclass
-from pathlib import Path
-from typing import TYPE_CHECKING, Literal, Protocol, cast
-
-from . import __version__, _coerce
import ui_messages as ui -from ._cli_args import build_parser -from ._cli_baselines import ( - CloneBaselineState as _CloneBaselineStateImpl, -) -from ._cli_baselines import ( - MetricsBaselineSectionProbe as _MetricsBaselineSectionProbeImpl, -) -from ._cli_baselines import ( - MetricsBaselineState as _MetricsBaselineStateImpl, -) -from ._cli_baselines import ( - probe_metrics_baseline_section as _probe_metrics_baseline_section_impl, -) -from ._cli_baselines import ( - resolve_clone_baseline_state as _resolve_clone_baseline_state_impl, -) -from ._cli_baselines import ( - resolve_metrics_baseline_state as _resolve_metrics_baseline_state_impl, -) -from ._cli_config import ( - ConfigValidationError, - apply_pyproject_config_overrides, - collect_explicit_cli_dests, - load_pyproject_config, -) -from ._cli_gating import ( - parse_metric_reason_entry as _parse_metric_reason_entry_impl, -) -from ._cli_gating import ( - print_gating_failure_block as _print_gating_failure_block_impl, -) -from ._cli_paths import _validate_output_path -from ._cli_reports import ( - write_report_outputs as _write_report_outputs_impl, -) -from ._cli_rich import ( - PlainConsole as _PlainConsole, -) -from ._cli_rich import ( - make_console as _make_rich_console, -) -from ._cli_rich import ( - make_plain_console as _make_plain_console_impl, -) -from ._cli_rich import ( - print_banner as _print_banner_impl, -) -from ._cli_rich import ( - rich_progress_symbols as _rich_progress_symbols_impl, -) -from ._cli_runtime import ( - configure_metrics_mode as _configure_metrics_mode_impl, -) -from ._cli_runtime import ( - metrics_computed as _metrics_computed_impl, -) -from ._cli_runtime import ( - print_failed_files as _print_failed_files_impl, -) -from ._cli_runtime import ( - resolve_cache_path as _resolve_cache_path_impl, -) -from ._cli_runtime import ( - resolve_cache_status as _resolve_cache_status_impl, -) -from ._cli_runtime import ( - validate_numeric_args as _validate_numeric_args_impl, -) -from ._cli_summary import ( - ChangedScopeSnapshot, - MetricsSnapshot, - _print_changed_scope, - _print_metrics, - _print_summary, -) -from ._git_diff import validate_git_diff_ref -from .baseline import Baseline -from .cache import Cache, CacheStatus, build_segment_report_projection -from .contracts import ( - DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - ISSUES_URL, - ExitCode, -) -from .errors import CacheError - -if TYPE_CHECKING: - from argparse import Namespace - from collections.abc import Callable, Mapping, Sequence - from types import ModuleType - - from rich.console import Console as RichConsole - from rich.progress import BarColumn as RichBarColumn - from rich.progress import Progress as RichProgress - from rich.progress import SpinnerColumn as RichSpinnerColumn - from rich.progress import TextColumn as RichTextColumn - from rich.progress import TimeElapsedColumn as RichTimeElapsedColumn - - from ._cli_baselines import _BaselineArgs as _BaselineArgsLike - from ._cli_gating import _GatingArgs as _GatingArgsLike - from ._cli_reports import _QuietArgs as _QuietArgsLike - from ._cli_runtime import _RuntimeArgs as _RuntimeArgsLike - from .models import MetricsDiff - from .normalize import NormalizationConfig - from .pipeline import ( - AnalysisResult, - BootstrapResult, - DiscoveryResult, - GatingResult, - ReportArtifacts, - ) - from .pipeline import ( - OutputPaths as PipelineOutputPaths, - ) - from .pipeline import ( - ProcessingResult as 
PipelineProcessingResult, - ) - -MAX_FILE_SIZE = 10 * 1024 * 1024 -__all__ = [ - "MAX_FILE_SIZE", - "ExitCode", - "ProcessingResult", - "analyze", - "bootstrap", - "discover", - "gate", - "main", - "process", - "process_file", - "report", -] - -# Lazy singleton for pipeline module — deferred import to keep CLI startup fast. -# Tests monkeypatch this via _pipeline_module() to inject mocks. -_PIPELINE_MODULE: ModuleType | None = None - - -def _pipeline_module() -> ModuleType: - global _PIPELINE_MODULE - if _PIPELINE_MODULE is None: - from . import pipeline as _pipeline - - _PIPELINE_MODULE = _pipeline - return _PIPELINE_MODULE - - -@dataclass(frozen=True, slots=True) -class OutputPaths: - html: Path | None = None - json: Path | None = None - text: Path | None = None - md: Path | None = None - sarif: Path | None = None - - -@dataclass(frozen=True, slots=True) -class ProcessingResult: - filepath: str - success: bool - error: str | None = None - units: list[object] | None = None - blocks: list[object] | None = None - segments: list[object] | None = None - lines: int = 0 - functions: int = 0 - methods: int = 0 - classes: int = 0 - stat: Mapping[str, int] | None = None - error_kind: str | None = None - file_metrics: object | None = None - structural_findings: list[object] | None = None - - -@dataclass(frozen=True, slots=True) -class ChangedCloneGate: - changed_paths: tuple[str, ...] - new_func: frozenset[str] - new_block: frozenset[str] - total_clone_groups: int - findings_total: int - findings_new: int - findings_known: int - - -_as_mapping = _coerce.as_mapping -_as_int = _coerce.as_int -_as_sequence = _coerce.as_sequence - - -def _validate_changed_scope_args(*, args: Namespace) -> str | None: - if args.diff_against and args.paths_from_git_diff: - console.print( - ui.fmt_contract_error( - "Use --diff-against or --paths-from-git-diff, not both." - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - if args.paths_from_git_diff: - args.changed_only = True - return str(args.paths_from_git_diff) - if args.diff_against and not args.changed_only: - console.print(ui.fmt_contract_error("--diff-against requires --changed-only.")) - sys.exit(ExitCode.CONTRACT_ERROR) - if args.changed_only and not args.diff_against: - console.print( - ui.fmt_contract_error( - "--changed-only requires --diff-against or --paths-from-git-diff." - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - return str(args.diff_against) if args.diff_against else None - - -def _normalize_changed_paths( - *, - root_path: Path, - paths: Sequence[str], -) -> tuple[str, ...]: - normalized: set[str] = set() - for raw_path in paths: - candidate = raw_path.strip() - if not candidate: - continue - candidate_path = Path(candidate) - try: - absolute_path = ( - candidate_path.resolve() - if candidate_path.is_absolute() - else (root_path / candidate_path).resolve() - ) - except OSError as exc: - console.print( - ui.fmt_contract_error( - f"Unable to resolve changed path '{candidate}': {exc}" - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - try: - relative_path = absolute_path.relative_to(root_path) - except ValueError: - console.print( - ui.fmt_contract_error( - f"Changed path '{candidate}' is outside the scan root." 
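                    # (relative_to() raising ValueError above means the
                    # candidate resolved outside root_path, so diff-scoped
                    # runs cannot pull files in from beyond the scan root.)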
- ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - cleaned = str(relative_path).replace("\\", "/").strip("/") - if cleaned: - normalized.add(cleaned) - return tuple(sorted(normalized)) - - -def _git_diff_changed_paths(*, root_path: Path, git_diff_ref: str) -> tuple[str, ...]: - try: - validated_ref = validate_git_diff_ref(git_diff_ref) - except ValueError as exc: - console.print(ui.fmt_contract_error(str(exc))) - sys.exit(ExitCode.CONTRACT_ERROR) - try: - completed = subprocess.run( - ["git", "diff", "--name-only", validated_ref, "--"], - cwd=str(root_path), - check=True, - capture_output=True, - text=True, - timeout=30, - ) - except ( - FileNotFoundError, - subprocess.CalledProcessError, - subprocess.TimeoutExpired, - ) as exc: - console.print( - ui.fmt_contract_error( - "Unable to resolve changed files from git diff ref " - f"'{validated_ref}': {exc}" - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - lines = [line.strip() for line in completed.stdout.splitlines() if line.strip()] - return _normalize_changed_paths(root_path=root_path, paths=lines) - - -def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool: - return any( - relative_path == candidate or relative_path.startswith(candidate + "/") - for candidate in changed_paths - ) - - -def _flatten_report_findings( - report_document: Mapping[str, object], -) -> list[dict[str, object]]: - findings = _as_mapping(report_document.get("findings")) - groups = _as_mapping(findings.get("groups")) - clone_groups = _as_mapping(groups.get("clones")) - return [ - *[ - dict(_as_mapping(item)) - for item in _as_sequence(clone_groups.get("functions")) - ], - *[dict(_as_mapping(item)) for item in _as_sequence(clone_groups.get("blocks"))], - *[ - dict(_as_mapping(item)) - for item in _as_sequence(clone_groups.get("segments")) - ], - *[ - dict(_as_mapping(item)) - for item in _as_sequence( - _as_mapping(groups.get("structural")).get("groups") - ) - ], - *[ - dict(_as_mapping(item)) - for item in _as_sequence(_as_mapping(groups.get("dead_code")).get("groups")) - ], - *[ - dict(_as_mapping(item)) - for item in _as_sequence(_as_mapping(groups.get("design")).get("groups")) - ], - ] - - -def _finding_touches_changed_paths( - finding: Mapping[str, object], - *, - changed_paths: Sequence[str], -) -> bool: - for item in _as_sequence(finding.get("items")): - relative_path = str(_as_mapping(item).get("relative_path", "")).strip() - if relative_path and _path_matches(relative_path, changed_paths): - return True - return False - - -def _changed_clone_gate_from_report( - report_document: Mapping[str, object], - *, - changed_paths: Sequence[str], -) -> ChangedCloneGate: - findings = [ - finding - for finding in _flatten_report_findings(report_document) - if _finding_touches_changed_paths(finding, changed_paths=changed_paths) - ] - clone_findings = [ - finding - for finding in findings - if str(finding.get("family", "")).strip() == "clone" - and str(finding.get("category", "")).strip() in {"function", "block"} - ] - new_func = frozenset( - str(finding.get("id", "")) - for finding in clone_findings - if str(finding.get("category", "")).strip() == "function" - and str(finding.get("novelty", "")).strip() == "new" - ) - new_block = frozenset( - str(finding.get("id", "")) - for finding in clone_findings - if str(finding.get("category", "")).strip() == "block" - and str(finding.get("novelty", "")).strip() == "new" - ) - findings_new = sum( - 1 for finding in findings if str(finding.get("novelty", "")).strip() == "new" - ) - findings_known = sum( - 1 for finding in 
findings if str(finding.get("novelty", "")).strip() == "known" - ) - return ChangedCloneGate( - changed_paths=tuple(changed_paths), - new_func=new_func, - new_block=new_block, - total_clone_groups=len(clone_findings), - findings_total=len(findings), - findings_new=findings_new, - findings_known=findings_known, - ) - - -def process_file( - filepath: str, - root: str, - cfg: NormalizationConfig, - min_loc: int, - min_stmt: int, - collect_structural_findings: bool = True, -) -> ProcessingResult: - pipeline_mod = _pipeline_module() - result = pipeline_mod.process_file( - filepath, - root, - cfg, - min_loc, - min_stmt, - collect_structural_findings, - ) - return cast("ProcessingResult", result) - - -def bootstrap( - *, - args: Namespace, - root: Path, - output_paths: PipelineOutputPaths | OutputPaths, - cache_path: Path, -) -> BootstrapResult: - return cast( - "BootstrapResult", - _pipeline_module().bootstrap( - args=args, - root=root, - output_paths=output_paths, - cache_path=cache_path, - ), - ) - - -def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: - return cast("DiscoveryResult", _pipeline_module().discover(boot=boot, cache=cache)) - - -def process( - *, - boot: BootstrapResult, - discovery: DiscoveryResult, - cache: Cache, - on_advance: Callable[[], None] | None = None, - on_worker_error: Callable[[str], None] | None = None, - on_parallel_fallback: Callable[[Exception], None] | None = None, -) -> PipelineProcessingResult: - return cast( - "PipelineProcessingResult", - _pipeline_module().process( - boot=boot, - discovery=discovery, - cache=cache, - on_advance=on_advance, - on_worker_error=on_worker_error, - on_parallel_fallback=on_parallel_fallback, - ), - ) - - -def analyze( - *, - boot: BootstrapResult, - discovery: DiscoveryResult, - processing: PipelineProcessingResult, -) -> AnalysisResult: - return cast( - "AnalysisResult", - _pipeline_module().analyze( - boot=boot, - discovery=discovery, - processing=processing, - ), - ) - - -def report( - *, - boot: BootstrapResult, - discovery: DiscoveryResult, - processing: PipelineProcessingResult, - analysis: AnalysisResult, - report_meta: Mapping[str, object], - new_func: set[str], - new_block: set[str], - html_builder: Callable[..., str] | None = None, - metrics_diff: MetricsDiff | None = None, - coverage_adoption_diff_available: bool = False, - api_surface_diff_available: bool = False, - include_report_document: bool = False, -) -> ReportArtifacts: - return cast( - "ReportArtifacts", - _pipeline_module().report( - boot=boot, - discovery=discovery, - processing=processing, - analysis=analysis, - report_meta=report_meta, - new_func=new_func, - new_block=new_block, - html_builder=html_builder, - metrics_diff=metrics_diff, - coverage_adoption_diff_available=coverage_adoption_diff_available, - api_surface_diff_available=api_surface_diff_available, - include_report_document=include_report_document, - ), - ) - - -def gate( - *, - boot: BootstrapResult, - analysis: AnalysisResult, - new_func: set[str], - new_block: set[str], - metrics_diff: MetricsDiff | None, -) -> GatingResult: - return cast( - "GatingResult", - _pipeline_module().gate( - boot=boot, - analysis=analysis, - new_func=new_func, - new_block=new_block, - metrics_diff=metrics_diff, - ), - ) - - -class _PrinterLike(Protocol): - def print(self, *objects: object, **kwargs: object) -> None: ... 
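
# Illustrative sketch (annotation, not part of the removed module): the lazy
# _PIPELINE_MODULE singleton defined earlier in this module is the seam tests
# use to inject a fake pipeline. Assumes pytest's `monkeypatch` fixture; the
# fake object and the test name below are hypothetical.
import types

def test_process_file_uses_injected_pipeline(monkeypatch):
    import codeclone.cli as cli

    fake_pipeline = types.SimpleNamespace(
        process_file=lambda *args, **kwargs: "sentinel",
    )
    # _pipeline_module() returns the cached module object when it is already
    # set, so patching the cache routes every facade call to the fake.
    monkeypatch.setattr(cli, "_PIPELINE_MODULE", fake_pipeline)

    # cfg is unused by the fake, so None stands in for a NormalizationConfig.
    result = cli.process_file("pkg/mod.py", ".", None, min_loc=1, min_stmt=1)
    assert result == "sentinel"
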
- - -LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser() -ReportPathOrigin = Literal["default", "explicit"] - - -def _rich_progress_symbols() -> tuple[ - type[RichProgress], - type[RichSpinnerColumn], - type[RichTextColumn], - type[RichBarColumn], - type[RichTimeElapsedColumn], -]: - return _rich_progress_symbols_impl() - - -def _make_console(*, no_color: bool) -> RichConsole: - return _make_rich_console( - no_color=no_color, - width=ui.CLI_LAYOUT_MAX_WIDTH, - ) - - -def _print_verbose_clone_hashes( - console: _PrinterLike, - *, - label: str, - clone_hashes: set[str], -) -> None: - if not clone_hashes: - return - console.print(f"\n {label}:") - for clone_hash in sorted(clone_hashes): - console.print(f" - {clone_hash}") - - -def _make_plain_console() -> _PlainConsole: - return _make_plain_console_impl() - - -console: RichConsole | _PlainConsole = _make_plain_console() - - -def _parse_metric_reason_entry(reason: str) -> tuple[str, str]: - return _parse_metric_reason_entry_impl(reason) - - -def _print_gating_failure_block( - *, - code: str, - entries: Sequence[tuple[str, object]], - args: Namespace, -) -> None: - _print_gating_failure_block_impl( - console=cast("_PrinterLike", console), - code=code, - entries=list(entries), - args=cast("_GatingArgsLike", cast(object, args)), - ) - - -def build_html_report(*args: object, **kwargs: object) -> str: - # Lazy import avoids pulling HTML renderer in non-HTML CLI runs. - from .html_report import build_html_report as _build_html_report - - html_builder: Callable[..., str] = _build_html_report - return html_builder(*args, **kwargs) - - -_CloneBaselineState = _CloneBaselineStateImpl -_MetricsBaselineState = _MetricsBaselineStateImpl -_MetricsBaselineSectionProbe = _MetricsBaselineSectionProbeImpl - - -def print_banner(*, root: Path | None = None) -> None: - _print_banner_impl( - console=cast("_PrinterLike", console), - banner_title=ui.banner_title(__version__), - project_name=(root.name if root is not None else None), - root_display=(str(root) if root is not None else None), - ) - - -def _is_debug_enabled( - *, - argv: Sequence[str] | None = None, - environ: Mapping[str, str] | None = None, -) -> bool: - args = list(sys.argv[1:] if argv is None else argv) - debug_from_flag = any(arg == "--debug" for arg in args) - env = os.environ if environ is None else environ - debug_from_env = env.get("CODECLONE_DEBUG") == "1" - return debug_from_flag or debug_from_env - - -def _report_path_origins(argv: Sequence[str]) -> dict[str, ReportPathOrigin | None]: - origins: dict[str, ReportPathOrigin | None] = { - "html": None, - "json": None, - "md": None, - "sarif": None, - "text": None, - } - flag_to_field = { - "--html": "html", - "--json": "json", - "--md": "md", - "--sarif": "sarif", - "--text": "text", - } - index = 0 - while index < len(argv): - token = argv[index] - if token == "--": - break - if "=" in token: - flag, _value = token.split("=", maxsplit=1) - field_name = flag_to_field.get(flag) - if field_name is not None: - origins[field_name] = "explicit" - index += 1 - continue - field_name = flag_to_field.get(token) - if field_name is None: - index += 1 - continue - next_token = argv[index + 1] if index + 1 < len(argv) else None - if next_token is None or next_token.startswith("-"): - origins[field_name] = "default" - index += 1 - continue - origins[field_name] = "explicit" - index += 2 - return origins - - -def _report_path_timestamp_slug(report_generated_at_utc: str) -> str: - return report_generated_at_utc.replace("-", "").replace(":", 
"") - - -def _timestamped_report_path(path: Path, *, report_generated_at_utc: str) -> Path: - suffix = path.suffix - stem = path.name[: -len(suffix)] if suffix else path.name - return path.with_name( - f"{stem}-{_report_path_timestamp_slug(report_generated_at_utc)}{suffix}" - ) - - -def _resolve_output_paths( - args: Namespace, - *, - report_path_origins: Mapping[str, ReportPathOrigin | None], - report_generated_at_utc: str, -) -> OutputPaths: - printer = cast("_PrinterLike", console) - resolved: dict[str, Path | None] = { - "html": None, - "json": None, - "md": None, - "sarif": None, - "text": None, - } - output_specs = ( - ("html", "html_out", ".html", "HTML"), - ("json", "json_out", ".json", "JSON"), - ("md", "md_out", ".md", "Markdown"), - ("sarif", "sarif_out", ".sarif", "SARIF"), - ("text", "text_out", ".txt", "text"), - ) - - for field_name, arg_name, expected_suffix, label in output_specs: - raw_value = getattr(args, arg_name, None) - if not raw_value: - continue - path = _validate_output_path( - raw_value, - expected_suffix=expected_suffix, - label=label, - console=printer, - invalid_message=ui.fmt_invalid_output_extension, - invalid_path_message=ui.fmt_invalid_output_path, - ) - if ( - args.timestamped_report_paths - and report_path_origins.get(field_name) == "default" - ): - path = _timestamped_report_path( - path, - report_generated_at_utc=report_generated_at_utc, - ) - resolved[field_name] = path - - return OutputPaths( - html=resolved["html"], - json=resolved["json"], - text=resolved["text"], - md=resolved["md"], - sarif=resolved["sarif"], - ) - - -def _validate_report_ui_flags(*, args: Namespace, output_paths: OutputPaths) -> None: - if args.open_html_report and output_paths.html is None: - console.print(ui.fmt_contract_error(ui.ERR_OPEN_HTML_REPORT_REQUIRES_HTML)) - sys.exit(ExitCode.CONTRACT_ERROR) - - if args.timestamped_report_paths and not any( - ( - output_paths.html, - output_paths.json, - output_paths.md, - output_paths.sarif, - output_paths.text, - ) - ): - console.print( - ui.fmt_contract_error(ui.ERR_TIMESTAMPED_REPORT_PATHS_REQUIRES_REPORT) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - -def _resolve_cache_path(*, root_path: Path, args: Namespace, from_args: bool) -> Path: - return _resolve_cache_path_impl( - root_path=root_path, - args=cast("_RuntimeArgsLike", cast(object, args)), - from_args=from_args, - legacy_cache_path=LEGACY_CACHE_PATH, - console=cast("_PrinterLike", console), - ) - - -def _validate_numeric_args(args: Namespace) -> bool: - return _validate_numeric_args_impl(cast("_RuntimeArgsLike", cast(object, args))) - - -def _configure_metrics_mode(*, args: Namespace, metrics_baseline_exists: bool) -> None: - _configure_metrics_mode_impl( - args=cast("_RuntimeArgsLike", cast(object, args)), - metrics_baseline_exists=metrics_baseline_exists, - console=cast("_PrinterLike", console), - ) - - -def _print_failed_files(failed_files: Sequence[str]) -> None: - _print_failed_files_impl( - failed_files=tuple(failed_files), - console=cast("_PrinterLike", console), - ) - - -def _metrics_computed(args: Namespace) -> tuple[str, ...]: - return _metrics_computed_impl(cast("_RuntimeArgsLike", cast(object, args))) - - -def _probe_metrics_baseline_section(path: Path) -> _MetricsBaselineSectionProbe: - return _probe_metrics_baseline_section_impl(path) - - -def _resolve_clone_baseline_state( - *, - args: Namespace, - baseline_path: Path, - baseline_exists: bool, - analysis: AnalysisResult, - shared_baseline_payload: dict[str, object] | None = None, -) -> _CloneBaselineState: - 
return _resolve_clone_baseline_state_impl( - args=cast("_BaselineArgsLike", cast(object, args)), - baseline_path=baseline_path, - baseline_exists=baseline_exists, - func_groups=analysis.func_groups, - block_groups=analysis.block_groups, - codeclone_version=__version__, - console=cast("_PrinterLike", console), - shared_baseline_payload=shared_baseline_payload, - ) - - -def _resolve_metrics_baseline_state( - *, - args: Namespace, - metrics_baseline_path: Path, - metrics_baseline_exists: bool, - baseline_updated_path: Path | None, - analysis: AnalysisResult, - shared_baseline_payload: dict[str, object] | None = None, -) -> _MetricsBaselineState: - return _resolve_metrics_baseline_state_impl( - args=cast("_BaselineArgsLike", cast(object, args)), - metrics_baseline_path=metrics_baseline_path, - metrics_baseline_exists=metrics_baseline_exists, - baseline_updated_path=baseline_updated_path, - project_metrics=analysis.project_metrics, - console=cast("_PrinterLike", console), - shared_baseline_payload=shared_baseline_payload, - ) - - -def _resolve_cache_status(cache: Cache) -> tuple[CacheStatus, str | None]: - return _resolve_cache_status_impl(cache) - - -def _cache_update_segment_projection(cache: Cache, analysis: AnalysisResult) -> None: - if not hasattr(cache, "segment_report_projection"): - return - new_projection = build_segment_report_projection( - digest=analysis.segment_groups_raw_digest, - suppressed=analysis.suppressed_segment_groups, - groups=analysis.segment_groups, - ) - if new_projection != cache.segment_report_projection: - cache.segment_report_projection = new_projection - cache._dirty = True - - -def _run_analysis_stages( - *, - args: Namespace, - boot: BootstrapResult, - cache: Cache, -) -> tuple[DiscoveryResult, PipelineProcessingResult, AnalysisResult]: - def _require_rich_console( - value: RichConsole | _PlainConsole, - ) -> RichConsole: - if isinstance(value, _PlainConsole): - raise RuntimeError("Rich console is required when progress UI is enabled.") - return value - - use_status = not args.quiet and not args.no_progress - try: - if use_status: - with console.status(ui.STATUS_DISCOVERING, spinner="dots"): - discovery_result = discover(boot=boot, cache=cache) - else: - discovery_result = discover(boot=boot, cache=cache) - except OSError as exc: - console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=exc))) - sys.exit(ExitCode.CONTRACT_ERROR) - - for warning in discovery_result.skipped_warnings: - console.print(f"[warning]{warning}[/warning]") - - total_files = len(discovery_result.files_to_process) - if total_files > 0 and not args.quiet and args.no_progress: - console.print(ui.fmt_processing_changed(total_files)) - - if total_files > 0 and not args.no_progress: - ( - progress_cls, - spinner_column_cls, - text_column_cls, - bar_column_cls, - time_elapsed_column_cls, - ) = _rich_progress_symbols() - - with progress_cls( - spinner_column_cls(), - text_column_cls("[progress.description]{task.description}"), - bar_column_cls(), - text_column_cls("[progress.percentage]{task.percentage:>3.0f}%"), - time_elapsed_column_cls(), - console=_require_rich_console(console), - ) as progress_ui: - task_id = progress_ui.add_task( - f"Analyzing {total_files} files...", - total=total_files, - ) - processing_result = process( - boot=boot, - discovery=discovery_result, - cache=cache, - on_advance=lambda: progress_ui.advance(task_id), - on_worker_error=lambda reason: console.print( - ui.fmt_worker_failed(reason) - ), - on_parallel_fallback=lambda exc: console.print( - 
ui.fmt_parallel_fallback(exc) - ), - ) - else: - processing_result = process( - boot=boot, - discovery=discovery_result, - cache=cache, - on_worker_error=( - (lambda reason: console.print(ui.fmt_batch_item_failed(reason))) - if args.no_progress - else (lambda reason: console.print(ui.fmt_worker_failed(reason))) - ), - on_parallel_fallback=lambda exc: console.print( - ui.fmt_parallel_fallback(exc) - ), - ) - - _print_failed_files(processing_result.failed_files) - # Keep unreadable-source diagnostics visible in normal mode even if - # failed_files was filtered/empty due upstream transport differences. - if not processing_result.failed_files and processing_result.source_read_failures: - _print_failed_files(processing_result.source_read_failures) - - if use_status: - with console.status(ui.STATUS_GROUPING, spinner="dots"): - analysis_result = analyze( - boot=boot, - discovery=discovery_result, - processing=processing_result, - ) - _cache_update_segment_projection(cache, analysis_result) - try: - cache.save() - except CacheError as exc: - console.print(ui.fmt_cache_save_failed(exc)) - else: - analysis_result = analyze( - boot=boot, - discovery=discovery_result, - processing=processing_result, - ) - _cache_update_segment_projection(cache, analysis_result) - try: - cache.save() - except CacheError as exc: - console.print(ui.fmt_cache_save_failed(exc)) - - coverage_join = getattr(analysis_result, "coverage_join", None) - if ( - coverage_join is not None - and coverage_join.status != "ok" - and coverage_join.invalid_reason - ): - console.print(ui.fmt_coverage_join_ignored(coverage_join.invalid_reason)) - - return discovery_result, processing_result, analysis_result - - -def _write_report_outputs( - *, - args: Namespace, - output_paths: OutputPaths, - report_artifacts: ReportArtifacts, - open_html_report: bool = False, -) -> str | None: - return _write_report_outputs_impl( - args=cast("_QuietArgsLike", cast(object, args)), - output_paths=output_paths, - report_artifacts=report_artifacts, - console=cast("_PrinterLike", console), - open_html_report=open_html_report, - ) - - -def _enforce_gating( - *, - args: Namespace, - boot: BootstrapResult, - analysis: AnalysisResult, - processing: PipelineProcessingResult, - source_read_contract_failure: bool, - baseline_failure_code: ExitCode | None, - metrics_baseline_failure_code: ExitCode | None, - new_func: set[str], - new_block: set[str], - metrics_diff: MetricsDiff | None, - html_report_path: str | None, - clone_threshold_total: int | None = None, -) -> None: - if source_read_contract_failure: - console.print( - ui.fmt_contract_error( - ui.fmt_unreadable_source_in_gating( - count=len(processing.source_read_failures) - ) - ) - ) - for failure in processing.source_read_failures[:10]: - console.print(f" • {failure}") - if len(processing.source_read_failures) > 10: - console.print(f" ... and {len(processing.source_read_failures) - 10} more") - sys.exit(ExitCode.CONTRACT_ERROR) - - if baseline_failure_code is not None: - console.print(ui.fmt_contract_error(ui.ERR_BASELINE_GATING_REQUIRES_TRUSTED)) - sys.exit(baseline_failure_code) - - if metrics_baseline_failure_code is not None: - console.print( - ui.fmt_contract_error( - "Metrics baseline is untrusted or missing for requested metrics gating." - ) - ) - sys.exit(metrics_baseline_failure_code) - - if bool(getattr(args, "fail_on_untested_hotspots", False)): - if analysis.coverage_join is None: - console.print( - ui.fmt_contract_error( - "--fail-on-untested-hotspots requires --coverage." 
- ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - if analysis.coverage_join.status != "ok": - detail = analysis.coverage_join.invalid_reason or "invalid coverage input" - console.print( - ui.fmt_contract_error( - "Coverage gating requires a valid Cobertura XML input.\n" - f"Reason: {detail}" - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - gate_result = gate( - boot=boot, - analysis=analysis, - new_func=new_func, - new_block=new_block, - metrics_diff=metrics_diff, - ) - if clone_threshold_total is not None: - reasons = [ - reason - for reason in gate_result.reasons - if not reason.startswith("clone:threshold:") - ] - if 0 <= args.fail_threshold < clone_threshold_total: - reasons.append( - f"clone:threshold:{clone_threshold_total}:{args.fail_threshold}" - ) - gate_result = cast( - "GatingResult", - _pipeline_module().GatingResult( - exit_code=( - int(ExitCode.GATING_FAILURE) if reasons else int(ExitCode.SUCCESS) - ), - reasons=tuple(reasons), - ), - ) - - metric_reasons = [ - reason[len("metric:") :] - for reason in gate_result.reasons - if reason.startswith("metric:") - ] - if metric_reasons: - _print_gating_failure_block( - code="metrics", - entries=[_parse_metric_reason_entry(reason) for reason in metric_reasons], - args=args, - ) - sys.exit(ExitCode.GATING_FAILURE) - - if "clone:new" in gate_result.reasons: - default_report = Path(".cache/codeclone/report.html") - resolved_html_report_path = html_report_path - if resolved_html_report_path is None and default_report.exists(): - resolved_html_report_path = str(default_report) - - clone_entries: list[tuple[str, object]] = [ - ("new_function_clone_groups", len(new_func)), - ("new_block_clone_groups", len(new_block)), - ] - if resolved_html_report_path: - clone_entries.append(("report", resolved_html_report_path)) - clone_entries.append(("accept", "codeclone . 
--update-baseline")) - _print_gating_failure_block( - code="new-clones", - entries=clone_entries, - args=args, - ) - - if args.verbose: - _print_verbose_clone_hashes( - cast("_PrinterLike", console), - label="Function clone hashes", - clone_hashes=new_func, - ) - _print_verbose_clone_hashes( - cast("_PrinterLike", console), - label="Block clone hashes", - clone_hashes=new_block, - ) - - sys.exit(ExitCode.GATING_FAILURE) - - threshold_reason = next( - ( - reason - for reason in gate_result.reasons - if reason.startswith("clone:threshold:") - ), - None, - ) - if threshold_reason is not None: - _, _, total_raw, threshold_raw = threshold_reason.split(":", maxsplit=3) - total = int(total_raw) - threshold = int(threshold_raw) - _print_gating_failure_block( - code="threshold", - entries=( - ("clone_groups_total", total), - ("clone_groups_limit", threshold), - ), - args=args, - ) - sys.exit(ExitCode.GATING_FAILURE) - - -def _main_impl() -> None: - global console - - run_started_at = time.monotonic() - from ._cli_meta import _build_report_meta, _current_report_timestamp_utc - - analysis_started_at_utc = _current_report_timestamp_utc() - ap = build_parser(__version__) - - def _resolve_runtime_path_arg( - *, - root_path: Path, - raw_path: str, - from_cli: bool, - ) -> Path: - candidate_path = Path(raw_path).expanduser() - if from_cli or candidate_path.is_absolute(): - return candidate_path.resolve() - return (root_path / candidate_path).resolve() - - def _prepare_run_inputs() -> tuple[ - Namespace, - Path, - Path, - bool, - Path, - bool, - OutputPaths, - Path, - dict[str, object] | None, - tuple[str, ...], - str, - str, - ]: - global console - raw_argv = tuple(sys.argv[1:]) - explicit_cli_dests = collect_explicit_cli_dests(ap, argv=raw_argv) - report_path_origins = _report_path_origins(raw_argv) - report_generated_at_utc = _current_report_timestamp_utc() - cache_path_from_args = any( - arg in {"--cache-dir", "--cache-path"} - or arg.startswith(("--cache-dir=", "--cache-path=")) - for arg in sys.argv - ) - baseline_path_from_args = any( - arg == "--baseline" or arg.startswith("--baseline=") for arg in sys.argv - ) - metrics_path_from_args = any( - arg == "--metrics-baseline" or arg.startswith("--metrics-baseline=") - for arg in sys.argv - ) - args = ap.parse_args() - - try: - root_path = Path(args.root).resolve() - if not root_path.exists(): - console.print( - ui.fmt_contract_error(ui.ERR_ROOT_NOT_FOUND.format(path=root_path)) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - except OSError as exc: - console.print( - ui.fmt_contract_error(ui.ERR_INVALID_ROOT_PATH.format(error=exc)) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - try: - pyproject_config = load_pyproject_config(root_path) - except ConfigValidationError as exc: - console.print(ui.fmt_contract_error(str(exc))) - sys.exit(ExitCode.CONTRACT_ERROR) - apply_pyproject_config_overrides( - args=args, - config_values=pyproject_config, - explicit_cli_dests=explicit_cli_dests, - ) - git_diff_ref = _validate_changed_scope_args(args=args) - changed_paths = ( - _git_diff_changed_paths(root_path=root_path, git_diff_ref=git_diff_ref) - if git_diff_ref is not None - else () - ) - if args.debug: - os.environ["CODECLONE_DEBUG"] = "1" - - if args.ci: - args.fail_on_new = True - args.no_color = True - args.quiet = True - - console = ( - _make_plain_console() - if args.quiet - else _make_console(no_color=args.no_color) - ) - - if not _validate_numeric_args(args): - console.print( - ui.fmt_contract_error( - "Size limits must be non-negative integers (MB), " - "threshold 
flags must be >= 0 or -1, and coverage thresholds " - "must be between 0 and 100." - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - baseline_arg_path = Path(args.baseline).expanduser() - try: - baseline_path = _resolve_runtime_path_arg( - root_path=root_path, - raw_path=args.baseline, - from_cli=baseline_path_from_args, - ) - baseline_exists = baseline_path.exists() - except OSError as exc: - console.print( - ui.fmt_contract_error( - ui.fmt_invalid_baseline_path(path=baseline_arg_path, error=exc) - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - shared_baseline_payload: dict[str, object] | None = None - default_metrics_baseline = ap.get_default("metrics_baseline") - metrics_path_overridden = metrics_path_from_args or ( - args.metrics_baseline != default_metrics_baseline - ) - metrics_baseline_arg_path = Path( - args.metrics_baseline if metrics_path_overridden else args.baseline - ).expanduser() - try: - metrics_baseline_path = _resolve_runtime_path_arg( - root_path=root_path, - raw_path=( - args.metrics_baseline if metrics_path_overridden else args.baseline - ), - from_cli=metrics_path_from_args, - ) - if metrics_baseline_path == baseline_path: - probe = _probe_metrics_baseline_section(metrics_baseline_path) - metrics_baseline_exists = probe.has_metrics_section - shared_baseline_payload = probe.payload - else: - metrics_baseline_exists = metrics_baseline_path.exists() - except OSError as exc: - console.print( - ui.fmt_contract_error( - ui.fmt_invalid_baseline_path( - path=metrics_baseline_arg_path, - error=exc, - ) - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - if ( - args.update_baseline - and not args.skip_metrics - and not args.update_metrics_baseline - ): - args.update_metrics_baseline = True - _configure_metrics_mode( - args=args, - metrics_baseline_exists=metrics_baseline_exists, - ) - if ( - args.update_metrics_baseline - and metrics_baseline_path == baseline_path - and not baseline_exists - and not args.update_baseline - ): - # Unified baseline needs clone payload before metrics can be embedded. 
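            # (Hence the implicit flip below: updating the metrics section of
            # a unified baseline file that does not exist yet forces a full
            # --update-baseline so the clone payload is written first.)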
- args.update_baseline = True - - if args.quiet: - args.no_progress = True - - if not args.quiet: - print_banner(root=root_path) - - output_paths = _resolve_output_paths( - args, - report_path_origins=report_path_origins, - report_generated_at_utc=report_generated_at_utc, - ) - _validate_report_ui_flags(args=args, output_paths=output_paths) - cache_path = _resolve_cache_path( - root_path=root_path, - args=args, - from_args=cache_path_from_args, - ) - return ( - args, - root_path, - baseline_path, - baseline_exists, - metrics_baseline_path, - metrics_baseline_exists, - output_paths, - cache_path, - shared_baseline_payload, - changed_paths, - analysis_started_at_utc, - report_generated_at_utc, - ) - - ( - args, - root_path, - baseline_path, - baseline_exists, - metrics_baseline_path, - metrics_baseline_exists, - output_paths, - cache_path, - shared_baseline_payload, - changed_paths, - analysis_started_at_utc, - report_generated_at_utc, - ) = _prepare_run_inputs() - - cache = Cache( - cache_path, - root=root_path, - max_size_bytes=args.max_cache_size_mb * 1024 * 1024, - min_loc=args.min_loc, - min_stmt=args.min_stmt, - block_min_loc=args.block_min_loc, - block_min_stmt=args.block_min_stmt, - segment_min_loc=args.segment_min_loc, - segment_min_stmt=args.segment_min_stmt, - collect_api_surface=bool(args.api_surface), - ) - cache.load() - if cache.load_warning: - console.print(f"[warning]{cache.load_warning}[/warning]") - - boot = bootstrap( - args=args, - root=root_path, - output_paths=output_paths, - cache_path=cache_path, - ) - discovery_result, processing_result, analysis_result = _run_analysis_stages( - args=args, - boot=boot, - cache=cache, - ) - - gating_mode = ( - args.fail_on_new - or args.fail_threshold >= 0 - or args.fail_complexity >= 0 - or args.fail_coupling >= 0 - or args.fail_cohesion >= 0 - or args.fail_cycles - or args.fail_dead_code - or args.fail_health >= 0 - or args.fail_on_new_metrics - or args.fail_on_typing_regression - or args.fail_on_docstring_regression - or args.fail_on_api_break - or args.min_typing_coverage >= 0 - or args.min_docstring_coverage >= 0 - ) - source_read_contract_failure = ( - bool(processing_result.source_read_failures) - and gating_mode - and not args.update_baseline - ) - baseline_state = _resolve_clone_baseline_state( - args=args, - baseline_path=baseline_path, - baseline_exists=baseline_exists, - analysis=analysis_result, - shared_baseline_payload=( - shared_baseline_payload if metrics_baseline_path == baseline_path else None - ), - ) - metrics_baseline_state = _resolve_metrics_baseline_state( - args=args, - metrics_baseline_path=metrics_baseline_path, - metrics_baseline_exists=metrics_baseline_exists, - baseline_updated_path=baseline_state.updated_path, - analysis=analysis_result, - shared_baseline_payload=( - shared_baseline_payload if metrics_baseline_path == baseline_path else None - ), - ) - - try: - report_cache_path = cache_path.resolve() - except OSError: - report_cache_path = cache_path - - cache_status, cache_schema_version = _resolve_cache_status(cache) - - report_meta = _build_report_meta( - codeclone_version=__version__, - scan_root=root_path, - baseline_path=baseline_path, - baseline=baseline_state.baseline, - baseline_loaded=baseline_state.loaded, - baseline_status=baseline_state.status.value, - cache_path=report_cache_path, - cache_used=cache_status == CacheStatus.OK, - cache_status=cache_status.value, - cache_schema_version=cache_schema_version, - files_skipped_source_io=len(processing_result.source_read_failures), - 
metrics_baseline_path=metrics_baseline_path, - metrics_baseline=metrics_baseline_state.baseline, - metrics_baseline_loaded=metrics_baseline_state.loaded, - metrics_baseline_status=metrics_baseline_state.status.value, - health_score=( - analysis_result.project_metrics.health.total - if analysis_result.project_metrics - else None - ), - health_grade=( - analysis_result.project_metrics.health.grade - if analysis_result.project_metrics - else None - ), - analysis_mode=("clones_only" if args.skip_metrics else "full"), - metrics_computed=_metrics_computed(args), - min_loc=args.min_loc, - min_stmt=args.min_stmt, - block_min_loc=args.block_min_loc, - block_min_stmt=args.block_min_stmt, - segment_min_loc=args.segment_min_loc, - segment_min_stmt=args.segment_min_stmt, - design_complexity_threshold=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - design_coupling_threshold=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - design_cohesion_threshold=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - analysis_started_at_utc=analysis_started_at_utc, - report_generated_at_utc=report_generated_at_utc, - ) - - baseline_for_diff = ( - baseline_state.baseline - if baseline_state.trusted_for_diff - else Baseline(baseline_path) - ) - new_func, new_block = baseline_for_diff.diff( - analysis_result.func_groups, - analysis_result.block_groups, - ) - new_clones_count = len(new_func) + len(new_block) - - metrics_diff: MetricsDiff | None = None - if ( - analysis_result.project_metrics is not None - and metrics_baseline_state.trusted_for_diff - ): - metrics_diff = metrics_baseline_state.baseline.diff( - analysis_result.project_metrics - ) - coverage_adoption_diff_available = bool( - metrics_baseline_state.trusted_for_diff - and getattr( - metrics_baseline_state.baseline, - "has_coverage_adoption_snapshot", - False, - ) - ) - api_surface_diff_available = bool( - metrics_baseline_state.trusted_for_diff - and getattr(metrics_baseline_state.baseline, "api_surface_snapshot", None) - is not None - ) - - _print_summary( - console=cast("_PrinterLike", console), - quiet=args.quiet, - files_found=discovery_result.files_found, - files_analyzed=processing_result.files_analyzed, - cache_hits=discovery_result.cache_hits, - files_skipped=processing_result.files_skipped, - analyzed_lines=( - processing_result.analyzed_lines - + int(getattr(discovery_result, "cached_lines", 0)) - ), - analyzed_functions=( - processing_result.analyzed_functions - + int(getattr(discovery_result, "cached_functions", 0)) - ), - analyzed_methods=( - processing_result.analyzed_methods - + int(getattr(discovery_result, "cached_methods", 0)) - ), - analyzed_classes=( - processing_result.analyzed_classes - + int(getattr(discovery_result, "cached_classes", 0)) - ), - func_clones_count=analysis_result.func_clones_count, - block_clones_count=analysis_result.block_clones_count, - segment_clones_count=analysis_result.segment_clones_count, - suppressed_golden_fixture_groups=len( - getattr(analysis_result, "suppressed_clone_groups", ()) - ), - suppressed_segment_groups=analysis_result.suppressed_segment_groups, - new_clones_count=new_clones_count, - ) - - if analysis_result.project_metrics is not None: - pm = analysis_result.project_metrics - metrics_payload_map = _as_mapping(analysis_result.metrics_payload) - overloaded_modules_summary = _as_mapping( - _as_mapping(metrics_payload_map.get("overloaded_modules")).get("summary") - ) - adoption_summary = _as_mapping( - _as_mapping(metrics_payload_map.get("coverage_adoption")).get("summary") - ) - api_surface_summary = _as_mapping( - 
_as_mapping(metrics_payload_map.get("api_surface")).get("summary") - ) - coverage_join_summary = _as_mapping( - _as_mapping(metrics_payload_map.get("coverage_join")).get("summary") - ) - overloaded_modules_summary_map = _as_mapping(overloaded_modules_summary) - coverage_join_source = str(coverage_join_summary.get("source", "")).strip() - _print_metrics( - console=cast("_PrinterLike", console), - quiet=args.quiet, - metrics=MetricsSnapshot( - complexity_avg=pm.complexity_avg, - complexity_max=pm.complexity_max, - high_risk_count=len(pm.high_risk_functions), - coupling_avg=pm.coupling_avg, - coupling_max=pm.coupling_max, - cohesion_avg=pm.cohesion_avg, - cohesion_max=pm.cohesion_max, - cycles_count=len(pm.dependency_cycles), - dead_code_count=len(pm.dead_code), - health_total=pm.health.total, - health_grade=pm.health.grade, - suppressed_dead_code_count=analysis_result.suppressed_dead_code_items, - overloaded_modules_candidates=_as_int( - overloaded_modules_summary_map.get("candidates") - ), - overloaded_modules_total=_as_int( - overloaded_modules_summary_map.get("total") - ), - overloaded_modules_population_status=str( - overloaded_modules_summary_map.get("population_status", "") - ), - overloaded_modules_top_score=_coerce.as_float( - overloaded_modules_summary_map.get("top_score") - ), - adoption_param_permille=( - _as_int(adoption_summary.get("param_permille")) - if adoption_summary - else None - ), - adoption_return_permille=( - _as_int(adoption_summary.get("return_permille")) - if adoption_summary - else None - ), - adoption_docstring_permille=( - _as_int(adoption_summary.get("docstring_permille")) - if adoption_summary - else None - ), - adoption_any_annotation_count=_as_int( - adoption_summary.get("typing_any_count") - ), - api_surface_enabled=bool(api_surface_summary.get("enabled")), - api_surface_modules=_as_int(api_surface_summary.get("modules")), - api_surface_public_symbols=_as_int( - api_surface_summary.get("public_symbols") - ), - api_surface_added=( - len(metrics_diff.new_api_symbols) - if metrics_diff is not None and api_surface_diff_available - else 0 - ), - api_surface_breaking=( - len(metrics_diff.new_api_breaking_changes) - if metrics_diff is not None and api_surface_diff_available - else 0 - ), - coverage_join_status=str( - coverage_join_summary.get("status", "") - ).strip(), - coverage_join_overall_permille=_as_int( - coverage_join_summary.get("overall_permille") - ), - coverage_join_coverage_hotspots=_as_int( - coverage_join_summary.get("coverage_hotspots") - ), - coverage_join_scope_gap_hotspots=_as_int( - coverage_join_summary.get("scope_gap_hotspots") - ), - coverage_join_threshold_percent=_as_int( - coverage_join_summary.get("hotspot_threshold_percent") - ), - coverage_join_source_label=( - Path(coverage_join_source).name if coverage_join_source else "" - ), - ), - ) - - report_artifacts = report( - boot=boot, - discovery=discovery_result, - processing=processing_result, - analysis=analysis_result, - report_meta=report_meta, - new_func=new_func, - new_block=new_block, - html_builder=build_html_report, - metrics_diff=metrics_diff, - coverage_adoption_diff_available=coverage_adoption_diff_available, - api_surface_diff_available=api_surface_diff_available, - include_report_document=bool(changed_paths), - ) - changed_clone_gate = ( - _changed_clone_gate_from_report( - report_artifacts.report_document or {}, - changed_paths=changed_paths, - ) - if args.changed_only and report_artifacts.report_document is not None - else None - ) - if changed_clone_gate is not None: - 
_print_changed_scope( - console=cast("_PrinterLike", console), - quiet=args.quiet, - changed_scope=ChangedScopeSnapshot( - paths_count=len(changed_clone_gate.changed_paths), - findings_total=changed_clone_gate.findings_total, - findings_new=changed_clone_gate.findings_new, - findings_known=changed_clone_gate.findings_known, - ), - ) - html_report_path = _write_report_outputs( - args=args, - output_paths=output_paths, - report_artifacts=report_artifacts, - open_html_report=args.open_html_report, - ) - - _enforce_gating( - args=args, - boot=boot, - analysis=analysis_result, - processing=processing_result, - source_read_contract_failure=source_read_contract_failure, - baseline_failure_code=baseline_state.failure_code, - metrics_baseline_failure_code=metrics_baseline_state.failure_code, - new_func=set(changed_clone_gate.new_func) if changed_clone_gate else new_func, - new_block=( - set(changed_clone_gate.new_block) if changed_clone_gate else new_block - ), - metrics_diff=metrics_diff, - html_report_path=html_report_path, - clone_threshold_total=( - changed_clone_gate.total_clone_groups if changed_clone_gate else None - ), - ) - - notice_new_clones_count = ( - len(changed_clone_gate.new_func) + len(changed_clone_gate.new_block) - if changed_clone_gate is not None - else new_clones_count - ) - if ( - not args.update_baseline - and not args.fail_on_new - and notice_new_clones_count > 0 - ): - console.print(ui.WARN_NEW_CLONES_WITHOUT_FAIL) - - if not args.quiet: - elapsed = time.monotonic() - run_started_at - console.print() - console.print(ui.fmt_pipeline_done(elapsed)) - - -def main() -> None: - try: - _main_impl() - except SystemExit: - raise - except Exception as exc: - console.print( - ui.fmt_internal_error( - exc, - issues_url=ISSUES_URL, - debug=_is_debug_enabled(), - ) - ) - sys.exit(ExitCode.INTERNAL_ERROR) - - -if __name__ == "__main__": - main() diff --git a/codeclone/config/__init__.py b/codeclone/config/__init__.py new file mode 100644 index 0000000..557317f --- /dev/null +++ b/codeclone/config/__init__.py @@ -0,0 +1,4 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 diff --git a/codeclone/config/argparse_builder.py b/codeclone/config/argparse_builder.py new file mode 100644 index 0000000..aec5a63 --- /dev/null +++ b/codeclone/config/argparse_builder.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +import argparse +import sys +from typing import NoReturn + +from .. 
import ui_messages as ui +from ..contracts import ExitCode, cli_help_epilog +from .spec import ARGUMENT_GROUP_TITLES, DEFAULTS_BY_DEST, OPTIONS, OptionSpec + + +class _ArgumentParser(argparse.ArgumentParser): + def error(self, message: str) -> NoReturn: + self.print_usage(sys.stderr) + self.exit( + int(ExitCode.CONTRACT_ERROR), + f"CONTRACT ERROR: {message}\n", + ) + + +class _HelpFormatter(argparse.RawTextHelpFormatter): + """Product-oriented help formatter extension point.""" + + +def _add_option( + group: argparse._ArgumentGroup, + *, + option: OptionSpec, + version: str, +) -> None: + if option.cli_kind == "positional": + group.add_argument( + option.dest, + nargs=option.nargs, + metavar=option.metavar, + help=option.help_text, + ) + return + + argument_kwargs: dict[str, object] = {"help": option.help_text} + + if option.cli_kind == "value": + argument_kwargs.update( + dest=option.dest, + nargs=option.nargs, + const=option.const, + metavar=option.metavar, + ) + if option.value_type is not None: + argument_kwargs["type"] = option.value_type + elif option.cli_kind == "optional_path": + argument_kwargs.update( + dest=option.dest, + nargs="?", + const=option.const, + metavar=option.metavar or "FILE", + ) + elif option.cli_kind == "bool_optional": + argument_kwargs.update( + action=argparse.BooleanOptionalAction, + default=argparse.SUPPRESS, + ) + elif option.cli_kind in {"store_true", "store_false"}: + argument_kwargs.update( + dest=option.dest, + action=option.cli_kind, + default=argparse.SUPPRESS, + ) + elif option.cli_kind == "help": + argument_kwargs["action"] = "help" + elif option.cli_kind == "version": + argument_kwargs.update( + action="version", + version=ui.version_output(version), + ) + else: + raise RuntimeError(f"Unsupported CLI option kind: {option.cli_kind}") + + group.add_argument(*option.flags, **argument_kwargs) # type: ignore[arg-type] + + +def build_parser(version: str) -> _ArgumentParser: + parser = _ArgumentParser( + prog="codeclone", + description="Structural code quality analysis for Python.", + add_help=False, + formatter_class=_HelpFormatter, + epilog=cli_help_epilog(), + ) + + for group_title in ARGUMENT_GROUP_TITLES: + argument_group = parser.add_argument_group(group_title) + for option in OPTIONS: + if option.group != group_title or option.cli_kind is None: + continue + _add_option( + argument_group, + option=option, + version=version, + ) + + parser.set_defaults(**DEFAULTS_BY_DEST) + return parser + + +__all__ = ["_ArgumentParser", "_HelpFormatter", "build_parser"] diff --git a/codeclone/config/pyproject_loader.py b/codeclone/config/pyproject_loader.py new file mode 100644 index 0000000..3c2cd20 --- /dev/null +++ b/codeclone/config/pyproject_loader.py @@ -0,0 +1,216 @@ +from __future__ import annotations + +import importlib +import sys +from pathlib import Path +from typing import TYPE_CHECKING + +from ..findings.clones.golden_fixtures import ( + GoldenFixturePatternError, + normalize_golden_fixture_patterns, +) +from .spec import CONFIG_KEY_SPECS, PATH_CONFIG_KEYS, ConfigKeySpec + +if TYPE_CHECKING: + from collections.abc import Callable, Mapping, Set + + +class ConfigValidationError(ValueError): + """Raised when pyproject.toml contains invalid CodeClone configuration.""" + + +def validate_config_value( + *, + key: str, + value: object, + config_key_specs: Mapping[str, ConfigKeySpec] = CONFIG_KEY_SPECS, +) -> object: + spec = config_key_specs[key] + if value is None: + if spec.allow_none: + return None + raise ConfigValidationError( + "Invalid value type for 
tool.codeclone." + f"{key}: expected {spec.expected_name or spec.expected_type.__name__}" + ) + + expected_type = spec.expected_type + if expected_type is bool: + return _validated_config_instance( + key=key, + value=value, + expected_type=bool, + expected_name="bool", + ) + + if expected_type is int: + return _validated_config_instance( + key=key, + value=value, + expected_type=int, + expected_name="int", + reject_bool=True, + ) + + if expected_type is str: + return _validated_config_instance( + key=key, + value=value, + expected_type=str, + expected_name="str", + ) + + if expected_type is list: + return _validated_string_list(key=key, value=value) + + raise ConfigValidationError(f"Unsupported config key spec for tool.codeclone.{key}") + + +def load_pyproject_config( + root_path: Path, + *, + load_toml: Callable[[Path], object] | None = None, + config_key_specs: Mapping[str, ConfigKeySpec] = CONFIG_KEY_SPECS, + path_config_keys: Set[str] | frozenset[str] = PATH_CONFIG_KEYS, +) -> dict[str, object]: + config_path = root_path / "pyproject.toml" + if not config_path.exists(): + return {} + + load_toml_fn = _load_toml if load_toml is None else load_toml + + payload: object + try: + payload = load_toml_fn(config_path) + except OSError as exc: + raise ConfigValidationError( + f"Cannot read pyproject.toml at {config_path}: {exc}" + ) from exc + except ValueError as exc: + raise ConfigValidationError(f"Invalid TOML in {config_path}: {exc}") from exc + + if not isinstance(payload, dict): + raise ConfigValidationError( + f"Invalid pyproject payload at {config_path}: root must be object" + ) + + tool_obj = payload.get("tool") + if tool_obj is None: + return {} + if not isinstance(tool_obj, dict): + raise ConfigValidationError( + f"Invalid pyproject payload at {config_path}: 'tool' must be object" + ) + + codeclone_obj = tool_obj.get("codeclone") + if codeclone_obj is None: + return {} + if not isinstance(codeclone_obj, dict): + raise ConfigValidationError( + "Invalid pyproject payload at " + f"{config_path}: 'tool.codeclone' must be object" + ) + + unknown = sorted(set(codeclone_obj.keys()) - set(config_key_specs)) + if unknown: + raise ConfigValidationError( + "Unknown key(s) in tool.codeclone: " + ", ".join(unknown) + ) + + validated: dict[str, object] = {} + for key in sorted(codeclone_obj.keys()): + value = validate_config_value( + key=key, + value=codeclone_obj[key], + config_key_specs=config_key_specs, + ) + validated[key] = normalize_path_config_value( + key=key, + value=value, + root_path=root_path, + path_config_keys=path_config_keys, + ) + return validated + + +def normalize_path_config_value( + *, + key: str, + value: object, + root_path: Path, + path_config_keys: Set[str] | frozenset[str] = PATH_CONFIG_KEYS, +) -> object: + if key not in path_config_keys: + return value + if not isinstance(value, str): + return value + + path = Path(value).expanduser() + if path.is_absolute(): + return str(path) + return str(root_path / path) + + +def _validated_config_instance( + *, + key: str, + value: object, + expected_type: type[object], + expected_name: str, + reject_bool: bool = False, +) -> object: + if isinstance(value, expected_type) and ( + not reject_bool or not isinstance(value, bool) + ): + return value + raise ConfigValidationError( + f"Invalid value type for tool.codeclone.{key}: expected {expected_name}" + ) + + +def _validated_string_list(*, key: str, value: object) -> tuple[str, ...]: + if not isinstance(value, list): + raise ConfigValidationError( + f"Invalid value type for 
tool.codeclone.{key}: expected list[str]" + ) + if not all(isinstance(item, str) for item in value): + raise ConfigValidationError( + f"Invalid value type for tool.codeclone.{key}: expected list[str]" + ) + try: + return normalize_golden_fixture_patterns(value) + except GoldenFixturePatternError as exc: + raise ConfigValidationError(str(exc)) from exc + + +def _load_toml(path: Path) -> object: + if sys.version_info >= (3, 11): + import tomllib + + with path.open("rb") as config_file: + return tomllib.load(config_file) + + try: + tomli_module = importlib.import_module("tomli") + except ModuleNotFoundError as exc: + raise ConfigValidationError( + "Python 3.10 requires dependency 'tomli' to read pyproject.toml." + ) from exc + + load_fn = getattr(tomli_module, "load", None) + if not callable(load_fn): + raise ConfigValidationError("Invalid 'tomli' module: missing callable 'load'.") + + with path.open("rb") as config_file: + return load_fn(config_file) + + +__all__ = [ + "CONFIG_KEY_SPECS", + "PATH_CONFIG_KEYS", + "ConfigValidationError", + "_load_toml", + "load_pyproject_config", + "normalize_path_config_value", + "validate_config_value", +] diff --git a/codeclone/config/resolver.py b/codeclone/config/resolver.py new file mode 100644 index 0000000..03ef896 --- /dev/null +++ b/codeclone/config/resolver.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import argparse + from collections.abc import Mapping, Sequence + + +@dataclass(frozen=True, slots=True) +class ResolvedConfig: + values: dict[str, object] + explicit_cli_dests: frozenset[str] + pyproject_values: dict[str, object] + + +def collect_explicit_cli_dests( + parser: argparse.ArgumentParser, + *, + argv: Sequence[str], +) -> set[str]: + option_to_dest: dict[str, str] = {} + for action in parser._actions: + for option in action.option_strings: + option_to_dest[option] = action.dest + + explicit: set[str] = set() + for token in argv: + if token == "--": + break + if not token.startswith("-"): + continue + option = token.split("=", maxsplit=1)[0] + dest = option_to_dest.get(option) + if dest is not None: + explicit.add(dest) + return explicit + + +def resolve_config( + *, + args: argparse.Namespace, + config_values: Mapping[str, object], + explicit_cli_dests: set[str], +) -> ResolvedConfig: + resolved_values = vars(args).copy() + for key, value in config_values.items(): + if key in explicit_cli_dests: + continue + resolved_values[key] = value + + return ResolvedConfig( + values=resolved_values, + explicit_cli_dests=frozenset(explicit_cli_dests), + pyproject_values=dict(config_values), + ) + + +def apply_resolved_config( + *, + args: argparse.Namespace, + resolved: ResolvedConfig, +) -> None: + for key, value in resolved.values.items(): + setattr(args, key, value) + + +def apply_pyproject_config_overrides( + *, + args: argparse.Namespace, + config_values: Mapping[str, object], + explicit_cli_dests: set[str], +) -> None: + apply_resolved_config( + args=args, + resolved=resolve_config( + args=args, + config_values=config_values, + explicit_cli_dests=explicit_cli_dests, + ), + ) + + +__all__ = [ + "ResolvedConfig", + "apply_pyproject_config_overrides", + "apply_resolved_config", + "collect_explicit_cli_dests", + "resolve_config", +] diff --git a/codeclone/config/spec.py b/codeclone/config/spec.py new file mode 100644 index 0000000..798e2bf --- /dev/null +++ b/codeclone/config/spec.py @@ -0,0 +1,770 @@ +from __future__ import annotations + +from 
diff --git a/codeclone/config/spec.py b/codeclone/config/spec.py
new file mode 100644
index 0000000..798e2bf
--- /dev/null
+++ b/codeclone/config/spec.py
@@ -0,0 +1,770 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Final, Literal
+
+from .. import ui_messages as ui
+from ..contracts import (
+    DEFAULT_BASELINE_PATH,
+    DEFAULT_BLOCK_MIN_LOC,
+    DEFAULT_BLOCK_MIN_STMT,
+    DEFAULT_COHESION_THRESHOLD,
+    DEFAULT_COMPLEXITY_THRESHOLD,
+    DEFAULT_COUPLING_THRESHOLD,
+    DEFAULT_COVERAGE_MIN,
+    DEFAULT_HEALTH_THRESHOLD,
+    DEFAULT_HTML_REPORT_PATH,
+    DEFAULT_JSON_REPORT_PATH,
+    DEFAULT_MARKDOWN_REPORT_PATH,
+    DEFAULT_MAX_BASELINE_SIZE_MB,
+    DEFAULT_MAX_CACHE_SIZE_MB,
+    DEFAULT_MIN_LOC,
+    DEFAULT_MIN_STMT,
+    DEFAULT_PROCESSES,
+    DEFAULT_ROOT,
+    DEFAULT_SARIF_REPORT_PATH,
+    DEFAULT_SEGMENT_MIN_LOC,
+    DEFAULT_SEGMENT_MIN_STMT,
+    DEFAULT_TEXT_REPORT_PATH,
+)
+
+CliKind = Literal[
+    "positional",
+    "value",
+    "optional_path",
+    "bool_optional",
+    "store_true",
+    "store_false",
+    "help",
+    "version",
+]
+
+_UNSET: Final[object] = object()
+_INFER_PYPROJECT_KEY: Final[object] = object()
+
+
+@dataclass(frozen=True, slots=True)
+class ConfigKeySpec:
+    expected_type: type[object]
+    allow_none: bool = False
+    expected_name: str | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class OptionSpec:
+    dest: str
+    group: str | None
+    cli_kind: CliKind | None = None
+    flags: tuple[str, ...] = ()
+    default: object = _UNSET
+    value_type: type[object] | None = None
+    const: object | None = None
+    nargs: str | int | None = None
+    metavar: str | None = None
+    help_text: str | None = None
+    pyproject_key: str | None = None
+    config_spec: ConfigKeySpec | None = None
+    path_value: bool = False
+
+    @property
+    def has_default(self) -> bool:
+        return self.default is not _UNSET
+
+
+def _option(
+    *,
+    dest: str,
+    group: str | None,
+    cli_kind: CliKind | None = None,
+    flags: tuple[str, ...] = (),
+    default: object = _UNSET,
+    value_type: type[object] | None = None,
+    const: object | None = None,
+    nargs: str | int | None = None,
+    metavar: str | None = None,
+    help_text: str | None = None,
+    pyproject_type: type[object] | None = None,
+    allow_none: bool = False,
+    expected_name: str | None = None,
+    pyproject_key: object = _INFER_PYPROJECT_KEY,
+    path_value: bool = False,
+) -> OptionSpec:
+    config_spec = (
+        ConfigKeySpec(
+            expected_type=pyproject_type,
+            allow_none=allow_none,
+            expected_name=expected_name,
+        )
+        if pyproject_type is not None
+        else None
+    )
+    resolved_pyproject_key: str | None
+    if pyproject_type is None:
+        resolved_pyproject_key = None
+    elif pyproject_key is _INFER_PYPROJECT_KEY:
+        resolved_pyproject_key = dest
+    elif pyproject_key is None or isinstance(pyproject_key, str):
+        resolved_pyproject_key = pyproject_key
+    else:
+        raise TypeError("pyproject_key must be str | None when pyproject_type is set")
+    return OptionSpec(
+        dest=dest,
+        group=group,
+        cli_kind=cli_kind,
+        flags=flags,
+        default=default,
+        value_type=value_type,
+        const=const,
+        nargs=nargs,
+        metavar=metavar,
+        help_text=help_text,
+        pyproject_key=resolved_pyproject_key,
+        config_spec=config_spec,
+        path_value=path_value,
+    )
+
+
+ARGUMENT_GROUP_TITLES: Final[tuple[str, ...]] = (
+    "Target",
+    "Analysis",
+    "Baselines and CI",
+    "Quality gates",
+    "Analysis stages",
+    "Reporting",
+    "Output and UI",
+    "General",
+)
+
+OPTIONS: Final[tuple[OptionSpec, ...]] = (
+    _option(
+        dest="root",
+        group="Target",
+        cli_kind="positional",
+        default=DEFAULT_ROOT,
+        nargs="?",
+        help_text=ui.HELP_ROOT,
+    ),
+    _option(
+        dest="min_loc",
+        group="Analysis",
+        cli_kind="value",
+        flags=("--min-loc",),
+        default=DEFAULT_MIN_LOC,
+        value_type=int,
+        help_text=ui.HELP_MIN_LOC,
+        pyproject_type=int,
+    ),
+    _option(
+ dest="min_stmt", + group="Analysis", + cli_kind="value", + flags=("--min-stmt",), + default=DEFAULT_MIN_STMT, + value_type=int, + help_text=ui.HELP_MIN_STMT, + pyproject_type=int, + ), + _option( + dest="block_min_loc", + group="Analysis", + default=DEFAULT_BLOCK_MIN_LOC, + pyproject_type=int, + ), + _option( + dest="block_min_stmt", + group="Analysis", + default=DEFAULT_BLOCK_MIN_STMT, + pyproject_type=int, + ), + _option( + dest="segment_min_loc", + group="Analysis", + default=DEFAULT_SEGMENT_MIN_LOC, + pyproject_type=int, + ), + _option( + dest="segment_min_stmt", + group="Analysis", + default=DEFAULT_SEGMENT_MIN_STMT, + pyproject_type=int, + ), + _option( + dest="golden_fixture_paths", + group="Analysis", + default=(), + pyproject_type=list, + expected_name="list[str]", + ), + _option( + dest="processes", + group="Analysis", + cli_kind="value", + flags=("--processes",), + default=DEFAULT_PROCESSES, + value_type=int, + help_text=ui.HELP_PROCESSES, + pyproject_type=int, + ), + _option( + dest="changed_only", + group="Analysis", + cli_kind="bool_optional", + flags=("--changed-only",), + default=False, + help_text=ui.HELP_CHANGED_ONLY, + ), + _option( + dest="diff_against", + group="Analysis", + cli_kind="value", + flags=("--diff-against",), + default=None, + metavar="GIT_REF", + help_text=ui.HELP_DIFF_AGAINST, + ), + _option( + dest="paths_from_git_diff", + group="Analysis", + cli_kind="value", + flags=("--paths-from-git-diff",), + default=None, + metavar="GIT_REF", + help_text=ui.HELP_PATHS_FROM_GIT_DIFF, + ), + _option( + dest="cache_path", + group="Analysis", + cli_kind="optional_path", + flags=("--cache-path",), + default=None, + metavar="FILE", + help_text=ui.HELP_CACHE_PATH, + pyproject_type=str, + allow_none=True, + path_value=True, + ), + _option( + dest="cache_path", + group="Analysis", + cli_kind="optional_path", + flags=("--cache-dir",), + metavar="FILE", + help_text=ui.HELP_CACHE_DIR_LEGACY, + pyproject_key=None, + ), + _option( + dest="max_cache_size_mb", + group="Analysis", + cli_kind="value", + flags=("--max-cache-size-mb",), + default=DEFAULT_MAX_CACHE_SIZE_MB, + value_type=int, + metavar="MB", + help_text=ui.HELP_MAX_CACHE_SIZE_MB, + pyproject_type=int, + ), + _option( + dest="baseline", + group="Baselines and CI", + cli_kind="optional_path", + flags=("--baseline",), + default=DEFAULT_BASELINE_PATH, + const=DEFAULT_BASELINE_PATH, + metavar="FILE", + help_text=ui.HELP_BASELINE, + pyproject_type=str, + path_value=True, + ), + _option( + dest="max_baseline_size_mb", + group="Baselines and CI", + cli_kind="value", + flags=("--max-baseline-size-mb",), + default=DEFAULT_MAX_BASELINE_SIZE_MB, + value_type=int, + metavar="MB", + help_text=ui.HELP_MAX_BASELINE_SIZE_MB, + pyproject_type=int, + ), + _option( + dest="update_baseline", + group="Baselines and CI", + cli_kind="bool_optional", + flags=("--update-baseline",), + default=False, + help_text=ui.HELP_UPDATE_BASELINE, + pyproject_type=bool, + ), + _option( + dest="metrics_baseline", + group="Baselines and CI", + cli_kind="optional_path", + flags=("--metrics-baseline",), + default=DEFAULT_BASELINE_PATH, + const=DEFAULT_BASELINE_PATH, + metavar="FILE", + help_text=ui.HELP_METRICS_BASELINE, + pyproject_type=str, + path_value=True, + ), + _option( + dest="update_metrics_baseline", + group="Baselines and CI", + cli_kind="bool_optional", + flags=("--update-metrics-baseline",), + default=False, + help_text=ui.HELP_UPDATE_METRICS_BASELINE, + pyproject_type=bool, + ), + _option( + dest="ci", + group="Baselines and CI", + 
cli_kind="bool_optional", + flags=("--ci",), + default=False, + help_text=ui.HELP_CI, + pyproject_type=bool, + ), + _option( + dest="api_surface", + group="Baselines and CI", + cli_kind="bool_optional", + flags=("--api-surface",), + default=False, + help_text=ui.HELP_API_SURFACE, + pyproject_type=bool, + ), + _option( + dest="coverage_xml", + group="Baselines and CI", + cli_kind="value", + flags=("--coverage",), + default=None, + metavar="FILE", + help_text=ui.HELP_COVERAGE, + pyproject_type=str, + allow_none=True, + path_value=True, + ), + _option( + dest="fail_on_new", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-on-new",), + default=False, + help_text=ui.HELP_FAIL_ON_NEW, + pyproject_type=bool, + ), + _option( + dest="fail_on_new_metrics", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-on-new-metrics",), + default=False, + help_text=ui.HELP_FAIL_ON_NEW_METRICS, + pyproject_type=bool, + ), + _option( + dest="fail_threshold", + group="Quality gates", + cli_kind="value", + flags=("--fail-threshold",), + default=-1, + value_type=int, + metavar="MAX_CLONES", + help_text=ui.HELP_FAIL_THRESHOLD, + pyproject_type=int, + ), + _option( + dest="fail_complexity", + group="Quality gates", + cli_kind="value", + flags=("--fail-complexity",), + default=-1, + value_type=int, + nargs="?", + const=DEFAULT_COMPLEXITY_THRESHOLD, + metavar="CC_MAX", + help_text=ui.HELP_FAIL_COMPLEXITY, + pyproject_type=int, + ), + _option( + dest="fail_coupling", + group="Quality gates", + cli_kind="value", + flags=("--fail-coupling",), + default=-1, + value_type=int, + nargs="?", + const=DEFAULT_COUPLING_THRESHOLD, + metavar="CBO_MAX", + help_text=ui.HELP_FAIL_COUPLING, + pyproject_type=int, + ), + _option( + dest="fail_cohesion", + group="Quality gates", + cli_kind="value", + flags=("--fail-cohesion",), + default=-1, + value_type=int, + nargs="?", + const=DEFAULT_COHESION_THRESHOLD, + metavar="LCOM4_MAX", + help_text=ui.HELP_FAIL_COHESION, + pyproject_type=int, + ), + _option( + dest="fail_cycles", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-cycles",), + default=False, + help_text=ui.HELP_FAIL_CYCLES, + pyproject_type=bool, + ), + _option( + dest="fail_dead_code", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-dead-code",), + default=False, + help_text=ui.HELP_FAIL_DEAD_CODE, + pyproject_type=bool, + ), + _option( + dest="fail_health", + group="Quality gates", + cli_kind="value", + flags=("--fail-health",), + default=-1, + value_type=int, + nargs="?", + const=DEFAULT_HEALTH_THRESHOLD, + metavar="SCORE_MIN", + help_text=ui.HELP_FAIL_HEALTH, + pyproject_type=int, + ), + _option( + dest="fail_on_typing_regression", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-on-typing-regression",), + default=False, + help_text=ui.HELP_FAIL_ON_TYPING_REGRESSION, + pyproject_type=bool, + ), + _option( + dest="fail_on_docstring_regression", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-on-docstring-regression",), + default=False, + help_text=ui.HELP_FAIL_ON_DOCSTRING_REGRESSION, + pyproject_type=bool, + ), + _option( + dest="fail_on_api_break", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-on-api-break",), + default=False, + help_text=ui.HELP_FAIL_ON_API_BREAK, + pyproject_type=bool, + ), + _option( + dest="fail_on_untested_hotspots", + group="Quality gates", + cli_kind="bool_optional", + flags=("--fail-on-untested-hotspots",), + default=False, + 
+        help_text=ui.HELP_FAIL_ON_UNTESTED_HOTSPOTS,
+        pyproject_type=bool,
+    ),
+    _option(
+        dest="min_typing_coverage",
+        group="Quality gates",
+        cli_kind="value",
+        flags=("--min-typing-coverage",),
+        default=-1,
+        value_type=int,
+        metavar="PERCENT",
+        help_text=ui.HELP_MIN_TYPING_COVERAGE,
+        pyproject_type=int,
+    ),
+    _option(
+        dest="min_docstring_coverage",
+        group="Quality gates",
+        cli_kind="value",
+        flags=("--min-docstring-coverage",),
+        default=-1,
+        value_type=int,
+        metavar="PERCENT",
+        help_text=ui.HELP_MIN_DOCSTRING_COVERAGE,
+        pyproject_type=int,
+    ),
+    _option(
+        dest="coverage_min",
+        group="Quality gates",
+        cli_kind="value",
+        flags=("--coverage-min",),
+        default=DEFAULT_COVERAGE_MIN,
+        value_type=int,
+        metavar="PERCENT",
+        help_text=ui.HELP_COVERAGE_MIN,
+        pyproject_type=int,
+    ),
+    _option(
+        dest="skip_metrics",
+        group="Analysis stages",
+        cli_kind="bool_optional",
+        flags=("--skip-metrics",),
+        default=False,
+        help_text=ui.HELP_SKIP_METRICS,
+        pyproject_type=bool,
+    ),
+    _option(
+        dest="skip_dead_code",
+        group="Analysis stages",
+        cli_kind="bool_optional",
+        flags=("--skip-dead-code",),
+        default=False,
+        help_text=ui.HELP_SKIP_DEAD_CODE,
+        pyproject_type=bool,
+    ),
+    _option(
+        dest="skip_dependencies",
+        group="Analysis stages",
+        cli_kind="bool_optional",
+        flags=("--skip-dependencies",),
+        default=False,
+        help_text=ui.HELP_SKIP_DEPENDENCIES,
+        pyproject_type=bool,
+    ),
+    _option(
+        dest="html_out",
+        group="Reporting",
+        cli_kind="optional_path",
+        flags=("--html",),
+        default=None,
+        const=DEFAULT_HTML_REPORT_PATH,
+        metavar="FILE",
+        help_text=ui.HELP_HTML,
+        pyproject_type=str,
+        allow_none=True,
+        path_value=True,
+    ),
+    _option(
+        dest="json_out",
+        group="Reporting",
+        cli_kind="optional_path",
+        flags=("--json",),
+        default=None,
+        const=DEFAULT_JSON_REPORT_PATH,
+        metavar="FILE",
+        help_text=ui.HELP_JSON,
+        pyproject_type=str,
+        allow_none=True,
+        path_value=True,
+    ),
+    _option(
+        dest="md_out",
+        group="Reporting",
+        cli_kind="optional_path",
+        flags=("--md",),
+        default=None,
+        const=DEFAULT_MARKDOWN_REPORT_PATH,
+        metavar="FILE",
+        help_text=ui.HELP_MD,
+        pyproject_type=str,
+        allow_none=True,
+        path_value=True,
+    ),
+    _option(
+        dest="sarif_out",
+        group="Reporting",
+        cli_kind="optional_path",
+        flags=("--sarif",),
+        default=None,
+        const=DEFAULT_SARIF_REPORT_PATH,
+        metavar="FILE",
+        help_text=ui.HELP_SARIF,
+        pyproject_type=str,
+        allow_none=True,
+        path_value=True,
+    ),
+    _option(
+        dest="text_out",
+        group="Reporting",
+        cli_kind="optional_path",
+        flags=("--text",),
+        default=None,
+        const=DEFAULT_TEXT_REPORT_PATH,
+        metavar="FILE",
+        help_text=ui.HELP_TEXT,
+        pyproject_type=str,
+        allow_none=True,
+        path_value=True,
+    ),
+    _option(
+        dest="timestamped_report_paths",
+        group="Reporting",
+        cli_kind="bool_optional",
+        flags=("--timestamped-report-paths",),
+        default=False,
+        help_text=ui.HELP_TIMESTAMPED_REPORT_PATHS,
+    ),
+    _option(
+        dest="open_html_report",
+        group="Output and UI",
+        cli_kind="bool_optional",
+        flags=("--open-html-report",),
+        default=False,
+        help_text=ui.HELP_OPEN_HTML_REPORT,
+    ),
+    _option(
+        dest="no_progress",
+        group="Output and UI",
+        cli_kind="store_true",
+        flags=("--no-progress",),
+        default=False,
+        help_text=ui.HELP_NO_PROGRESS,
+        pyproject_type=bool,
+    ),
+    _option(
+        dest="no_progress",
+        group="Output and UI",
+        cli_kind="store_false",
+        flags=("--progress",),
+        help_text=ui.HELP_PROGRESS,
+        pyproject_key=None,
+    ),
+    _option(
+        dest="no_color",
+        group="Output and UI",
+        cli_kind="store_true",
flags=("--no-color",), + default=False, + help_text=ui.HELP_NO_COLOR, + pyproject_type=bool, + ), + _option( + dest="no_color", + group="Output and UI", + cli_kind="store_false", + flags=("--color",), + help_text=ui.HELP_COLOR, + pyproject_key=None, + ), + _option( + dest="quiet", + group="Output and UI", + cli_kind="bool_optional", + flags=("--quiet",), + default=False, + help_text=ui.HELP_QUIET, + pyproject_type=bool, + ), + _option( + dest="verbose", + group="Output and UI", + cli_kind="bool_optional", + flags=("--verbose",), + default=False, + help_text=ui.HELP_VERBOSE, + pyproject_type=bool, + ), + _option( + dest="debug", + group="Output and UI", + cli_kind="bool_optional", + flags=("--debug",), + default=False, + help_text=ui.HELP_DEBUG, + pyproject_type=bool, + ), + _option( + dest="help", + group="General", + cli_kind="help", + flags=("-h", "--help"), + help_text="Show this help message and exit.", + ), + _option( + dest="version", + group="General", + cli_kind="version", + flags=("--version",), + help_text=ui.HELP_VERSION, + ), +) + + +def _build_defaults_by_dest() -> dict[str, object]: + defaults: dict[str, object] = {} + for spec in OPTIONS: + if not spec.has_default or spec.dest in defaults: + continue + defaults[spec.dest] = spec.default + return defaults + + +def _build_pyproject_specs() -> dict[str, ConfigKeySpec]: + config_specs: dict[str, ConfigKeySpec] = {} + for spec in OPTIONS: + if spec.pyproject_key is None or spec.config_spec is None: + continue + if spec.pyproject_key in config_specs: + existing = config_specs[spec.pyproject_key] + if existing != spec.config_spec: + raise RuntimeError( + f"Conflicting pyproject spec for {spec.pyproject_key}" + ) + continue + config_specs[spec.pyproject_key] = spec.config_spec + return config_specs + + +DEFAULTS_BY_DEST: Final[dict[str, object]] = _build_defaults_by_dest() +CONFIG_KEY_SPECS: Final[dict[str, ConfigKeySpec]] = _build_pyproject_specs() +PATH_CONFIG_KEYS: Final[frozenset[str]] = frozenset( + spec.pyproject_key + for spec in OPTIONS + if spec.pyproject_key is not None and spec.path_value +) +TESTABLE_CLI_OPTIONS: Final[tuple[OptionSpec, ...]] = tuple( + spec + for spec in OPTIONS + if spec.cli_kind is not None and spec.cli_kind not in {"help", "version"} +) +PYPROJECT_OPTIONS: Final[tuple[OptionSpec, ...]] = tuple( + spec for spec in OPTIONS if spec.pyproject_key is not None and spec.config_spec +) + +__all__ = [ + "ARGUMENT_GROUP_TITLES", + "CONFIG_KEY_SPECS", + "DEFAULTS_BY_DEST", + "DEFAULT_BASELINE_PATH", + "DEFAULT_BLOCK_MIN_LOC", + "DEFAULT_BLOCK_MIN_STMT", + "DEFAULT_HTML_REPORT_PATH", + "DEFAULT_JSON_REPORT_PATH", + "DEFAULT_MARKDOWN_REPORT_PATH", + "DEFAULT_MAX_BASELINE_SIZE_MB", + "DEFAULT_MAX_CACHE_SIZE_MB", + "DEFAULT_MIN_LOC", + "DEFAULT_MIN_STMT", + "DEFAULT_PROCESSES", + "DEFAULT_ROOT", + "DEFAULT_SARIF_REPORT_PATH", + "DEFAULT_SEGMENT_MIN_LOC", + "DEFAULT_SEGMENT_MIN_STMT", + "DEFAULT_TEXT_REPORT_PATH", + "OPTIONS", + "PATH_CONFIG_KEYS", + "PYPROJECT_OPTIONS", + "TESTABLE_CLI_OPTIONS", + "ConfigKeySpec", + "OptionSpec", +] diff --git a/codeclone/contracts.py b/codeclone/contracts.py deleted file mode 100644 index 70a76ee..0000000 --- a/codeclone/contracts.py +++ /dev/null @@ -1,72 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
diff --git a/codeclone/contracts.py b/codeclone/contracts.py
deleted file mode 100644
index 70a76ee..0000000
--- a/codeclone/contracts.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at https://mozilla.org/MPL/2.0/.
-# SPDX-License-Identifier: MPL-2.0
-# Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-from enum import IntEnum
-from typing import Final
-
-BASELINE_SCHEMA_VERSION: Final = "2.1"
-BASELINE_FINGERPRINT_VERSION: Final = "1"
-
-CACHE_VERSION: Final = "2.5"
-REPORT_SCHEMA_VERSION: Final = "2.8"
-METRICS_BASELINE_SCHEMA_VERSION: Final = "1.2"
-
-DEFAULT_COMPLEXITY_THRESHOLD: Final = 20
-DEFAULT_COUPLING_THRESHOLD: Final = 10
-DEFAULT_COHESION_THRESHOLD: Final = 4
-DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD: Final = 20
-DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD: Final = 10
-DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD: Final = 4
-DEFAULT_HEALTH_THRESHOLD: Final = 60
-
-COMPLEXITY_RISK_LOW_MAX: Final = 10
-COMPLEXITY_RISK_MEDIUM_MAX: Final = 20
-COUPLING_RISK_LOW_MAX: Final = 5
-COUPLING_RISK_MEDIUM_MAX: Final = 10
-COHESION_RISK_MEDIUM_MAX: Final = 3
-
-HEALTH_WEIGHTS: Final[dict[str, float]] = {
-    "clones": 0.25,
-    "complexity": 0.20,
-    "coupling": 0.10,
-    "cohesion": 0.15,
-    "dead_code": 0.10,
-    "dependencies": 0.10,
-    "coverage": 0.10,
-}
-
-
-class ExitCode(IntEnum):
-    SUCCESS = 0
-    CONTRACT_ERROR = 2
-    GATING_FAILURE = 3
-    INTERNAL_ERROR = 5
-
-
-REPOSITORY_URL: Final = "https://github.com/orenlab/codeclone"
-ISSUES_URL: Final = "https://github.com/orenlab/codeclone/issues"
-DOCS_URL: Final = "https://orenlab.github.io/codeclone/"
-
-
-def cli_help_epilog() -> str:
-    return "\n".join(
-        [
-            "Exit codes:",
-            "  0  Success.",
-            "  2  Contract error: untrusted or invalid baseline, invalid output",
-            "     configuration, incompatible versions, or unreadable sources in",
-            "     CI/gating mode.",
-            "  3  Gating failure: new clones, threshold violations, or metrics",
-            "     quality gate failures.",
-            "  5  Internal error: unexpected exception.",
-            "",
-            f"Repository: {REPOSITORY_URL}",
-            f"Issues: {ISSUES_URL}",
-            f"Docs: {DOCS_URL}",
-        ]
-    )
diff --git a/codeclone/contracts/__init__.py b/codeclone/contracts/__init__.py
new file mode 100644
index 0000000..f8ad78f
--- /dev/null
+++ b/codeclone/contracts/__init__.py
@@ -0,0 +1,141 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from enum import IntEnum
+from typing import Final
+
+BASELINE_SCHEMA_VERSION: Final = "2.1"
+BASELINE_FINGERPRINT_VERSION: Final = "1"
+
+CACHE_VERSION: Final = "2.6"
+REPORT_SCHEMA_VERSION: Final = "2.10"
+METRICS_BASELINE_SCHEMA_VERSION: Final = "1.2"
+
+DEFAULT_COMPLEXITY_THRESHOLD: Final = 20
+DEFAULT_COUPLING_THRESHOLD: Final = 10
+DEFAULT_COHESION_THRESHOLD: Final = 4
+DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD: Final = 20
+DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD: Final = 10
+DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD: Final = 4
+DEFAULT_HEALTH_THRESHOLD: Final = 60
+DEFAULT_ROOT: Final = "."
+DEFAULT_MIN_LOC: Final = 10
+DEFAULT_MIN_STMT: Final = 6
+DEFAULT_BLOCK_MIN_LOC: Final = 20
+DEFAULT_BLOCK_MIN_STMT: Final = 8
+DEFAULT_SEGMENT_MIN_LOC: Final = 20
+DEFAULT_SEGMENT_MIN_STMT: Final = 10
+DEFAULT_PROCESSES: Final = 4
+DEFAULT_MAX_CACHE_SIZE_MB: Final = 50
+DEFAULT_MAX_BASELINE_SIZE_MB: Final = 5
+DEFAULT_COVERAGE_MIN: Final = 50
+DEFAULT_BASELINE_PATH: Final = "codeclone.baseline.json"
+DEFAULT_HTML_REPORT_PATH: Final = ".cache/codeclone/report.html"
+DEFAULT_JSON_REPORT_PATH: Final = ".cache/codeclone/report.json"
+DEFAULT_MARKDOWN_REPORT_PATH: Final = ".cache/codeclone/report.md"
+DEFAULT_SARIF_REPORT_PATH: Final = ".cache/codeclone/report.sarif"
+DEFAULT_TEXT_REPORT_PATH: Final = ".cache/codeclone/report.txt"
+
+COMPLEXITY_RISK_LOW_MAX: Final = 10
+COMPLEXITY_RISK_MEDIUM_MAX: Final = 20
+COUPLING_RISK_LOW_MAX: Final = 5
+COUPLING_RISK_MEDIUM_MAX: Final = 10
+COHESION_RISK_MEDIUM_MAX: Final = 3
+HEALTH_DEPENDENCY_CYCLE_PENALTY: Final = 25
+HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY: Final = 4
+HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER: Final = 2.0
+HEALTH_DEPENDENCY_DEPTH_P95_MARGIN: Final = 1
+
+HEALTH_WEIGHTS: Final[dict[str, float]] = {
+    "clones": 0.25,
+    "complexity": 0.20,
+    "coupling": 0.10,
+    "cohesion": 0.15,
+    "dead_code": 0.10,
+    "dependencies": 0.10,
+    "coverage": 0.10,
+}
+
+
+class ExitCode(IntEnum):
+    SUCCESS = 0
+    CONTRACT_ERROR = 2
+    GATING_FAILURE = 3
+    INTERNAL_ERROR = 5
+
+
+REPOSITORY_URL: Final = "https://github.com/orenlab/codeclone"
+ISSUES_URL: Final = "https://github.com/orenlab/codeclone/issues"
+DOCS_URL: Final = "https://orenlab.github.io/codeclone/"
+
+
+def cli_help_epilog() -> str:
+    return "\n".join(
+        [
+            "Exit codes:",
+            "  0  Success.",
+            "  2  Contract error: untrusted or invalid baseline, invalid output",
+            "     configuration, incompatible versions, or unreadable sources in",
+            "     CI/gating mode.",
+            "  3  Gating failure: new clones, threshold violations, or metrics",
+            "     quality gate failures.",
+            "  5  Internal error: unexpected exception.",
+            "",
+            f"Repository: {REPOSITORY_URL}",
+            f"Issues: {ISSUES_URL}",
+            f"Docs: {DOCS_URL}",
+        ]
+    )
+
+
+__all__ = [
+    "BASELINE_FINGERPRINT_VERSION",
+    "BASELINE_SCHEMA_VERSION",
+    "CACHE_VERSION",
+    "COHESION_RISK_MEDIUM_MAX",
+    "COMPLEXITY_RISK_LOW_MAX",
+    "COMPLEXITY_RISK_MEDIUM_MAX",
+    "COUPLING_RISK_LOW_MAX",
+    "COUPLING_RISK_MEDIUM_MAX",
+    "DEFAULT_BASELINE_PATH",
+    "DEFAULT_BLOCK_MIN_LOC",
+    "DEFAULT_BLOCK_MIN_STMT",
+    "DEFAULT_COHESION_THRESHOLD",
+    "DEFAULT_COMPLEXITY_THRESHOLD",
+    "DEFAULT_COUPLING_THRESHOLD",
+    "DEFAULT_COVERAGE_MIN",
+    "DEFAULT_HEALTH_THRESHOLD",
+    "DEFAULT_HTML_REPORT_PATH",
+    "DEFAULT_JSON_REPORT_PATH",
+    "DEFAULT_MARKDOWN_REPORT_PATH",
+    "DEFAULT_MAX_BASELINE_SIZE_MB",
+    "DEFAULT_MAX_CACHE_SIZE_MB",
+    "DEFAULT_MIN_LOC",
+    "DEFAULT_MIN_STMT",
+    "DEFAULT_PROCESSES",
+    "DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD",
+    "DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD",
+    "DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD",
+    "DEFAULT_ROOT",
+    "DEFAULT_SARIF_REPORT_PATH",
+    "DEFAULT_SEGMENT_MIN_LOC",
+    "DEFAULT_SEGMENT_MIN_STMT",
+    "DEFAULT_TEXT_REPORT_PATH",
+    "DOCS_URL",
+    "HEALTH_DEPENDENCY_CYCLE_PENALTY",
+    "HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER",
+    "HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY",
+    "HEALTH_DEPENDENCY_DEPTH_P95_MARGIN",
+    "HEALTH_WEIGHTS",
+    "ISSUES_URL",
+    "METRICS_BASELINE_SCHEMA_VERSION",
+    "REPORT_SCHEMA_VERSION",
+    "REPOSITORY_URL",
+    "ExitCode",
+    "cli_help_epilog",
+]
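+
+# Consistency sketch (illustrative, not executed): the health component
+# weights are meant to sum to 1.0, and gate outcomes map to stable process
+# exit codes:
+#
+#     assert abs(sum(HEALTH_WEIGHTS.values()) - 1.0) < 1e-9
+#     assert int(ExitCode.GATING_FAILURE) == 3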
diff --git a/codeclone/errors.py b/codeclone/contracts/errors.py
similarity index 85%
rename from codeclone/errors.py
rename to codeclone/contracts/errors.py
index 7b9331f..f19c34b 100644
--- a/codeclone/errors.py
+++ b/codeclone/contracts/errors.py
@@ -37,3 +37,14 @@ class BaselineValidationError(BaselineSchemaError):
     def __init__(self, message: str, *, status: str = "invalid_type") -> None:
         super().__init__(message)
         self.status = status
+
+
+__all__ = [
+    "BaselineSchemaError",
+    "BaselineValidationError",
+    "CacheError",
+    "CodeCloneError",
+    "FileProcessingError",
+    "ParseError",
+    "ValidationError",
+]
diff --git a/codeclone/contracts/schemas.py b/codeclone/contracts/schemas.py
new file mode 100644
index 0000000..ec1eb49
--- /dev/null
+++ b/codeclone/contracts/schemas.py
@@ -0,0 +1,85 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from typing import TypedDict
+
+
+class AnalysisProfile(TypedDict):
+    min_loc: int
+    min_stmt: int
+    block_min_loc: int
+    block_min_stmt: int
+    segment_min_loc: int
+    segment_min_stmt: int
+    collect_api_surface: bool
+
+
+class AnalysisProfileMeta(TypedDict):
+    min_loc: int
+    min_stmt: int
+    block_min_loc: int
+    block_min_stmt: int
+    segment_min_loc: int
+    segment_min_stmt: int
+
+
+class ReportMeta(TypedDict):
+    """
+    Canonical report metadata contract shared by HTML, JSON, and TXT reports.
+
+    Key semantics:
+    - python_version: runtime major.minor string for human readability (e.g. "3.14")
+    - python_tag: runtime compatibility tag used by baseline/cache contracts
+      (e.g. "cp314")
+    - baseline_*: values loaded from baseline metadata for audit/provenance
+    - cache_*: cache status/provenance for run transparency
+    """
+
+    codeclone_version: str
+    project_name: str
+    scan_root: str
+    python_version: str
+    python_tag: str
+    baseline_path: str
+    baseline_fingerprint_version: str | None
+    baseline_schema_version: str | None
+    baseline_python_tag: str | None
+    baseline_generator_name: str | None
+    baseline_generator_version: str | None
+    baseline_payload_sha256: str | None
+    baseline_payload_sha256_verified: bool
+    baseline_loaded: bool
+    baseline_status: str
+    cache_path: str
+    cache_used: bool
+    cache_status: str
+    cache_schema_version: str | None
+    files_skipped_source_io: int
+    metrics_baseline_path: str
+    metrics_baseline_loaded: bool
+    metrics_baseline_status: str
+    metrics_baseline_schema_version: str | None
+    metrics_baseline_payload_sha256: str | None
+    metrics_baseline_payload_sha256_verified: bool
+    health_score: int | None
+    health_grade: str | None
+    analysis_mode: str
+    metrics_computed: list[str]
+    analysis_profile: AnalysisProfileMeta
+    design_complexity_threshold: int
+    design_coupling_threshold: int
+    design_cohesion_threshold: int
+    analysis_started_at_utc: str | None
+    report_generated_at_utc: str
+
+
+__all__ = [
+    "AnalysisProfile",
+    "AnalysisProfileMeta",
+    "ReportMeta",
+]
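+
+# Shape sketch for the two runtime identity fields (illustrative; the actual
+# producer lives elsewhere in the package, and the "cp" prefix assumes
+# CPython):
+#
+#     import sys
+#     python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+#     python_tag = f"cp{sys.version_info.major}{sys.version_info.minor}"
+#     # e.g. python_version == "3.14" and python_tag == "cp314"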
diff --git a/codeclone/core/__init__.py b/codeclone/core/__init__.py
new file mode 100644
index 0000000..557317f
--- /dev/null
+++ b/codeclone/core/__init__.py
@@ -0,0 +1,4 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
diff --git a/codeclone/core/_types.py b/codeclone/core/_types.py
new file mode 100644
index 0000000..3336374
--- /dev/null
+++ b/codeclone/core/_types.py
@@ -0,0 +1,358 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from argparse import Namespace
+from collections.abc import Mapping
+from dataclasses import dataclass
+from hashlib import sha256
+from pathlib import Path
+
+import orjson
+
+from ..analysis.normalizer import NormalizationConfig
+from ..cache.entries import FileStat
+from ..cache.projection import SegmentReportProjection
+from ..contracts import DEFAULT_PROCESSES
+from ..models import (
+    BlockUnit,
+    ClassMetrics,
+    CoverageJoinResult,
+    DeadCandidate,
+    FileMetrics,
+    GroupItem,
+    GroupItemLike,
+    ModuleApiSurface,
+    ModuleDep,
+    ModuleDocstringCoverage,
+    ModuleTypingCoverage,
+    ProjectMetrics,
+    SecuritySurface,
+    SegmentGroupItem,
+    SegmentUnit,
+    StructuralFindingGroup,
+    Suggestion,
+    SuppressedCloneGroup,
+    Unit,
+)
+from ..utils.coerce import as_int, as_mapping, as_str
+
+MAX_FILE_SIZE = 10 * 1024 * 1024
+DEFAULT_BATCH_SIZE = 100
+PARALLEL_MIN_FILES_PER_WORKER = 8
+PARALLEL_MIN_FILES_FLOOR = 16
+DEFAULT_RUNTIME_PROCESSES = DEFAULT_PROCESSES
+
+
+@dataclass(frozen=True, slots=True)
+class OutputPaths:
+    html: Path | None = None
+    json: Path | None = None
+    text: Path | None = None
+    md: Path | None = None
+    sarif: Path | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class BootstrapResult:
+    root: Path
+    config: NormalizationConfig
+    args: Namespace
+    output_paths: OutputPaths
+    cache_path: Path
+
+
+@dataclass(frozen=True, slots=True)
+class DiscoveryResult:
+    files_found: int
+    cache_hits: int
+    files_skipped: int
+    all_file_paths: tuple[str, ...]
+    cached_units: tuple[GroupItem, ...]
+    cached_blocks: tuple[GroupItem, ...]
+    cached_segments: tuple[GroupItem, ...]
+    cached_class_metrics: tuple[ClassMetrics, ...]
+    cached_module_deps: tuple[ModuleDep, ...]
+    cached_dead_candidates: tuple[DeadCandidate, ...]
+    cached_referenced_names: frozenset[str]
+    files_to_process: tuple[str, ...]
+    skipped_warnings: tuple[str, ...]
+    cached_security_surfaces: tuple[SecuritySurface, ...] = ()
+    cached_referenced_qualnames: frozenset[str] = frozenset()
+    cached_typing_modules: tuple[ModuleTypingCoverage, ...] = ()
+    cached_docstring_modules: tuple[ModuleDocstringCoverage, ...] = ()
+    cached_api_modules: tuple[ModuleApiSurface, ...] = ()
+    cached_structural_findings: tuple[StructuralFindingGroup, ...] = ()
+    cached_segment_report_projection: SegmentReportProjection | None = None
+    cached_lines: int = 0
+    cached_functions: int = 0
+    cached_methods: int = 0
+    cached_classes: int = 0
+    cached_source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = ()
+
+
+@dataclass(frozen=True, slots=True)
+class FileProcessResult:
+    filepath: str
+    success: bool
+    error: str | None = None
+    units: list[Unit] | None = None
+    blocks: list[BlockUnit] | None = None
+    segments: list[SegmentUnit] | None = None
+    lines: int = 0
+    functions: int = 0
+    methods: int = 0
+    classes: int = 0
+    stat: FileStat | None = None
+    error_kind: str | None = None
+    file_metrics: FileMetrics | None = None
+    structural_findings: list[StructuralFindingGroup] | None = None
+
+
+@dataclass(frozen=True, slots=True)
+class ProcessingResult:
+    units: tuple[GroupItem, ...]
+    blocks: tuple[GroupItem, ...]
+    segments: tuple[GroupItem, ...]
+    class_metrics: tuple[ClassMetrics, ...]
+    module_deps: tuple[ModuleDep, ...]
+    dead_candidates: tuple[DeadCandidate, ...]
+    referenced_names: frozenset[str]
+    files_analyzed: int
+    files_skipped: int
+    analyzed_lines: int
+    analyzed_functions: int
+    analyzed_methods: int
+    analyzed_classes: int
+    failed_files: tuple[str, ...]
+    source_read_failures: tuple[str, ...]
+    security_surfaces: tuple[SecuritySurface, ...] = ()
+    referenced_qualnames: frozenset[str] = frozenset()
+    typing_modules: tuple[ModuleTypingCoverage, ...] = ()
+    docstring_modules: tuple[ModuleDocstringCoverage, ...] = ()
+    api_modules: tuple[ModuleApiSurface, ...] = ()
+    structural_findings: tuple[StructuralFindingGroup, ...] = ()
+    source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = ()
+
+
+@dataclass(frozen=True, slots=True)
+class AnalysisResult:
+    func_groups: Mapping[str, list[GroupItem]]
+    block_groups: Mapping[str, list[GroupItem]]
+    block_groups_report: Mapping[str, list[GroupItem]]
+    segment_groups: Mapping[str, list[GroupItem]]
+    suppressed_segment_groups: int
+    block_group_facts: dict[str, dict[str, str]]
+    func_clones_count: int
+    block_clones_count: int
+    segment_clones_count: int
+    files_analyzed_or_cached: int
+    project_metrics: ProjectMetrics | None
+    metrics_payload: dict[str, object] | None
+    suggestions: tuple[Suggestion, ...]
+    segment_groups_raw_digest: str
+    suppressed_clone_groups: tuple[SuppressedCloneGroup, ...] = ()
+    coverage_join: CoverageJoinResult | None = None
+    suppressed_dead_code_items: int = 0
+    structural_findings: tuple[StructuralFindingGroup, ...] = ()
+
+
+@dataclass(frozen=True, slots=True)
+class ReportArtifacts:
+    html: str | None = None
+    json: str | None = None
+    text: str | None = None
+    md: str | None = None
+    sarif: str | None = None
+    report_document: dict[str, object] | None = None
+
+
+def _as_sorted_str_tuple(value: object) -> tuple[str, ...]:
+    if not isinstance(value, list):
+        return ()
+    return tuple(sorted({item for item in value if isinstance(item, str) and item}))
+
+
+def _group_item_sort_key(item: GroupItemLike) -> tuple[str, int, int, str]:
+    return (
+        as_str(item.get("filepath")),
+        as_int(item.get("start_line")),
+        as_int(item.get("end_line")),
+        as_str(item.get("qualname")),
+    )
+
+
+def _segment_projection_item_sort_key(
+    item: GroupItemLike,
+) -> tuple[str, str, int, int]:
+    return (
+        as_str(item.get("filepath")),
+        as_str(item.get("qualname")),
+        as_int(item.get("start_line")),
+        as_int(item.get("end_line")),
+    )
+
+
+def _segment_groups_digest(segment_groups: Mapping[str, list[GroupItem]]) -> str:
+    normalized_rows: list[
+        tuple[str, tuple[tuple[str, str, int, int, int, str, str], ...]]
+    ] = []
+    for group_key in sorted(segment_groups):
+        items = sorted(segment_groups[group_key], key=_segment_projection_item_sort_key)
+        normalized_items = [
+            (
+                as_str(item.get("filepath")),
+                as_str(item.get("qualname")),
+                as_int(item.get("start_line")),
+                as_int(item.get("end_line")),
+                as_int(item.get("size")),
+                as_str(item.get("segment_hash")),
+                as_str(item.get("segment_sig")),
+            )
+            for item in items
+        ]
+        normalized_rows.append((group_key, tuple(normalized_items)))
+    payload = orjson.dumps(tuple(normalized_rows), option=orjson.OPT_SORT_KEYS)
+    return sha256(payload).hexdigest()
+
+
+def _coerce_segment_report_projection(
+    value: object,
+) -> SegmentReportProjection | None:
+    row = as_mapping(value)
+    if not row:
+        return None
+    match row.get("digest"), row.get("suppressed"), row.get("groups"):
+        case str() as digest, int() as suppressed, dict() as groups:
+            pass
+        case _:
+            return None
+    if not all(
+        isinstance(group_key, str) and isinstance(items, list)
+        for group_key, items in groups.items()
+    ):
+        return None
+    normalized_groups: dict[str, list[SegmentGroupItem]] = {}
+    for group_key, items in groups.items():
+        if not isinstance(group_key, str) or not isinstance(items, list):
+            return None
+        normalized_items: list[SegmentGroupItem] = []
+        for item in items:
+            if not isinstance(item, dict):
+                return None
+            segment_hash = item.get("segment_hash")
+            segment_sig = item.get("segment_sig")
+            filepath = item.get("filepath")
+            qualname = item.get("qualname")
+            start_line = item.get("start_line")
+            end_line = item.get("end_line")
+            size = item.get("size")
+            if not (
+                isinstance(segment_hash, str)
+                and isinstance(segment_sig, str)
+                and isinstance(filepath, str)
+                and isinstance(qualname, str)
+                and isinstance(start_line, int)
+                and isinstance(end_line, int)
+                and isinstance(size, int)
+            ):
+                return None
+            normalized_items.append(
+                SegmentGroupItem(
+                    segment_hash=segment_hash,
+                    segment_sig=segment_sig,
+                    filepath=filepath,
+                    qualname=qualname,
+                    start_line=start_line,
+                    end_line=end_line,
+                    size=size,
+                )
+            )
+        normalized_groups[group_key] = normalized_items
+    return {
+        "digest": digest,
+        "suppressed": suppressed,
+        "groups": normalized_groups,
+    }
+
+
+def _module_dep_sort_key(dep: ModuleDep) -> tuple[str, str, str, int]:
+    return dep.source, dep.target, dep.import_type, dep.line
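+
+
+# Minimal payload accepted by _coerce_segment_report_projection above
+# (illustrative):
+#
+#     {
+#         "digest": "<sha256 hex>",
+#         "suppressed": 0,
+#         "groups": {
+#             "g1": [
+#                 {
+#                     "segment_hash": "h", "segment_sig": "s",
+#                     "filepath": "pkg/mod.py", "qualname": "pkg.mod:fn",
+#                     "start_line": 1, "end_line": 9, "size": 9,
+#                 },
+#             ],
+#         },
+#     }
+#
+# Any missing or mistyped field makes the helper return None, which marks
+# the cached projection as unusable.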
+
+
+def _class_metric_sort_key(metric: ClassMetrics) -> tuple[str, int, int, str]:
+    return metric.filepath, metric.start_line, metric.end_line, metric.qualname
+
+
+def _dead_candidate_sort_key(item: DeadCandidate) -> tuple[str, int, int, str]:
+    return item.filepath, item.start_line, item.end_line, item.qualname
+
+
+def _module_names_from_units(units: tuple[GroupItemLike, ...]) -> frozenset[str]:
+    modules: set[str] = set()
+    for item in units:
+        qualname = as_str(item.get("qualname")) if isinstance(item, Mapping) else ""
+        module_name = qualname.split(":", 1)[0] if ":" in qualname else qualname
+        if module_name:
+            modules.add(module_name)
+    return frozenset(sorted(modules))
+
+
+def _unit_to_group_item(unit: Unit) -> GroupItem:
+    return {
+        "qualname": unit.qualname,
+        "filepath": unit.filepath,
+        "start_line": unit.start_line,
+        "end_line": unit.end_line,
+        "loc": unit.loc,
+        "stmt_count": unit.stmt_count,
+        "fingerprint": unit.fingerprint,
+        "loc_bucket": unit.loc_bucket,
+        "cyclomatic_complexity": unit.cyclomatic_complexity,
+        "nesting_depth": unit.nesting_depth,
+        "risk": unit.risk,
+        "raw_hash": unit.raw_hash,
+        "entry_guard_count": unit.entry_guard_count,
+        "entry_guard_terminal_profile": unit.entry_guard_terminal_profile,
+        "entry_guard_has_side_effect_before": unit.entry_guard_has_side_effect_before,
+        "terminal_kind": unit.terminal_kind,
+        "try_finally_profile": unit.try_finally_profile,
+        "side_effect_order_profile": unit.side_effect_order_profile,
+    }
+
+
+def _block_to_group_item(block: BlockUnit) -> GroupItem:
+    return {
+        "block_hash": block.block_hash,
+        "filepath": block.filepath,
+        "qualname": block.qualname,
+        "start_line": block.start_line,
+        "end_line": block.end_line,
+        "size": block.size,
+    }
+
+
+def _segment_to_group_item(segment: SegmentUnit) -> GroupItem:
+    return {
+        "filepath": segment.filepath,
+        "qualname": segment.qualname,
+        "start_line": segment.start_line,
+        "end_line": segment.end_line,
+        "size": segment.size,
+        "segment_hash": segment.segment_hash,
+        "segment_sig": segment.segment_sig,
+    }
+
+
+def _should_collect_structural_findings(output_paths: OutputPaths) -> bool:
+    return bool(
+        output_paths.html
+        or output_paths.json
+        or output_paths.md
+        or output_paths.text
+        or output_paths.sarif
+    )
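+
+
+# Determinism sketch (illustrative): _segment_groups_digest sorts group keys
+# and items before hashing, so insertion order cannot change the digest:
+#
+#     a = _segment_groups_digest({"g1": [x, y], "g2": [z]})
+#     b = _segment_groups_digest({"g2": [z], "g1": [y, x]})
+#     # a == b for any items x, y, z carrying the fields read above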
diff --git a/codeclone/core/api_surface_payload.py b/codeclone/core/api_surface_payload.py
new file mode 100644
index 0000000..7ec0ff8
--- /dev/null
+++ b/codeclone/core/api_surface_payload.py
@@ -0,0 +1,98 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+from ..models import ApiBreakingChange, ApiSurfaceSnapshot
+from ..utils.coerce import as_int, as_str
+
+
+def _api_surface_summary(api_surface: ApiSurfaceSnapshot | None) -> dict[str, object]:
+    modules = api_surface.modules if api_surface is not None else ()
+    return {
+        "enabled": api_surface is not None,
+        "modules": len(modules),
+        "public_symbols": sum(len(module.symbols) for module in modules),
+        "added": 0,
+        "breaking": 0,
+        "strict_types": False,
+    }
+
+
+def _api_surface_rows(
+    api_surface: ApiSurfaceSnapshot | None,
+) -> list[dict[str, object]]:
+    if api_surface is None:
+        return []
+    rows: list[dict[str, object]] = []
+    for module in api_surface.modules:
+        rows.extend(
+            {
+                "record_kind": "symbol",
+                "module": module.module,
+                "filepath": module.filepath,
+                "qualname": symbol.qualname,
+                "start_line": symbol.start_line,
+                "end_line": symbol.end_line,
+                "symbol_kind": symbol.kind,
+                "exported_via": symbol.exported_via,
+                "params_total": len(symbol.params),
+                "params": [
+                    {
+                        "name": param.name,
+                        "kind": param.kind,
+                        "has_default": param.has_default,
+                        "annotated": bool(param.annotation_hash),
+                    }
+                    for param in symbol.params
+                ],
+                "returns_annotated": bool(symbol.returns_hash),
+            }
+            for symbol in module.symbols
+        )
+    return sorted(
+        rows,
+        key=lambda item: (
+            as_str(item.get("filepath")),
+            as_int(item.get("start_line")),
+            as_int(item.get("end_line")),
+            as_str(item.get("qualname")),
+            as_str(item.get("record_kind")),
+        ),
+    )
+
+
+def _breaking_api_surface_rows(changes: Sequence[object]) -> list[dict[str, object]]:
+    rows: list[dict[str, object]] = []
+    for change in changes:
+        if not isinstance(change, ApiBreakingChange):
+            continue
+        module_name, _, _local_name = change.qualname.partition(":")
+        rows.append(
+            {
+                "record_kind": "breaking_change",
+                "module": module_name,
+                "filepath": change.filepath,
+                "qualname": change.qualname,
+                "start_line": change.start_line,
+                "end_line": change.end_line,
+                "symbol_kind": change.symbol_kind,
+                "change_kind": change.change_kind,
+                "detail": change.detail,
+            }
+        )
+    return sorted(
+        rows,
+        key=lambda item: (
+            as_str(item.get("filepath")),
+            as_int(item.get("start_line")),
+            as_int(item.get("end_line")),
+            as_str(item.get("qualname")),
+            as_str(item.get("change_kind")),
+        ),
+    )
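+
+
+# Boundary sketch (illustrative; follows directly from the helpers above):
+# with no snapshot the payload degrades to zeros instead of raising:
+#
+#     _api_surface_summary(None)
+#     # {"enabled": False, "modules": 0, "public_symbols": 0,
+#     #  "added": 0, "breaking": 0, "strict_types": False}
+#     _api_surface_rows(None)  # []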
diff --git a/codeclone/core/bootstrap.py b/codeclone/core/bootstrap.py
new file mode 100644
index 0000000..1043a27
--- /dev/null
+++ b/codeclone/core/bootstrap.py
@@ -0,0 +1,41 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from argparse import Namespace
+from pathlib import Path
+
+from ..analysis.normalizer import NormalizationConfig
+from ._types import BootstrapResult, OutputPaths
+
+
+def bootstrap(
+    *,
+    args: Namespace,
+    root: Path,
+    output_paths: OutputPaths,
+    cache_path: Path,
+) -> BootstrapResult:
+    return BootstrapResult(
+        root=root,
+        config=NormalizationConfig(),
+        args=args,
+        output_paths=output_paths,
+        cache_path=cache_path,
+    )
+
+
+def _resolve_optional_runtime_path(value: object, *, root: Path) -> Path | None:
+    text = str(value).strip() if value is not None else ""
+    if not text:
+        return None
+    candidate = Path(text).expanduser()
+    resolved = candidate if candidate.is_absolute() else root / candidate
+    try:
+        return resolved.resolve()
+    except OSError:
+        return resolved.absolute()
diff --git a/codeclone/core/coverage_payload.py b/codeclone/core/coverage_payload.py
new file mode 100644
index 0000000..1380b71
--- /dev/null
+++ b/codeclone/core/coverage_payload.py
@@ -0,0 +1,173 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from ..models import CoverageJoinResult, ProjectMetrics
+from ..utils.coerce import as_int, as_str
+
+
+def _permille(numerator: int, denominator: int) -> int:
+    if denominator <= 0:
+        return 0
+    return round((1000.0 * float(numerator)) / float(denominator))
+
+
+def _coverage_join_summary(
+    coverage_join: CoverageJoinResult | None,
+) -> dict[str, object]:
+    if coverage_join is None:
+        return {}
+    return {
+        "status": coverage_join.status,
+        "source": coverage_join.coverage_xml,
+        "files": coverage_join.files,
+        "units": len(coverage_join.units),
+        "measured_units": coverage_join.measured_units,
+        "overall_executable_lines": coverage_join.overall_executable_lines,
+        "overall_covered_lines": coverage_join.overall_covered_lines,
+        "overall_permille": _permille(
+            coverage_join.overall_covered_lines,
+            coverage_join.overall_executable_lines,
+        ),
+        "missing_from_report_units": sum(
+            1
+            for fact in coverage_join.units
+            if fact.coverage_status == "missing_from_report"
+        ),
+        "coverage_hotspots": coverage_join.coverage_hotspots,
+        "scope_gap_hotspots": coverage_join.scope_gap_hotspots,
+        "hotspot_threshold_percent": coverage_join.hotspot_threshold_percent,
+        "invalid_reason": coverage_join.invalid_reason,
+    }
+
+
+def _coverage_join_rows(
+    coverage_join: CoverageJoinResult | None,
+) -> list[dict[str, object]]:
+    if coverage_join is None or coverage_join.status != "ok":
+        return []
+    return sorted(
+        (
+            {
+                "qualname": fact.qualname,
+                "filepath": fact.filepath,
+                "start_line": fact.start_line,
+                "end_line": fact.end_line,
+                "cyclomatic_complexity": fact.cyclomatic_complexity,
+                "risk": fact.risk,
+                "executable_lines": fact.executable_lines,
+                "covered_lines": fact.covered_lines,
+                "coverage_permille": fact.coverage_permille,
+                "coverage_status": fact.coverage_status,
+                "coverage_hotspot": (
+                    fact.risk in {"medium", "high"}
+                    and fact.coverage_status == "measured"
+                    and (fact.coverage_permille / 10.0)
+                    < float(coverage_join.hotspot_threshold_percent)
+                ),
+                "scope_gap_hotspot": (
+                    fact.risk in {"medium", "high"}
+                    and fact.coverage_status == "missing_from_report"
+                ),
"coverage_review_item": ( + ( + fact.risk in {"medium", "high"} + and fact.coverage_status == "measured" + and (fact.coverage_permille / 10.0) + < float(coverage_join.hotspot_threshold_percent) + ) + or ( + fact.risk in {"medium", "high"} + and fact.coverage_status == "missing_from_report" + ) + ), + } + for fact in coverage_join.units + ), + key=lambda item: ( + 0 if bool(item.get("coverage_hotspot")) else 1, + 0 if bool(item.get("scope_gap_hotspot")) else 1, + {"high": 0, "medium": 1, "low": 2}.get(as_str(item.get("risk")), 3), + as_int(item.get("coverage_permille"), 0), + -as_int(item.get("cyclomatic_complexity"), 0), + as_str(item.get("filepath")), + as_int(item.get("start_line")), + as_str(item.get("qualname")), + ), + ) + + +def _coverage_adoption_rows(project_metrics: ProjectMetrics) -> list[dict[str, object]]: + docstring_by_module = { + (item.filepath, item.module): item for item in project_metrics.docstring_modules + } + rows: list[dict[str, object]] = [] + seen_keys: set[tuple[str, str]] = set() + for typing_item in project_metrics.typing_modules: + key = (typing_item.filepath, typing_item.module) + seen_keys.add(key) + docstring_item = docstring_by_module.get(key) + doc_total = docstring_item.public_symbol_total if docstring_item else 0 + doc_documented = ( + docstring_item.public_symbol_documented if docstring_item else 0 + ) + rows.append( + { + "module": typing_item.module, + "filepath": typing_item.filepath, + "callable_count": typing_item.callable_count, + "params_total": typing_item.params_total, + "params_annotated": typing_item.params_annotated, + "param_permille": _permille( + typing_item.params_annotated, + typing_item.params_total, + ), + "returns_total": typing_item.returns_total, + "returns_annotated": typing_item.returns_annotated, + "return_permille": _permille( + typing_item.returns_annotated, + typing_item.returns_total, + ), + "any_annotation_count": typing_item.any_annotation_count, + "public_symbol_total": doc_total, + "public_symbol_documented": doc_documented, + "docstring_permille": _permille(doc_documented, doc_total), + } + ) + for docstring_item in project_metrics.docstring_modules: + key = (docstring_item.filepath, docstring_item.module) + if key in seen_keys: + continue + rows.append( + { + "module": docstring_item.module, + "filepath": docstring_item.filepath, + "callable_count": 0, + "params_total": 0, + "params_annotated": 0, + "param_permille": 0, + "returns_total": 0, + "returns_annotated": 0, + "return_permille": 0, + "any_annotation_count": 0, + "public_symbol_total": docstring_item.public_symbol_total, + "public_symbol_documented": docstring_item.public_symbol_documented, + "docstring_permille": _permille( + docstring_item.public_symbol_documented, + docstring_item.public_symbol_total, + ), + } + ) + return sorted( + rows, + key=lambda item: ( + as_int(item.get("param_permille")), + as_int(item.get("docstring_permille")), + as_int(item.get("return_permille")), + as_str(item.get("module")), + ), + ) diff --git a/codeclone/core/discovery.py b/codeclone/core/discovery.py new file mode 100644 index 0000000..99ddcf1 --- /dev/null +++ b/codeclone/core/discovery.py @@ -0,0 +1,220 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
diff --git a/codeclone/core/discovery.py b/codeclone/core/discovery.py
new file mode 100644
index 0000000..99ddcf1
--- /dev/null
+++ b/codeclone/core/discovery.py
@@ -0,0 +1,220 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ..cache.store import Cache, file_stat_signature
+from ..models import (
+    ClassMetrics,
+    DeadCandidate,
+    GroupItem,
+    ModuleApiSurface,
+    ModuleDep,
+    ModuleDocstringCoverage,
+    ModuleTypingCoverage,
+    SecuritySurface,
+    StructuralFindingGroup,
+)
+from ..scanner import iter_py_files
+from ._types import (
+    BootstrapResult,
+    DiscoveryResult,
+    _class_metric_sort_key,
+    _coerce_segment_report_projection,
+    _dead_candidate_sort_key,
+    _group_item_sort_key,
+    _module_dep_sort_key,
+    _should_collect_structural_findings,
+)
+from .discovery_cache import (
+    decode_cached_structural_finding_group as _decode_cached_structural_finding_group,
+)
+from .discovery_cache import (
+    load_cached_metrics_extended as _load_cached_metrics_extended,
+)
+from .discovery_cache import usable_cached_source_stats as _usable_cached_source_stats
+
+DiscoveryBuffers = tuple[
+    list[GroupItem],
+    list[GroupItem],
+    list[GroupItem],
+    list[ClassMetrics],
+    list[ModuleDep],
+    list[DeadCandidate],
+    set[str],
+    set[str],
+    list[ModuleTypingCoverage],
+    list[ModuleDocstringCoverage],
+    list[ModuleApiSurface],
+    list[SecuritySurface],
+    list[str],
+    list[str],
+]
+
+
+def _group_items_from_cache(rows: Sequence[Mapping[str, object]]) -> list[GroupItem]:
+    return [dict(row) for row in rows]
+
+
+def _new_discovery_buffers() -> DiscoveryBuffers:
+    # Keep buffer order aligned with DiscoveryBuffers above.
+    return [], [], [], [], [], [], set(), set(), [], [], [], [], [], []
+
+
+def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult:
+    files_found = 0
+    cache_hits = 0
+    files_skipped = 0
+    collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
+    cached_segment_projection = _coerce_segment_report_projection(
+        getattr(cache, "segment_report_projection", None)
+    )
+    (
+        cached_units,
+        cached_blocks,
+        cached_segments,
+        cached_class_metrics,
+        cached_module_deps,
+        cached_dead_candidates,
+        cached_referenced_names,
+        cached_referenced_qualnames,
+        cached_typing_modules,
+        cached_docstring_modules,
+        cached_api_modules,
+        cached_security_surfaces,
+        files_to_process,
+        skipped_warnings,
+    ) = _new_discovery_buffers()
+    cached_sf: list[StructuralFindingGroup] = []
+    cached_source_stats_by_file: list[tuple[str, int, int, int, int]] = []
+    cached_lines = 0
+    cached_functions = 0
+    cached_methods = 0
+    cached_classes = 0
+    all_file_paths: list[str] = []
+
+    for filepath in iter_py_files(str(boot.root)):
+        files_found += 1
+        all_file_paths.append(filepath)
+        try:
+            stat = file_stat_signature(filepath)
+        except OSError as exc:
+            files_skipped += 1
+            skipped_warnings.append(f"{filepath}: {exc}")
+            continue
+        cached = cache.get_file_entry(filepath)
+        if cached and cached.get("stat") == stat:
+            cached_source_stats = _usable_cached_source_stats(
+                cached,
+                skip_metrics=boot.args.skip_metrics,
+                collect_structural_findings=collect_structural_findings,
+            )
+            if cached_source_stats is None:
+                files_to_process.append(filepath)
+                continue
+            cache_hits += 1
+            lines, functions, methods, classes = cached_source_stats
+            cached_lines += lines
+            cached_functions += functions
+            cached_methods += methods
+            cached_classes += classes
+            cached_source_stats_by_file.append(
+                (filepath, lines, functions, methods, classes)
+            )
+            cached_units.extend(_group_items_from_cache(cached["units"]))
+            cached_blocks.extend(_group_items_from_cache(cached["blocks"]))
+            cached_segments.extend(_group_items_from_cache(cached["segments"]))
+            if not boot.args.skip_metrics:
+                (
+                    class_metrics,
+                    module_deps,
+                    dead_candidates,
+                    referenced_names,
+                    referenced_qualnames,
+                    typing_coverage,
+                    docstring_coverage,
+                    api_surface,
+                    security_surfaces,
+                ) = _load_cached_metrics_extended(cached, filepath=filepath)
+                cached_class_metrics.extend(class_metrics)
+                cached_module_deps.extend(module_deps)
+                cached_dead_candidates.extend(dead_candidates)
+                cached_referenced_names.update(referenced_names)
+                cached_referenced_qualnames.update(referenced_qualnames)
+                if typing_coverage is not None:
+                    cached_typing_modules.append(typing_coverage)
+                if docstring_coverage is not None:
+                    cached_docstring_modules.append(docstring_coverage)
+                if api_surface is not None:
+                    cached_api_modules.append(api_surface)
+                cached_security_surfaces.extend(security_surfaces)
+            if collect_structural_findings:
+                cached_sf.extend(
+                    _decode_cached_structural_finding_group(group_dict, filepath)
+                    for group_dict in cached.get("structural_findings") or []
+                )
+            continue
+        files_to_process.append(filepath)
+
+    cache.prune_file_entries(all_file_paths)
+
+    return DiscoveryResult(
+        files_found=files_found,
+        cache_hits=cache_hits,
+        files_skipped=files_skipped,
+        all_file_paths=tuple(all_file_paths),
+        cached_units=tuple(sorted(cached_units, key=_group_item_sort_key)),
+        cached_blocks=tuple(sorted(cached_blocks, key=_group_item_sort_key)),
+        cached_segments=tuple(sorted(cached_segments, key=_group_item_sort_key)),
+        cached_class_metrics=tuple(
+            sorted(cached_class_metrics, key=_class_metric_sort_key)
+        ),
+        cached_module_deps=tuple(sorted(cached_module_deps, key=_module_dep_sort_key)),
+        cached_dead_candidates=tuple(
+            sorted(cached_dead_candidates, key=_dead_candidate_sort_key)
+        ),
+        cached_referenced_names=frozenset(cached_referenced_names),
+        cached_security_surfaces=tuple(
+            sorted(
+                cached_security_surfaces,
+                key=lambda item: (
+                    item.filepath,
+                    item.start_line,
+                    item.end_line,
+                    item.qualname,
+                    item.category,
+                    item.capability,
+                    item.evidence_symbol,
+                ),
+            )
+        ),
+        cached_referenced_qualnames=frozenset(cached_referenced_qualnames),
+        cached_typing_modules=tuple(
+            sorted(cached_typing_modules, key=lambda item: (item.filepath, item.module))
+        ),
+        cached_docstring_modules=tuple(
+            sorted(
+                cached_docstring_modules,
+                key=lambda item: (item.filepath, item.module),
+            )
+        ),
+        cached_api_modules=tuple(
+            sorted(cached_api_modules, key=lambda item: (item.filepath, item.module))
+        ),
+        files_to_process=tuple(files_to_process),
+        skipped_warnings=tuple(sorted(skipped_warnings)),
+        cached_structural_findings=tuple(cached_sf),
+        cached_segment_report_projection=cached_segment_projection,
+        cached_lines=cached_lines,
+        cached_functions=cached_functions,
+        cached_methods=cached_methods,
+        cached_classes=cached_classes,
+        cached_source_stats_by_file=tuple(
+            sorted(cached_source_stats_by_file, key=lambda row: row[0])
+        ),
+    )
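+
+
+# Cache-hit contract sketch (an illustrative summary of the loop above): a
+# file is served from cache only when all three conditions hold:
+#
+#     cached = cache.get_file_entry(filepath)
+#     hit = (
+#         cached is not None
+#         and cached.get("stat") == file_stat_signature(filepath)
+#         and _usable_cached_source_stats(
+#             cached,
+#             skip_metrics=boot.args.skip_metrics,
+#             collect_structural_findings=collect_structural_findings,
+#         ) is not None
+#     )
+#
+# Otherwise the file joins files_to_process for a fresh parse.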
diff --git a/codeclone/core/discovery_cache.py b/codeclone/core/discovery_cache.py
new file mode 100644
index 0000000..995fdd4
--- /dev/null
+++ b/codeclone/core/discovery_cache.py
@@ -0,0 +1,581 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Literal
+
+from ..cache.entries import (
+    CacheEntry,
+    ClassMetricsDict,
+    DeadCandidateDict,
+    ModuleDepDict,
+    SecuritySurfaceDict,
+    StructuralFindingGroupDict,
+)
+from ..models import (
+    ApiParamSpec,
+    ClassMetrics,
+    DeadCandidate,
+    ModuleApiSurface,
+    ModuleDep,
+    ModuleDocstringCoverage,
+    ModuleTypingCoverage,
+    PublicSymbol,
+    SecuritySurface,
+    SecuritySurfaceCategory,
+    SecuritySurfaceClassificationMode,
+    SecuritySurfaceEvidenceKind,
+    SecuritySurfaceLocationScope,
+    StructuralFindingGroup,
+    StructuralFindingOccurrence,
+)
+from ..paths import is_test_filepath
+from ..utils.coerce import as_mapping
+from ._types import _as_sorted_str_tuple
+
+_ApiParamKind = Literal["pos_only", "pos_or_kw", "vararg", "kw_only", "kwarg"]
+_PublicSymbolKind = Literal["function", "class", "method", "constant"]
+_ExportedViaKind = Literal["all", "name"]
+_RiskLevel = Literal["low", "medium", "high"]
+_ImportType = Literal["import", "from_import"]
+_DeadCandidateKind = Literal["function", "class", "method", "import"]
+
+
+def _api_param_kind(value: object) -> _ApiParamKind | None:
+    match value:
+        case "pos_only":
+            return "pos_only"
+        case "pos_or_kw":
+            return "pos_or_kw"
+        case "vararg":
+            return "vararg"
+        case "kw_only":
+            return "kw_only"
+        case "kwarg":
+            return "kwarg"
+        case _:
+            return None
+
+
+def _public_symbol_kind(value: object) -> _PublicSymbolKind | None:
+    match value:
+        case "function":
+            return "function"
+        case "class":
+            return "class"
+        case "method":
+            return "method"
+        case "constant":
+            return "constant"
+        case _:
+            return None
+
+
+def _exported_via_kind(value: object) -> _ExportedViaKind | None:
+    match value:
+        case "all":
+            return "all"
+        case "name":
+            return "name"
+        case _:
+            return None
+
+
+def _risk_level(value: object) -> _RiskLevel | None:
+    match value:
+        case "low":
+            return "low"
+        case "medium":
+            return "medium"
+        case "high":
+            return "high"
+        case _:
+            return None
+
+
+def _import_type(value: object) -> _ImportType | None:
+    match value:
+        case "import":
+            return "import"
+        case "from_import":
+            return "from_import"
+        case _:
+            return None
+
+
+def _dead_candidate_kind(value: object) -> _DeadCandidateKind | None:
+    match value:
+        case "function":
+            return "function"
+        case "class":
+            return "class"
+        case "method":
+            return "method"
+        case "import":
+            return "import"
+        case _:
+            return None
+
+
+def _security_surface_category(value: object) -> SecuritySurfaceCategory | None:
+    match value:
+        case (
+            "archive_extraction"
+            | "crypto_transport"
+            | "database_boundary"
+            | "deserialization"
+            | "dynamic_execution"
+            | "dynamic_loading"
+            | "filesystem_mutation"
+            | "identity_token"
+            | "network_boundary"
+            | "process_boundary"
+        ):
+            return value
+        case _:
+            return None
+
+
+def _security_surface_location_scope(
+    value: object,
+) -> SecuritySurfaceLocationScope | None:
+    match value:
+        case "module" | "class" | "callable":
+            return value
+        case _:
+            return None
+
+
+def _security_surface_classification_mode(
+    value: object,
+) -> SecuritySurfaceClassificationMode | None:
+    match value:
+        case "exact_builtin" | "exact_call" | "exact_import":
+            return value
+        case _:
+            return None
+
+
+def _security_surface_evidence_kind(
+    value: object,
+) -> SecuritySurfaceEvidenceKind | None:
+    match value:
+        case "builtin" | "call" | "import":
+            return value
+        case _:
+            return None
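+
+
+# Narrowing sketch (illustrative): the match-based validators above turn
+# untyped cache values into Literal types and reject everything else:
+#
+#     _risk_level("high")     # "high", typed as _RiskLevel
+#     _risk_level("HIGH")     # None: unknown strings never pass through
+#     _import_type("import")  # "import"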
+def _api_param_kind(value: object) -> _ApiParamKind | None:
+    match value:
+        case "pos_only":
+            return "pos_only"
+        case "pos_or_kw":
+            return "pos_or_kw"
+        case "vararg":
+            return "vararg"
+        case "kw_only":
+            return "kw_only"
+        case "kwarg":
+            return "kwarg"
+        case _:
+            return None
+
+
+def _public_symbol_kind(value: object) -> _PublicSymbolKind | None:
+    match value:
+        case "function":
+            return "function"
+        case "class":
+            return "class"
+        case "method":
+            return "method"
+        case "constant":
+            return "constant"
+        case _:
+            return None
+
+
+def _exported_via_kind(value: object) -> _ExportedViaKind | None:
+    match value:
+        case "all":
+            return "all"
+        case "name":
+            return "name"
+        case _:
+            return None
+
+
+def _risk_level(value: object) -> _RiskLevel | None:
+    match value:
+        case "low":
+            return "low"
+        case "medium":
+            return "medium"
+        case "high":
+            return "high"
+        case _:
+            return None
+
+
+def _import_type(value: object) -> _ImportType | None:
+    match value:
+        case "import":
+            return "import"
+        case "from_import":
+            return "from_import"
+        case _:
+            return None
+
+
+def _dead_candidate_kind(value: object) -> _DeadCandidateKind | None:
+    match value:
+        case "function":
+            return "function"
+        case "class":
+            return "class"
+        case "method":
+            return "method"
+        case "import":
+            return "import"
+        case _:
+            return None
+
+
+def _security_surface_category(value: object) -> SecuritySurfaceCategory | None:
+    match value:
+        case (
+            "archive_extraction"
+            | "crypto_transport"
+            | "database_boundary"
+            | "deserialization"
+            | "dynamic_execution"
+            | "dynamic_loading"
+            | "filesystem_mutation"
+            | "identity_token"
+            | "network_boundary"
+            | "process_boundary"
+        ):
+            return value
+        case _:
+            return None
+
+
+def _security_surface_location_scope(
+    value: object,
+) -> SecuritySurfaceLocationScope | None:
+    match value:
+        case "module" | "class" | "callable":
+            return value
+        case _:
+            return None
+
+
+def _security_surface_classification_mode(
+    value: object,
+) -> SecuritySurfaceClassificationMode | None:
+    match value:
+        case "exact_builtin" | "exact_call" | "exact_import":
+            return value
+        case _:
+            return None
+
+
+def _security_surface_evidence_kind(
+    value: object,
+) -> SecuritySurfaceEvidenceKind | None:
+    match value:
+        case "builtin" | "call" | "import":
+            return value
+        case _:
+            return None
+
+
+def decode_cached_structural_finding_group(
+    group_dict: StructuralFindingGroupDict,
+    filepath: str,
+) -> StructuralFindingGroup:
+    finding_kind = group_dict["finding_kind"]
+    finding_key = group_dict["finding_key"]
+    signature = group_dict["signature"]
+    items = tuple(
+        StructuralFindingOccurrence(
+            finding_kind=finding_kind,
+            finding_key=finding_key,
+            file_path=filepath,
+            qualname=item["qualname"],
+            start=item["start"],
+            end=item["end"],
+            signature=signature,
+        )
+        for item in group_dict["items"]
+    )
+    return StructuralFindingGroup(
+        finding_kind=finding_kind,
+        finding_key=finding_key,
+        signature=signature,
+        items=items,
+    )
+
+
+def _cache_entry_has_metrics(entry: CacheEntry) -> bool:
+    metric_keys = (
+        "class_metrics",
+        "module_deps",
+        "dead_candidates",
+        "referenced_names",
+        "referenced_qualnames",
+        "import_names",
+        "class_names",
+    )
+    return all(key in entry and isinstance(entry.get(key), list) for key in metric_keys)
+
+
+def _cache_entry_has_structural_findings(entry: CacheEntry) -> bool:
+    return "structural_findings" in entry
+
+
+def _cache_entry_source_stats(entry: CacheEntry) -> tuple[int, int, int, int] | None:
+    stats_obj = entry.get("source_stats")
+    if not isinstance(stats_obj, dict):
+        return None
+    lines = stats_obj.get("lines")
+    functions = stats_obj.get("functions")
+    methods = stats_obj.get("methods")
+    classes = stats_obj.get("classes")
+    if not (
+        isinstance(lines, int)
+        and isinstance(functions, int)
+        and isinstance(methods, int)
+        and isinstance(classes, int)
+        and lines >= 0
+        and functions >= 0
+        and methods >= 0
+        and classes >= 0
+    ):
+        return None
+    return lines, functions, methods, classes
+
+
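+# A cached entry is only reusable when it carries everything the current run
+# needs; otherwise the file is re-parsed so no report section silently goes
+# missing.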
+def usable_cached_source_stats(
+    entry: CacheEntry,
+    *,
+    skip_metrics: bool,
+    collect_structural_findings: bool,
+) -> tuple[int, int, int, int] | None:
+    if not skip_metrics and not _cache_entry_has_metrics(entry):
+        return None
+    if collect_structural_findings and not _cache_entry_has_structural_findings(entry):
+        return None
+    return _cache_entry_source_stats(entry)
+
+
+def _cache_dict_module_fields(
+    value: object,
+) -> tuple[Mapping[str, object], str, str] | None:
+    if not isinstance(value, Mapping):
+        return None
+    row = as_mapping(value)
+    module = row.get("module")
+    filepath = row.get("filepath")
+    if not isinstance(module, str) or not isinstance(filepath, str):
+        return None
+    return row, module, filepath
+
+
+def _cache_dict_int_fields(
+    row: Mapping[str, object],
+    *keys: str,
+) -> tuple[int, ...] | None:
+    values: list[int] = []
+    for key in keys:
+        value = row.get(key)
+        if not isinstance(value, int):
+            return None
+        values.append(value)
+    return tuple(values)
+
+
+def _api_param_fields(
+    row: Mapping[str, object],
+) -> tuple[str, _ApiParamKind, bool, str] | None:
+    name = row.get("name")
+    validated_kind = _api_param_kind(row.get("kind"))
+    has_default = row.get("has_default")
+    annotation_hash = row.get("annotation_hash", "")
+    if (
+        not isinstance(name, str)
+        or validated_kind is None
+        or not isinstance(has_default, bool)
+        or not isinstance(annotation_hash, str)
+    ):
+        return None
+    return name, validated_kind, has_default, annotation_hash
+
+
+def _typing_coverage_from_cache_dict(value: object) -> ModuleTypingCoverage | None:
+    row_info = _cache_dict_module_fields(value)
+    if row_info is None:
+        return None
+    row, module, filepath = row_info
+    int_fields = _cache_dict_int_fields(
+        row,
+        "callable_count",
+        "params_total",
+        "params_annotated",
+        "returns_total",
+        "returns_annotated",
+        "any_annotation_count",
+    )
+    if int_fields is None:
+        return None
+    return ModuleTypingCoverage(
+        module=module,
+        filepath=filepath,
+        callable_count=int_fields[0],
+        params_total=int_fields[1],
+        params_annotated=int_fields[2],
+        returns_total=int_fields[3],
+        returns_annotated=int_fields[4],
+        any_annotation_count=int_fields[5],
+    )
+
+
+def _docstring_coverage_from_cache_dict(
+    value: object,
+) -> ModuleDocstringCoverage | None:
+    row_info = _cache_dict_module_fields(value)
+    if row_info is None:
+        return None
+    row, module, filepath = row_info
+    totals = _cache_dict_int_fields(
+        row,
+        "public_symbol_total",
+        "public_symbol_documented",
+    )
+    if totals is None:
+        return None
+    return ModuleDocstringCoverage(
+        module=module,
+        filepath=filepath,
+        public_symbol_total=totals[0],
+        public_symbol_documented=totals[1],
+    )
+
+
+def _api_param_spec_from_cache_dict(value: object) -> ApiParamSpec | None:
+    row = as_mapping(value)
+    if not row:
+        return None
+    fields = _api_param_fields(row)
+    if fields is None:
+        return None
+    name, validated_kind, has_default, annotation_hash = fields
+    return ApiParamSpec(
+        name=name,
+        kind=validated_kind,
+        has_default=has_default,
+        annotation_hash=annotation_hash,
+    )
+
+
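+# Symbol decoding is all-or-nothing: one malformed parameter invalidates the
+# whole PublicSymbol rather than yielding a partially decoded API surface.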
+def _public_symbol_from_cache_dict(value: object) -> PublicSymbol | None:
+    row = as_mapping(value)
+    if not row:
+        return None
+    qualname = row.get("qualname")
+    start_line = row.get("start_line")
+    end_line = row.get("end_line")
+    returns_hash = row.get("returns_hash", "")
+    params_raw = row.get("params", [])
+    validated_kind = _public_symbol_kind(row.get("kind"))
+    validated_exported_via = _exported_via_kind(row.get("exported_via", "name"))
+    if (
+        not isinstance(qualname, str)
+        or validated_kind is None
+        or not isinstance(start_line, int)
+        or not isinstance(end_line, int)
+        or validated_exported_via is None
+        or not isinstance(returns_hash, str)
+        or not isinstance(params_raw, list)
+    ):
+        return None
+    params: list[ApiParamSpec] = []
+    for param in params_raw:
+        if not isinstance(param, dict):
+            return None
+        parsed = _api_param_spec_from_cache_dict(param)
+        if parsed is None:
+            return None
+        params.append(parsed)
+    return PublicSymbol(
+        qualname=qualname,
+        kind=validated_kind,
+        start_line=start_line,
+        end_line=end_line,
+        params=tuple(params),
+        returns_hash=returns_hash,
+        exported_via=validated_exported_via,
+    )
+
+
+def _api_surface_from_cache_dict(value: object) -> ModuleApiSurface | None:
+    row_info = _cache_dict_module_fields(value)
+    if row_info is None:
+        return None
+    row, module, filepath = row_info
+    all_declared_raw = row.get("all_declared", [])
+    symbols_raw = row.get("symbols", [])
+    if (
+        not isinstance(all_declared_raw, list)
+        or not isinstance(symbols_raw, list)
+        or not all(isinstance(item, str) for item in all_declared_raw)
+    ):
+        return None
+    symbols: list[PublicSymbol] = []
+    for item in symbols_raw:
+        parsed = _public_symbol_from_cache_dict(item)
+        if parsed is None:
+            return None
+        symbols.append(parsed)
+    return ModuleApiSurface(
+        module=module,
+        filepath=filepath,
+        all_declared=tuple(sorted(set(all_declared_raw))) or None,
+        symbols=tuple(sorted(symbols, key=lambda item: item.qualname)),
+    )
+
+
+def _class_metric_from_cache_row(metric_row: ClassMetricsDict) -> ClassMetrics | None:
+    risk_coupling = _risk_level(metric_row["risk_coupling"])
+    risk_cohesion = _risk_level(metric_row["risk_cohesion"])
+    if (
+        not metric_row.get("qualname")
+        or not metric_row.get("filepath")
+        or risk_coupling is None
+        or risk_cohesion is None
+    ):
+        return None
+    return ClassMetrics(
+        qualname=metric_row["qualname"],
+        filepath=metric_row["filepath"],
+        start_line=metric_row["start_line"],
+        end_line=metric_row["end_line"],
+        cbo=metric_row["cbo"],
+        lcom4=metric_row["lcom4"],
+        method_count=metric_row["method_count"],
+        instance_var_count=metric_row["instance_var_count"],
+        risk_coupling=risk_coupling,
+        risk_cohesion=risk_cohesion,
+        coupled_classes=_as_sorted_str_tuple(metric_row.get("coupled_classes", [])),
+    )
+
+
+def _module_dep_from_cache_row(dep_row: ModuleDepDict) -> ModuleDep | None:
+    import_type = _import_type(dep_row["import_type"])
+    if not dep_row.get("source") or not dep_row.get("target") or import_type is None:
+        return None
+    return ModuleDep(
+        source=dep_row["source"],
+        target=dep_row["target"],
+        import_type=import_type,
+        line=dep_row["line"],
+    )
+
+
+def _dead_candidate_from_cache_row(dead_row: DeadCandidateDict) -> DeadCandidate | None:
+    kind = _dead_candidate_kind(dead_row["kind"])
+    if (
+        not dead_row.get("qualname")
+        or not dead_row.get("local_name")
+        or not dead_row.get("filepath")
+        or kind is None
+    ):
+        return None
+    return DeadCandidate(
+        qualname=dead_row["qualname"],
+        local_name=dead_row["local_name"],
+        filepath=dead_row["filepath"],
+        start_line=dead_row["start_line"],
+        end_line=dead_row["end_line"],
+        kind=kind,
+        suppressed_rules=_as_sorted_str_tuple(dead_row.get("suppressed_rules", [])),
+    )
+
+
+def _security_surface_from_cache_row(
+    surface_row: SecuritySurfaceDict,
+) -> SecuritySurface | None:
+    category = _security_surface_category(surface_row.get("category"))
+    location_scope = _security_surface_location_scope(surface_row.get("location_scope"))
+    classification_mode = _security_surface_classification_mode(
+        surface_row.get("classification_mode")
+    )
+    evidence_kind = _security_surface_evidence_kind(surface_row.get("evidence_kind"))
+    if (
+        category is None
+        or location_scope is None
+        or classification_mode is None
+        or evidence_kind is None
+    ):
+        return None
+    return SecuritySurface(
+        category=category,
+        capability=surface_row["capability"],
+        module=surface_row["module"],
+        filepath=surface_row["filepath"],
+        qualname=surface_row["qualname"],
+        start_line=surface_row["start_line"],
+        end_line=surface_row["end_line"],
+        location_scope=location_scope,
+        classification_mode=classification_mode,
+        evidence_kind=evidence_kind,
+        evidence_symbol=surface_row["evidence_symbol"],
+    )
+
+
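+# Decodes every metrics section of a cache entry in one pass. For test files
+# the referenced-name sets decode to empty, so test-only references do not
+# count as usage downstream (e.g. in dead-code analysis).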
+def load_cached_metrics_extended(
+    entry: CacheEntry,
+    *,
+    filepath: str,
+) -> tuple[
+    tuple[ClassMetrics, ...],
+    tuple[ModuleDep, ...],
+    tuple[DeadCandidate, ...],
+    frozenset[str],
+    frozenset[str],
+    ModuleTypingCoverage | None,
+    ModuleDocstringCoverage | None,
+    ModuleApiSurface | None,
+    tuple[SecuritySurface, ...],
+]:
+    class_metrics_rows: list[ClassMetricsDict] = entry.get("class_metrics", [])
+    class_metrics_items: list[ClassMetrics] = []
+    for metric_row in class_metrics_rows:
+        parsed_metric = _class_metric_from_cache_row(metric_row)
+        if parsed_metric is not None:
+            class_metrics_items.append(parsed_metric)
+    class_metrics = tuple(class_metrics_items)
+    module_dep_rows: list[ModuleDepDict] = entry.get("module_deps", [])
+    module_dep_items: list[ModuleDep] = []
+    for dep_row in module_dep_rows:
+        parsed_dep = _module_dep_from_cache_row(dep_row)
+        if parsed_dep is not None:
+            module_dep_items.append(parsed_dep)
+    module_deps = tuple(module_dep_items)
+    dead_rows: list[DeadCandidateDict] = entry.get("dead_candidates", [])
+    dead_candidate_items: list[DeadCandidate] = []
+    for dead_row in dead_rows:
+        parsed_dead = _dead_candidate_from_cache_row(dead_row)
+        if parsed_dead is not None:
+            dead_candidate_items.append(parsed_dead)
+    dead_candidates = tuple(dead_candidate_items)
+    referenced_names = (
+        frozenset()
+        if is_test_filepath(filepath)
+        else frozenset(entry.get("referenced_names", []))
+    )
+    referenced_qualnames = (
+        frozenset()
+        if is_test_filepath(filepath)
+        else frozenset(entry.get("referenced_qualnames", []))
+    )
+    security_surface_rows: list[SecuritySurfaceDict] = entry.get(
+        "security_surfaces", []
+    )
+    security_surface_items: list[SecuritySurface] = []
+    for surface_row in security_surface_rows:
+        parsed_surface = _security_surface_from_cache_row(surface_row)
+        if parsed_surface is not None:
+            security_surface_items.append(parsed_surface)
+    return (
+        class_metrics,
+        module_deps,
+        dead_candidates,
+        referenced_names,
+        referenced_qualnames,
+        _typing_coverage_from_cache_dict(entry.get("typing_coverage")),
+        _docstring_coverage_from_cache_dict(entry.get("docstring_coverage")),
+        _api_surface_from_cache_dict(entry.get("api_surface")),
+        tuple(security_surface_items),
+    )
diff --git a/codeclone/core/metrics_payload.py b/codeclone/core/metrics_payload.py
new file mode 100644
index 0000000..cc69d18
--- /dev/null
+++ b/codeclone/core/metrics_payload.py
@@ -0,0 +1,323 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
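+"""Build the metrics section of the report payload from analysis results."""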
+
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ..analysis.suppressions import (
+    DEAD_CODE_RULE_ID,
+    INLINE_CODECLONE_SUPPRESSION_SOURCE,
+)
+from ..domain.findings import CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING
+from ..domain.quality import CONFIDENCE_HIGH, RISK_LOW
+from ..metrics.overloaded_modules import build_overloaded_modules_payload
+from ..models import (
+    ClassMetrics,
+    CoverageJoinResult,
+    DeadItem,
+    DepGraph,
+    GroupItemLike,
+    MetricsDiff,
+    ModuleDep,
+    ProjectMetrics,
+    SecuritySurface,
+)
+from ..utils.coerce import as_int, as_mapping, as_sequence, as_str
+from .api_surface_payload import (
+    _api_surface_rows,
+    _api_surface_summary,
+    _breaking_api_surface_rows,
+)
+from .coverage_payload import (
+    _coverage_adoption_rows,
+    _coverage_join_rows,
+    _coverage_join_summary,
+    _permille,
+)
+from .security_surfaces_payload import build_security_surfaces_payload
+
+
+def _enrich_metrics_report_payload(
+    *,
+    metrics_payload: Mapping[str, object],
+    metrics_diff: MetricsDiff | None,
+    coverage_adoption_diff_available: bool,
+    api_surface_diff_available: bool,
+) -> dict[str, object]:
+    enriched = {
+        key: (dict(value) if isinstance(value, Mapping) else value)
+        for key, value in metrics_payload.items()
+    }
+    coverage_adoption = dict(as_mapping(enriched.get("coverage_adoption")))
+    coverage_summary = dict(as_mapping(coverage_adoption.get("summary")))
+    if coverage_summary:
+        coverage_summary["baseline_diff_available"] = coverage_adoption_diff_available
+        coverage_summary["param_delta"] = (
+            int(metrics_diff.typing_param_permille_delta)
+            if metrics_diff is not None and coverage_adoption_diff_available
+            else 0
+        )
+        coverage_summary["return_delta"] = (
+            int(metrics_diff.typing_return_permille_delta)
+            if metrics_diff is not None and coverage_adoption_diff_available
+            else 0
+        )
+        coverage_summary["docstring_delta"] = (
+            int(metrics_diff.docstring_permille_delta)
+            if metrics_diff is not None and coverage_adoption_diff_available
+            else 0
+        )
+        coverage_adoption["summary"] = coverage_summary
+        enriched["coverage_adoption"] = coverage_adoption
+
+    api_surface = dict(as_mapping(enriched.get("api_surface")))
+    api_summary = dict(as_mapping(api_surface.get("summary")))
+    api_items = list(as_sequence(api_surface.get("items")))
+    if api_summary:
+        api_summary["baseline_diff_available"] = api_surface_diff_available
+        api_summary["added"] = (
+            len(metrics_diff.new_api_symbols)
+            if metrics_diff is not None and api_surface_diff_available
+            else 0
+        )
+        api_summary["breaking"] = (
+            len(metrics_diff.new_api_breaking_changes)
+            if metrics_diff is not None and api_surface_diff_available
+            else 0
+        )
+        api_surface["summary"] = api_summary
+        if (
+            metrics_diff is not None
+            and api_surface_diff_available
+            and metrics_diff.new_api_breaking_changes
+        ):
+            api_items.extend(
+                _breaking_api_surface_rows(metrics_diff.new_api_breaking_changes)
+            )
+        api_surface["items"] = api_items
+    if api_surface:
+        enriched["api_surface"] = api_surface
+    return enriched
+
+
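+# Builds one dict per metric family (complexity, coupling, cohesion,
+# dependencies, dead code, health, coverage adoption, API surface, overloaded
+# modules, security surfaces); renderers consume it as-is.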
+def build_metrics_report_payload(
+    *,
+    scan_root: str = "",
+    project_metrics: ProjectMetrics,
+    dep_graph: DepGraph | None = None,
+    coverage_join: CoverageJoinResult | None = None,
+    units: Sequence[GroupItemLike],
+    class_metrics: Sequence[ClassMetrics],
+    module_deps: Sequence[ModuleDep] = (),
+    security_surfaces: Sequence[SecuritySurface] = (),
+    source_stats_by_file: Sequence[tuple[str, int, int, int, int]] = (),
+    suppressed_dead_code: Sequence[DeadItem] = (),
+) -> dict[str, object]:
+    sorted_units = sorted(
+        units,
+        key=lambda item: (
+            as_int(item.get("cyclomatic_complexity"), 0),
+            as_int(item.get("nesting_depth"), 0),
+            as_str(item.get("qualname")),
+        ),
+        reverse=True,
+    )
+    complexity_rows = [
+        {
+            "qualname": as_str(item.get("qualname")),
+            "filepath": as_str(item.get("filepath")),
+            "start_line": as_int(item.get("start_line"), 0),
+            "end_line": as_int(item.get("end_line"), 0),
+            "cyclomatic_complexity": as_int(item.get("cyclomatic_complexity"), 1),
+            "nesting_depth": as_int(item.get("nesting_depth"), 0),
+            "risk": as_str(item.get("risk"), RISK_LOW),
+        }
+        for item in sorted_units
+    ]
+    classes_sorted = sorted(
+        class_metrics,
+        key=lambda item: (item.cbo, item.lcom4, item.qualname),
+        reverse=True,
+    )
+    coupling_rows = [
+        {
+            "qualname": metric.qualname,
+            "filepath": metric.filepath,
+            "start_line": metric.start_line,
+            "end_line": metric.end_line,
+            "cbo": metric.cbo,
+            "risk": metric.risk_coupling,
+            "coupled_classes": list(metric.coupled_classes),
+        }
+        for metric in classes_sorted
+    ]
+    cohesion_rows = [
+        {
+            "qualname": metric.qualname,
+            "filepath": metric.filepath,
+            "start_line": metric.start_line,
+            "end_line": metric.end_line,
+            "lcom4": metric.lcom4,
+            "risk": metric.risk_cohesion,
+            "method_count": metric.method_count,
+            "instance_var_count": metric.instance_var_count,
+        }
+        for metric in classes_sorted
+    ]
+    active_dead_items = tuple(project_metrics.dead_code)
+    suppressed_dead_items = tuple(suppressed_dead_code)
+    coverage_adoption_rows = _coverage_adoption_rows(project_metrics)
+    api_surface_summary = _api_surface_summary(project_metrics.api_surface)
+    api_surface_items = _api_surface_rows(project_metrics.api_surface)
+    coverage_join_summary = _coverage_join_summary(coverage_join)
+    coverage_join_items = _coverage_join_rows(coverage_join)
+
+    def _serialize_dead_item(
+        item: DeadItem,
+        *,
+        suppressed: bool = False,
+    ) -> dict[str, object]:
+        payload: dict[str, object] = {
+            "qualname": item.qualname,
+            "filepath": item.filepath,
+            "start_line": item.start_line,
+            "end_line": item.end_line,
+            "kind": item.kind,
+            "confidence": item.confidence,
+        }
+        if suppressed:
+            payload["suppressed_by"] = [
+                {
+                    "rule": DEAD_CODE_RULE_ID,
+                    "source": INLINE_CODECLONE_SUPPRESSION_SOURCE,
+                }
+            ]
+        return payload
+
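+    # Assemble the payload keyed by finding category plus the report-only
+    # sections (health, coverage adoption, API surface, security surfaces).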
+    payload = {
+        CATEGORY_COMPLEXITY: {
+            "functions": complexity_rows,
+            "summary": {
+                "total": len(complexity_rows),
+                "average": round(project_metrics.complexity_avg, 2),
+                "max": project_metrics.complexity_max,
+                "high_risk": len(project_metrics.high_risk_functions),
+            },
+        },
+        CATEGORY_COUPLING: {
+            "classes": coupling_rows,
+            "summary": {
+                "total": len(coupling_rows),
+                "average": round(project_metrics.coupling_avg, 2),
+                "max": project_metrics.coupling_max,
+                "high_risk": len(project_metrics.high_risk_classes),
+            },
+        },
+        CATEGORY_COHESION: {
+            "classes": cohesion_rows,
+            "summary": {
+                "total": len(cohesion_rows),
+                "average": round(project_metrics.cohesion_avg, 2),
+                "max": project_metrics.cohesion_max,
+                "low_cohesion": len(project_metrics.low_cohesion_classes),
+            },
+        },
+        "dependencies": {
+            "modules": project_metrics.dependency_modules,
+            "edges": project_metrics.dependency_edges,
+            "max_depth": project_metrics.dependency_max_depth,
+            "avg_depth": (
+                round(dep_graph.avg_depth, 2) if dep_graph is not None else 0.0
+            ),
+            "p95_depth": dep_graph.p95_depth if dep_graph is not None else 0,
+            "cycles": [list(cycle) for cycle in project_metrics.dependency_cycles],
+            "longest_chains": [
+                list(chain) for chain in project_metrics.dependency_longest_chains
+            ],
+            "edge_list": [
+                {
+                    "source": edge.source,
+                    "target": edge.target,
+                    "import_type": edge.import_type,
+                    "line": edge.line,
+                }
+                for edge in project_metrics.dependency_edge_list
+            ],
+        },
+        "dead_code": {
+            "items": [_serialize_dead_item(item) for item in active_dead_items],
+            "suppressed_items": [
+                _serialize_dead_item(item, suppressed=True)
+                for item in suppressed_dead_items
+            ],
+            "summary": {
+                "total": len(active_dead_items),
+                "critical": sum(
+                    1
+                    for item in active_dead_items
+                    if item.confidence == CONFIDENCE_HIGH
+                ),
+                "high_confidence": sum(
+                    1
+                    for item in active_dead_items
+                    if item.confidence == CONFIDENCE_HIGH
+                ),
+                "suppressed": len(suppressed_dead_items),
+            },
+        },
+        "health": {
+            "score": project_metrics.health.total,
+            "grade": project_metrics.health.grade,
+            "dimensions": dict(project_metrics.health.dimensions),
+        },
+        "coverage_adoption": {
+            "summary": {
+                "modules": len(coverage_adoption_rows),
+                "params_total": project_metrics.typing_param_total,
+                "params_annotated": project_metrics.typing_param_annotated,
+                "param_permille": _permille(
+                    project_metrics.typing_param_annotated,
+                    project_metrics.typing_param_total,
+                ),
+                "returns_total": project_metrics.typing_return_total,
+                "returns_annotated": project_metrics.typing_return_annotated,
+                "return_permille": _permille(
+                    project_metrics.typing_return_annotated,
+                    project_metrics.typing_return_total,
+                ),
+                "public_symbol_total": project_metrics.docstring_public_total,
+                "public_symbol_documented": project_metrics.docstring_public_documented,
+                "docstring_permille": _permille(
+                    project_metrics.docstring_public_documented,
+                    project_metrics.docstring_public_total,
+                ),
+                "typing_any_count": project_metrics.typing_any_count,
+            },
+            "items": coverage_adoption_rows,
+        },
+        "api_surface": {
+            "summary": dict(api_surface_summary),
+            "items": api_surface_items,
+        },
+        "overloaded_modules": build_overloaded_modules_payload(
+            scan_root=scan_root,
+            source_stats_by_file=source_stats_by_file,
+            units=units,
+            class_metrics=class_metrics,
+            module_deps=module_deps,
+        ),
+        "security_surfaces": build_security_surfaces_payload(
+            scan_root=scan_root,
+            surfaces=security_surfaces,
+        ),
+    }
+    if coverage_join is not None:
+        payload["coverage_join"] = {
+            "summary": dict(coverage_join_summary),
+            "items": coverage_join_items,
+        }
+    return payload
diff --git a/codeclone/core/parallelism.py b/codeclone/core/parallelism.py
new file mode 100644
index 0000000..3750670
--- /dev/null
+++ b/codeclone/core/parallelism.py
@@ -0,0 +1,355 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
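+"""Run per-file analysis across a process pool, merging cached results."""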
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+from ..cache.entries import SourceStatsDict
+from ..cache.store import Cache
+from ..models import (
+    ClassMetrics,
+    DeadCandidate,
+    GroupItem,
+    ModuleApiSurface,
+    ModuleDep,
+    ModuleDocstringCoverage,
+    ModuleTypingCoverage,
+    SecuritySurface,
+    StructuralFindingGroup,
+)
+from ._types import (
+    DEFAULT_BATCH_SIZE,
+    DEFAULT_RUNTIME_PROCESSES,
+    PARALLEL_MIN_FILES_FLOOR,
+    PARALLEL_MIN_FILES_PER_WORKER,
+    BootstrapResult,
+    DiscoveryResult,
+    FileProcessResult,
+    ProcessingResult,
+    _block_to_group_item,
+    _class_metric_sort_key,
+    _dead_candidate_sort_key,
+    _group_item_sort_key,
+    _module_dep_sort_key,
+    _segment_to_group_item,
+    _should_collect_structural_findings,
+    _unit_to_group_item,
+)
+from .worker import _invoke_process_file
+
+
+def _parallel_min_files(processes: int) -> int:
+    return max(PARALLEL_MIN_FILES_FLOOR, processes * PARALLEL_MIN_FILES_PER_WORKER)
+
+
+def _resolve_process_count(processes: object) -> int:
+    if not isinstance(processes, int):
+        return DEFAULT_RUNTIME_PROCESSES
+    return max(1, processes)
+
+
+def _should_use_parallel(files_count: int, processes: int) -> bool:
+    if processes <= 1:
+        return False
+    return files_count >= _parallel_min_files(processes)
+
+
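+# process() merges artifacts decoded from the cache with fresh per-file
+# results; when nothing needs re-analysis it returns the cached view directly.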
+def process(
+    *,
+    boot: BootstrapResult,
+    discovery: DiscoveryResult,
+    cache: Cache,
+    on_advance: Callable[[], None] | None = None,
+    on_worker_error: Callable[[str], None] | None = None,
+    on_parallel_fallback: Callable[[Exception], None] | None = None,
+    batch_size: int = DEFAULT_BATCH_SIZE,
+) -> ProcessingResult:
+    files_to_process = discovery.files_to_process
+    if not files_to_process:
+        return ProcessingResult(
+            units=discovery.cached_units,
+            blocks=discovery.cached_blocks,
+            segments=discovery.cached_segments,
+            class_metrics=discovery.cached_class_metrics,
+            module_deps=discovery.cached_module_deps,
+            dead_candidates=discovery.cached_dead_candidates,
+            referenced_names=discovery.cached_referenced_names,
+            security_surfaces=discovery.cached_security_surfaces,
+            referenced_qualnames=discovery.cached_referenced_qualnames,
+            typing_modules=discovery.cached_typing_modules,
+            docstring_modules=discovery.cached_docstring_modules,
+            api_modules=discovery.cached_api_modules,
+            files_analyzed=0,
+            files_skipped=discovery.files_skipped,
+            analyzed_lines=0,
+            analyzed_functions=0,
+            analyzed_methods=0,
+            analyzed_classes=0,
+            failed_files=(),
+            source_read_failures=(),
+            structural_findings=discovery.cached_structural_findings,
+            source_stats_by_file=discovery.cached_source_stats_by_file,
+        )
+
+    all_units: list[GroupItem] = list(discovery.cached_units)
+    all_blocks: list[GroupItem] = list(discovery.cached_blocks)
+    all_segments: list[GroupItem] = list(discovery.cached_segments)
+    all_class_metrics: list[ClassMetrics] = list(discovery.cached_class_metrics)
+    all_module_deps: list[ModuleDep] = list(discovery.cached_module_deps)
+    all_dead_candidates: list[DeadCandidate] = list(discovery.cached_dead_candidates)
+    all_referenced_names: set[str] = set(discovery.cached_referenced_names)
+    all_referenced_qualnames: set[str] = set(discovery.cached_referenced_qualnames)
+    all_security_surfaces: list[SecuritySurface] = list(
+        discovery.cached_security_surfaces
+    )
+    all_typing_modules: list[ModuleTypingCoverage] = list(
+        discovery.cached_typing_modules
+    )
+    all_docstring_modules: list[ModuleDocstringCoverage] = list(
+        discovery.cached_docstring_modules
+    )
+    all_api_modules: list[ModuleApiSurface] = list(discovery.cached_api_modules)
+
+    collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
+    collect_api_surface = not boot.args.skip_metrics and bool(
+        getattr(boot.args, "api_surface", False)
+    )
+    api_include_private_modules = bool(
+        getattr(boot.args, "api_include_private_modules", False)
+    )
+    files_analyzed = 0
+    files_skipped = discovery.files_skipped
+    analyzed_lines = 0
+    analyzed_functions = 0
+    analyzed_methods = 0
+    analyzed_classes = 0
+    all_structural_findings: list[StructuralFindingGroup] = list(
+        discovery.cached_structural_findings
+    )
+    source_stats_by_file: dict[str, tuple[int, int, int, int]] = {
+        filepath: (lines, functions, methods, classes)
+        for (
+            filepath,
+            lines,
+            functions,
+            methods,
+            classes,
+        ) in discovery.cached_source_stats_by_file
+    }
+    failed_files: list[str] = []
+    source_read_failures: list[str] = []
+    root_str = str(boot.root)
+    processes = _resolve_process_count(boot.args.processes)
+    min_loc = int(boot.args.min_loc)
+    min_stmt = int(boot.args.min_stmt)
+    block_min_loc = int(boot.args.block_min_loc)
+    block_min_stmt = int(boot.args.block_min_stmt)
+    segment_min_loc = int(boot.args.segment_min_loc)
+    segment_min_stmt = int(boot.args.segment_min_stmt)
+
+    def _accept_result(result: FileProcessResult) -> None:
+        nonlocal files_analyzed
+        nonlocal files_skipped
+        nonlocal analyzed_lines
+        nonlocal analyzed_functions
+        nonlocal analyzed_methods
+        nonlocal analyzed_classes
+
+        if result.success and result.stat is not None:
+            source_stats_payload = SourceStatsDict(
+                lines=result.lines,
+                functions=result.functions,
+                methods=result.methods,
+                classes=result.classes,
+            )
+            structural_payload = (
+                result.structural_findings if collect_structural_findings else None
+            )
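+            # Older Cache.put_file_entry signatures predate the source_stats
+            # keyword; on that specific TypeError, retry without it.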
+            try:
+                cache.put_file_entry(
+                    result.filepath,
+                    result.stat,
+                    result.units or [],
+                    result.blocks or [],
+                    result.segments or [],
+                    source_stats=source_stats_payload,
+                    file_metrics=result.file_metrics,
+                    structural_findings=structural_payload,
+                )
+            except TypeError as exc:
+                if "source_stats" not in str(exc):
+                    raise
+                cache.put_file_entry(
+                    result.filepath,
+                    result.stat,
+                    result.units or [],
+                    result.blocks or [],
+                    result.segments or [],
+                    file_metrics=result.file_metrics,
+                    structural_findings=structural_payload,
+                )
+            files_analyzed += 1
+            analyzed_lines += result.lines
+            analyzed_functions += result.functions
+            analyzed_methods += result.methods
+            analyzed_classes += result.classes
+            source_stats_by_file[result.filepath] = (
+                result.lines,
+                result.functions,
+                result.methods,
+                result.classes,
+            )
+            if result.units:
+                all_units.extend(_unit_to_group_item(unit) for unit in result.units)
+            if result.blocks:
+                all_blocks.extend(
+                    _block_to_group_item(block) for block in result.blocks
+                )
+            if result.segments:
+                all_segments.extend(
+                    _segment_to_group_item(segment) for segment in result.segments
+                )
+            if result.structural_findings:
+                all_structural_findings.extend(result.structural_findings)
+            if not boot.args.skip_metrics and result.file_metrics is not None:
+                all_class_metrics.extend(result.file_metrics.class_metrics)
+                all_module_deps.extend(result.file_metrics.module_deps)
+                all_dead_candidates.extend(result.file_metrics.dead_candidates)
+                all_referenced_names.update(result.file_metrics.referenced_names)
+                all_referenced_qualnames.update(
+                    result.file_metrics.referenced_qualnames
+                )
+                all_security_surfaces.extend(result.file_metrics.security_surfaces)
+                if result.file_metrics.typing_coverage is not None:
+                    all_typing_modules.append(result.file_metrics.typing_coverage)
+                if result.file_metrics.docstring_coverage is not None:
+                    all_docstring_modules.append(result.file_metrics.docstring_coverage)
+                if result.file_metrics.api_surface is not None:
+                    all_api_modules.append(result.file_metrics.api_surface)
+            return
+
+        files_skipped += 1
+        failure = f"{result.filepath}: {result.error}"
+        failed_files.append(failure)
+        if result.error_kind == "source_read_error":
+            source_read_failures.append(failure)
+
+    def _run_sequential(files: Sequence[str]) -> None:
+        for filepath in files:
+            _accept_result(
+                _invoke_process_file(
+                    filepath,
+                    root_str,
+                    boot.config,
+                    min_loc,
+                    min_stmt,
+                    collect_structural_findings=collect_structural_findings,
+                    collect_api_surface=collect_api_surface,
+                    api_include_private_modules=api_include_private_modules,
+                    block_min_loc=block_min_loc,
+                    block_min_stmt=block_min_stmt,
+                    segment_min_loc=segment_min_loc,
+                    segment_min_stmt=segment_min_stmt,
+                )
+            )
+            if on_advance is not None:
+                on_advance()
+
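+    # Pool start-up can fail on restricted hosts (e.g. sandboxes that block
+    # semaphores); any OSError/RuntimeError/PermissionError falls back to
+    # in-process analysis.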
+    if _should_use_parallel(len(files_to_process), processes):
+        try:
+            with ProcessPoolExecutor(max_workers=processes) as executor:
+                for idx in range(0, len(files_to_process), batch_size):
+                    batch = files_to_process[idx : idx + batch_size]
+                    futures = [
+                        executor.submit(
+                            _invoke_process_file,
+                            filepath,
+                            root_str,
+                            boot.config,
+                            min_loc,
+                            min_stmt,
+                            collect_structural_findings=collect_structural_findings,
+                            collect_api_surface=collect_api_surface,
+                            api_include_private_modules=api_include_private_modules,
+                            block_min_loc=block_min_loc,
+                            block_min_stmt=block_min_stmt,
+                            segment_min_loc=segment_min_loc,
+                            segment_min_stmt=segment_min_stmt,
+                        )
+                        for filepath in batch
+                    ]
+                    future_to_path = {
+                        id(future): filepath
+                        for future, filepath in zip(futures, batch, strict=True)
+                    }
+                    for future in as_completed(futures):
+                        filepath = future_to_path[id(future)]
+                        try:
+                            _accept_result(future.result())
+                        except Exception as exc:  # pragma: no cover - worker crash
+                            files_skipped += 1
+                            failed_files.append(f"{filepath}: {exc}")
+                            if on_worker_error is not None:
+                                on_worker_error(str(exc))
+                        if on_advance is not None:
+                            on_advance()
+        except (OSError, RuntimeError, PermissionError) as exc:
+            if on_parallel_fallback is not None:
+                on_parallel_fallback(exc)
+            _run_sequential(files_to_process)
+    else:
+        _run_sequential(files_to_process)
+
+    return ProcessingResult(
+        units=tuple(sorted(all_units, key=_group_item_sort_key)),
+        blocks=tuple(sorted(all_blocks, key=_group_item_sort_key)),
+        segments=tuple(sorted(all_segments, key=_group_item_sort_key)),
+        class_metrics=tuple(sorted(all_class_metrics, key=_class_metric_sort_key)),
+        module_deps=tuple(sorted(all_module_deps, key=_module_dep_sort_key)),
+        dead_candidates=tuple(
+            sorted(all_dead_candidates, key=_dead_candidate_sort_key)
+        ),
+        referenced_names=frozenset(all_referenced_names),
+        security_surfaces=tuple(
+            sorted(
+                all_security_surfaces,
+                key=lambda item: (
+                    item.filepath,
+                    item.start_line,
+                    item.end_line,
+                    item.qualname,
+                    item.category,
+                    item.capability,
+                    item.evidence_symbol,
+                ),
+            )
+        ),
+        referenced_qualnames=frozenset(all_referenced_qualnames),
+        typing_modules=tuple(
+            sorted(all_typing_modules, key=lambda item: (item.filepath, item.module))
+        ),
+        docstring_modules=tuple(
+            sorted(all_docstring_modules, key=lambda item: (item.filepath, item.module))
+        ),
+        api_modules=tuple(
+            sorted(all_api_modules, key=lambda item: (item.filepath, item.module))
+        ),
+        files_analyzed=files_analyzed,
+        files_skipped=files_skipped,
+        analyzed_lines=analyzed_lines,
+        analyzed_functions=analyzed_functions,
+        analyzed_methods=analyzed_methods,
+        analyzed_classes=analyzed_classes,
+        failed_files=tuple(sorted(failed_files)),
+        source_read_failures=tuple(sorted(source_read_failures)),
+        structural_findings=tuple(all_structural_findings),
+        source_stats_by_file=tuple(
+            (filepath, *stats)
+            for filepath, stats in sorted(source_stats_by_file.items())
+        ),
+    )
diff --git a/codeclone/core/pipeline.py b/codeclone/core/pipeline.py
new file mode 100644
index 0000000..96abf1b
--- /dev/null
+++ b/codeclone/core/pipeline.py
@@ -0,0 +1,363 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
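+"""Aggregate per-file results into project metrics, findings and suggestions."""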
+
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ..contracts import DEFAULT_COVERAGE_MIN
+from ..findings.clones.golden_fixtures import (
+    build_suppressed_clone_groups,
+    split_clone_groups_for_golden_fixtures,
+)
+from ..findings.clones.grouping import (
+    build_block_groups,
+    build_groups,
+    build_segment_groups,
+)
+from ..findings.structural.detectors import (
+    build_clone_cohort_structural_findings,
+)
+from ..metrics._base import MetricProjectContext
+from ..metrics.coverage_join import CoverageJoinParseError, build_coverage_join
+from ..metrics.dead_code import find_suppressed_unused
+from ..metrics.registry import (
+    METRIC_FAMILIES,
+    build_project_metrics,
+    project_metrics_defaults,
+)
+from ..models import (
+    ClassMetrics,
+    CoverageJoinResult,
+    DeadCandidate,
+    DeadItem,
+    DepGraph,
+    GroupItemLike,
+    ModuleApiSurface,
+    ModuleDep,
+    ModuleDocstringCoverage,
+    ModuleTypingCoverage,
+    ProjectMetrics,
+    SecuritySurface,
+    StructuralFindingGroup,
+    Suggestion,
+)
+from ..report.blocks import prepare_block_report_groups
+from ..report.explain import build_block_group_facts
+from ..report.segments import prepare_segment_report_groups
+from ..report.suggestions import generate_suggestions
+from ._types import (
+    AnalysisResult,
+    BootstrapResult,
+    DiscoveryResult,
+    ProcessingResult,
+    _segment_groups_digest,
+    _should_collect_structural_findings,
+)
+from .bootstrap import _resolve_optional_runtime_path
+from .metrics_payload import build_metrics_report_payload
+
+
+def _artifact_dep_graph(value: object, default: DepGraph) -> DepGraph:
+    return value if isinstance(value, DepGraph) else default
+
+
+def _artifact_dead_items(
+    value: object,
+    default: tuple[DeadItem, ...],
+) -> tuple[DeadItem, ...]:
+    if isinstance(value, tuple) and all(isinstance(item, DeadItem) for item in value):
+        return value
+    return default
+
+
+def compute_project_metrics(
+    *,
+    units: Sequence[GroupItemLike],
+    class_metrics: Sequence[ClassMetrics],
+    module_deps: Sequence[ModuleDep],
+    dead_candidates: Sequence[DeadCandidate],
+    referenced_names: frozenset[str],
+    referenced_qualnames: frozenset[str],
+    security_surfaces: Sequence[SecuritySurface] = (),
+    typing_modules: Sequence[ModuleTypingCoverage] = (),
+    docstring_modules: Sequence[ModuleDocstringCoverage] = (),
+    api_modules: Sequence[ModuleApiSurface] = (),
+    files_found: int,
+    files_analyzed_or_cached: int,
+    function_clone_groups: int,
+    block_clone_groups: int,
+    skip_dependencies: bool,
+    skip_dead_code: bool,
+) -> tuple[ProjectMetrics, DepGraph, tuple[DeadItem, ...]]:
+    context = MetricProjectContext(
+        units=tuple(units),
+        class_metrics=tuple(class_metrics),
+        module_deps=tuple(module_deps),
+        dead_candidates=tuple(dead_candidates),
+        referenced_names=referenced_names,
+        referenced_qualnames=referenced_qualnames,
+        security_surfaces=tuple(security_surfaces),
+        typing_modules=tuple(typing_modules),
+        docstring_modules=tuple(docstring_modules),
+        api_modules=tuple(api_modules),
+        files_found=files_found,
+        files_analyzed_or_cached=files_analyzed_or_cached,
+        function_clone_groups=function_clone_groups,
+        block_clone_groups=block_clone_groups,
+        skip_dependencies=skip_dependencies,
+        skip_dead_code=skip_dead_code,
+    )
+    project_fields = project_metrics_defaults()
+    dep_graph = DepGraph(
+        modules=frozenset(),
+        edges=(),
+        cycles=(),
+        max_depth=0,
+        avg_depth=0.0,
+        p95_depth=0,
+        longest_chains=(),
+    )
+    dead_items: tuple[DeadItem, ...] = ()
+    for family in METRIC_FAMILIES.values():
+        aggregate = family.aggregate([family.compute(context)])
+        project_fields.update(aggregate.project_fields)
+        dep_graph = _artifact_dep_graph(aggregate.artifacts.get("dep_graph"), dep_graph)
+        dead_items = _artifact_dead_items(
+            aggregate.artifacts.get("dead_items"),
+            dead_items,
+        )
+    return build_project_metrics(project_fields), dep_graph, dead_items
+
+
+def compute_suggestions(
+    *,
+    project_metrics: ProjectMetrics,
+    units: Sequence[GroupItemLike],
+    class_metrics: Sequence[ClassMetrics],
+    func_groups: Mapping[str, Sequence[GroupItemLike]],
+    block_groups: Mapping[str, Sequence[GroupItemLike]],
+    segment_groups: Mapping[str, Sequence[GroupItemLike]],
+    block_group_facts: Mapping[str, Mapping[str, str]] | None = None,
+    structural_findings: Sequence[StructuralFindingGroup] | None = None,
+    scan_root: str = "",
+) -> tuple[Suggestion, ...]:
+    return generate_suggestions(
+        project_metrics=project_metrics,
+        units=units,
+        class_metrics=class_metrics,
+        func_groups=func_groups,
+        block_groups=block_groups,
+        segment_groups=segment_groups,
+        block_group_facts=block_group_facts,
+        structural_findings=structural_findings,
+        scan_root=scan_root,
+    )
+
+
+def analyze(
+    *,
+    boot: BootstrapResult,
+    discovery: DiscoveryResult,
+    processing: ProcessingResult,
+) -> AnalysisResult:
+    golden_fixture_paths = tuple(
+        str(pattern).strip()
+        for pattern in getattr(boot.args, "golden_fixture_paths", ())
+        if str(pattern).strip()
+    )
+    func_split = split_clone_groups_for_golden_fixtures(
+        groups=build_groups(processing.units),
+        kind="function",
+        golden_fixture_paths=golden_fixture_paths,
+        scan_root=str(boot.root),
+    )
+    block_split = split_clone_groups_for_golden_fixtures(
+        groups=build_block_groups(processing.blocks),
+        kind="block",
+        golden_fixture_paths=golden_fixture_paths,
+        scan_root=str(boot.root),
+    )
+    segment_split = split_clone_groups_for_golden_fixtures(
+        groups=build_segment_groups(processing.segments),
+        kind="segment",
+        golden_fixture_paths=golden_fixture_paths,
+        scan_root=str(boot.root),
+    )
+
+    func_groups = func_split.active_groups
+    block_groups = block_split.active_groups
+    segment_groups_raw = segment_split.active_groups
+    segment_groups_raw_digest = _segment_groups_digest(segment_groups_raw)
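+    # Reuse the cached segment projection only when its digest matches the
+    # freshly built raw groups, so a stale cache can never shape the report.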
+    cached_projection = discovery.cached_segment_report_projection
+    if (
+        cached_projection is not None
+        and cached_projection.get("digest") == segment_groups_raw_digest
+    ):
+        projection_groups = cached_projection.get("groups", {})
+        segment_groups = {
+            group_key: [
+                {
+                    "segment_hash": str(item["segment_hash"]),
+                    "segment_sig": str(item["segment_sig"]),
+                    "filepath": str(item["filepath"]),
+                    "qualname": str(item["qualname"]),
+                    "start_line": int(item["start_line"]),
+                    "end_line": int(item["end_line"]),
+                    "size": int(item["size"]),
+                }
+                for item in projection_groups[group_key]
+            ]
+            for group_key in sorted(projection_groups)
+        }
+        suppressed_segment_groups = int(cached_projection.get("suppressed", 0))
+    else:
+        segment_groups, suppressed_segment_groups = prepare_segment_report_groups(
+            segment_groups_raw
+        )
+
+    block_groups_report = prepare_block_report_groups(block_groups)
+    suppressed_block_groups_report = prepare_block_report_groups(
+        block_split.suppressed_groups
+    )
+    if segment_split.suppressed_groups:
+        suppressed_segment_groups_report, _ = prepare_segment_report_groups(
+            segment_split.suppressed_groups
+        )
+    else:
+        suppressed_segment_groups_report = {}
+    suppressed_clone_groups = (
+        *build_suppressed_clone_groups(
+            kind="function",
+            groups=func_split.suppressed_groups,
+            matched_patterns=func_split.matched_patterns,
+        ),
+        *build_suppressed_clone_groups(
+            kind="block",
+            groups=suppressed_block_groups_report,
+            matched_patterns=block_split.matched_patterns,
+        ),
+        *build_suppressed_clone_groups(
+            kind="segment",
+            groups=suppressed_segment_groups_report,
+            matched_patterns=segment_split.matched_patterns,
+        ),
+    )
+    block_group_facts = build_block_group_facts(
+        {**block_groups_report, **suppressed_block_groups_report}
+    )
+
+    func_clones_count = len(func_groups)
+    block_clones_count = len(block_groups)
+    segment_clones_count = len(segment_groups)
+    files_analyzed_or_cached = processing.files_analyzed + discovery.cache_hits
+
+    project_metrics: ProjectMetrics | None = None
+    metrics_payload: dict[str, object] | None = None
+    suggestions: tuple[Suggestion, ...] = ()
+    suppressed_dead_items: tuple[DeadItem, ...] = ()
+    coverage_join: CoverageJoinResult | None = None
+    cohort_structural_findings: tuple[StructuralFindingGroup, ...] = ()
+    if _should_collect_structural_findings(boot.output_paths):
+        cohort_structural_findings = build_clone_cohort_structural_findings(
+            func_groups=func_groups
+        )
+    combined_structural_findings = (
+        *processing.structural_findings,
+        *cohort_structural_findings,
+    )
+    if not boot.args.skip_metrics:
+        project_metrics, dep_graph, _ = compute_project_metrics(
+            units=processing.units,
+            class_metrics=processing.class_metrics,
+            module_deps=processing.module_deps,
+            dead_candidates=processing.dead_candidates,
+            referenced_names=processing.referenced_names,
+            referenced_qualnames=processing.referenced_qualnames,
+            security_surfaces=processing.security_surfaces,
+            typing_modules=processing.typing_modules,
+            docstring_modules=processing.docstring_modules,
+            api_modules=processing.api_modules,
+            files_found=discovery.files_found,
+            files_analyzed_or_cached=files_analyzed_or_cached,
+            function_clone_groups=func_clones_count,
+            block_clone_groups=block_clones_count,
+            skip_dependencies=boot.args.skip_dependencies,
+            skip_dead_code=boot.args.skip_dead_code,
+        )
+        if not boot.args.skip_dead_code:
+            suppressed_dead_items = find_suppressed_unused(
+                definitions=tuple(processing.dead_candidates),
+                referenced_names=processing.referenced_names,
+                referenced_qualnames=processing.referenced_qualnames,
+            )
+        suggestions = compute_suggestions(
+            project_metrics=project_metrics,
+            units=processing.units,
+            class_metrics=processing.class_metrics,
+            func_groups=func_groups,
+            block_groups=block_groups_report,
+            segment_groups=segment_groups,
+            block_group_facts=block_group_facts,
+            structural_findings=combined_structural_findings,
+            scan_root=str(boot.root),
+        )
+        coverage_xml_path = _resolve_optional_runtime_path(
+            getattr(boot.args, "coverage_xml", None),
+            root=boot.root,
+        )
+        if coverage_xml_path is not None:
+            try:
+                coverage_join = build_coverage_join(
+                    coverage_xml=coverage_xml_path,
+                    root_path=boot.root,
+                    units=processing.units,
+                    hotspot_threshold_percent=int(
+                        getattr(boot.args, "coverage_min", DEFAULT_COVERAGE_MIN)
+                    ),
+                )
+            except CoverageJoinParseError as exc:
+                coverage_join = CoverageJoinResult(
+                    coverage_xml=str(coverage_xml_path),
+                    status="invalid",
+                    hotspot_threshold_percent=int(
+                        getattr(boot.args, "coverage_min", DEFAULT_COVERAGE_MIN)
+                    ),
+                    invalid_reason=str(exc),
+                )
+        metrics_payload = build_metrics_report_payload(
+            scan_root=str(boot.root),
+            project_metrics=project_metrics,
+            dep_graph=dep_graph,
+            coverage_join=coverage_join,
+            units=processing.units,
+            class_metrics=processing.class_metrics,
+            module_deps=processing.module_deps,
+            security_surfaces=processing.security_surfaces,
+            source_stats_by_file=processing.source_stats_by_file,
+            suppressed_dead_code=suppressed_dead_items,
+        )
+
+    return AnalysisResult(
+        func_groups=func_groups,
+        block_groups=block_groups,
+        block_groups_report=block_groups_report,
+        segment_groups=segment_groups,
+        suppressed_clone_groups=tuple(suppressed_clone_groups),
+        suppressed_segment_groups=suppressed_segment_groups,
+        block_group_facts=block_group_facts,
+        func_clones_count=func_clones_count,
+        block_clones_count=block_clones_count,
+        segment_clones_count=segment_clones_count,
+        files_analyzed_or_cached=files_analyzed_or_cached,
+        project_metrics=project_metrics,
+        metrics_payload=metrics_payload,
+        suggestions=suggestions,
+        segment_groups_raw_digest=segment_groups_raw_digest,
+        coverage_join=coverage_join,
+        suppressed_dead_code_items=len(suppressed_dead_items),
+        structural_findings=combined_structural_findings,
+    )
diff --git a/codeclone/core/reporting.py b/codeclone/core/reporting.py
new file mode 100644
index 0000000..d43683f
--- /dev/null
+++ b/codeclone/core/reporting.py
@@ -0,0 +1,267 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
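+"""Render report artifacts and evaluate CI gates from analysis results."""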
+
+from __future__ import annotations
+
+from collections.abc import Callable, Collection, Mapping
+
+from ..contracts import DEFAULT_COVERAGE_MIN
+from ..models import MetricsDiff
+from ..report.gates.evaluator import GateResult, GateState
+from ..report.gates.evaluator import MetricGateConfig as _MetricGateConfig
+from ..report.gates.evaluator import evaluate_gate_state as _evaluate_gate_state
+from ..report.gates.evaluator import (
+    gate_state_from_project_metrics as _gate_state_from_metrics,
+)
+from ..report.renderers.json import render_json_report_document
+from ..report.renderers.text import render_text_report_document
+from ._types import (
+    AnalysisResult,
+    BootstrapResult,
+    DiscoveryResult,
+    ProcessingResult,
+    ReportArtifacts,
+)
+from .metrics_payload import _enrich_metrics_report_payload
+
+MetricGateConfig = _MetricGateConfig
+GatingResult = GateResult
+
+
+def _coerce_metrics_diff(value: object | None) -> MetricsDiff | None:
+    return value if isinstance(value, MetricsDiff) else None
+
+
+def _load_markdown_report_renderer() -> Callable[..., str]:
+    from ..report.renderers.markdown import to_markdown_report
+
+    return to_markdown_report
+
+
+def _load_sarif_report_renderer() -> Callable[..., str]:
+    from ..report.renderers.sarif import to_sarif_report
+
+    return to_sarif_report
+
+
+def _load_report_document_builder() -> Callable[..., dict[str, object]]:
+    from ..report.document.builder import build_report_document
+
+    return build_report_document
+
+
+def report(
+    *,
+    boot: BootstrapResult,
+    discovery: DiscoveryResult,
+    processing: ProcessingResult,
+    analysis: AnalysisResult,
+    report_meta: Mapping[str, object],
+    new_func: Collection[str],
+    new_block: Collection[str],
+    html_builder: Callable[..., str] | None = None,
+    metrics_diff: object | None = None,
+    coverage_adoption_diff_available: bool = False,
+    api_surface_diff_available: bool = False,
+    include_report_document: bool = False,
+) -> ReportArtifacts:
+    contents: dict[str, str | None] = {
+        "html": None,
+        "json": None,
+        "md": None,
+        "sarif": None,
+        "text": None,
+    }
+    structural_findings = (
+        analysis.structural_findings if analysis.structural_findings else None
+    )
+    report_inventory = {
+        "files": {
+            "total_found": discovery.files_found,
+            "analyzed": processing.files_analyzed,
+            "cached": discovery.cache_hits,
+            "skipped": processing.files_skipped,
+            "source_io_skipped": len(processing.source_read_failures),
+        },
+        "code": {
+            "parsed_lines": processing.analyzed_lines + discovery.cached_lines,
+            "functions": processing.analyzed_functions + discovery.cached_functions,
+            "methods": processing.analyzed_methods + discovery.cached_methods,
+            "classes": processing.analyzed_classes + discovery.cached_classes,
+        },
+        "file_list": list(discovery.all_file_paths),
+    }
+    report_document: dict[str, object] | None = None
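+    # Every text-like renderer (JSON, Markdown, SARIF, plain text) projects
+    # from one shared report document, so it is built at most once per run.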
+    needs_report_document = (
+        include_report_document
+        or boot.output_paths.html is not None
+        or any(
+            path is not None
+            for path in (
+                boot.output_paths.json,
+                boot.output_paths.md,
+                boot.output_paths.sarif,
+                boot.output_paths.text,
+            )
+        )
+    )
+    if needs_report_document:
+        build_report_document = _load_report_document_builder()
+        validated_metrics_diff = _coerce_metrics_diff(metrics_diff)
+        metrics_for_report = (
+            _enrich_metrics_report_payload(
+                metrics_payload=analysis.metrics_payload,
+                metrics_diff=validated_metrics_diff,
+                coverage_adoption_diff_available=coverage_adoption_diff_available,
+                api_surface_diff_available=api_surface_diff_available,
+            )
+            if analysis.metrics_payload is not None
+            else None
+        )
+        report_document = build_report_document(
+            func_groups=analysis.func_groups,
+            block_groups=analysis.block_groups_report,
+            segment_groups=analysis.segment_groups,
+            suppressed_clone_groups=analysis.suppressed_clone_groups,
+            meta=report_meta,
+            inventory=report_inventory,
+            block_facts=analysis.block_group_facts,
+            new_function_group_keys=new_func,
+            new_block_group_keys=new_block,
+            new_segment_group_keys=set(analysis.segment_groups.keys()),
+            metrics=metrics_for_report,
+            suggestions=analysis.suggestions,
+            structural_findings=structural_findings,
+        )
+
+    if boot.output_paths.html and html_builder is not None:
+        validated_metrics_diff = _coerce_metrics_diff(metrics_diff)
+        metrics_for_html = (
+            _enrich_metrics_report_payload(
+                metrics_payload=analysis.metrics_payload,
+                metrics_diff=validated_metrics_diff,
+                coverage_adoption_diff_available=coverage_adoption_diff_available,
+                api_surface_diff_available=api_surface_diff_available,
+            )
+            if analysis.metrics_payload is not None
+            else None
+        )
+        contents["html"] = html_builder(
+            func_groups=analysis.func_groups,
+            block_groups=analysis.block_groups_report,
+            segment_groups=analysis.segment_groups,
+            block_group_facts=analysis.block_group_facts,
+            new_function_group_keys=new_func,
+            new_block_group_keys=new_block,
+            report_meta=report_meta,
+            metrics=metrics_for_html,
+            suggestions=analysis.suggestions,
+            structural_findings=structural_findings,
+            report_document=report_document,
+            metrics_diff=metrics_diff,
+            title="CodeClone Report",
+            context_lines=3,
+            max_snippet_lines=220,
+        )
+
+    if any(
+        path is not None
+        for path in (
+            boot.output_paths.json,
+            boot.output_paths.md,
+            boot.output_paths.sarif,
+            boot.output_paths.text,
+        )
+    ):
+        assert report_document is not None
+
+    if boot.output_paths.json and report_document is not None:
+        contents["json"] = render_json_report_document(report_document)
+
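+    # Markdown and SARIF renderers are imported lazily and share one kwarg
+    # projection over the report document.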
+    def _render_projection_artifact(renderer: Callable[..., str]) -> str:
+        assert report_document is not None
+        return renderer(
+            report_document=report_document,
+            meta=report_meta,
+            inventory=report_inventory,
+            func_groups=analysis.func_groups,
+            block_groups=analysis.block_groups_report,
+            segment_groups=analysis.segment_groups,
+            block_facts=analysis.block_group_facts,
+            new_function_group_keys=new_func,
+            new_block_group_keys=new_block,
+            new_segment_group_keys=set(analysis.segment_groups.keys()),
+            metrics=analysis.metrics_payload,
+            suggestions=analysis.suggestions,
+            structural_findings=structural_findings,
+        )
+
+    for key, output_path, loader in (
+        ("md", boot.output_paths.md, _load_markdown_report_renderer),
+        ("sarif", boot.output_paths.sarif, _load_sarif_report_renderer),
+    ):
+        if output_path and report_document is not None:
+            contents[key] = _render_projection_artifact(loader())
+
+    if boot.output_paths.text and report_document is not None:
+        contents["text"] = render_text_report_document(report_document)
+
+    return ReportArtifacts(
+        html=contents["html"],
+        json=contents["json"],
+        md=contents["md"],
+        sarif=contents["sarif"],
+        text=contents["text"],
+        report_document=report_document,
+    )
+
+
+def gate(
+    *,
+    boot: BootstrapResult,
+    analysis: AnalysisResult,
+    new_func: Collection[str],
+    new_block: Collection[str],
+    metrics_diff: MetricsDiff | None,
+) -> GatingResult:
+    config = MetricGateConfig(
+        fail_complexity=boot.args.fail_complexity,
+        fail_coupling=boot.args.fail_coupling,
+        fail_cohesion=boot.args.fail_cohesion,
+        fail_cycles=boot.args.fail_cycles,
+        fail_dead_code=boot.args.fail_dead_code,
+        fail_health=boot.args.fail_health,
+        fail_on_new_metrics=boot.args.fail_on_new_metrics,
+        fail_on_typing_regression=bool(
+            getattr(boot.args, "fail_on_typing_regression", False)
+        ),
+        fail_on_docstring_regression=bool(
+            getattr(boot.args, "fail_on_docstring_regression", False)
+        ),
+        fail_on_api_break=bool(getattr(boot.args, "fail_on_api_break", False)),
+        fail_on_untested_hotspots=bool(
+            getattr(boot.args, "fail_on_untested_hotspots", False)
+        ),
+        min_typing_coverage=int(getattr(boot.args, "min_typing_coverage", -1)),
+        min_docstring_coverage=int(getattr(boot.args, "min_docstring_coverage", -1)),
+        coverage_min=int(getattr(boot.args, "coverage_min", DEFAULT_COVERAGE_MIN)),
+        fail_on_new=bool(getattr(boot.args, "fail_on_new", False)),
+        fail_threshold=int(getattr(boot.args, "fail_threshold", -1)),
+    )
+    clone_new_count = len(tuple(new_func)) + len(tuple(new_block))
+    clone_total = analysis.func_clones_count + analysis.block_clones_count
+    if analysis.project_metrics is None:
+        state = GateState(clone_new_count=clone_new_count, clone_total=clone_total)
+    else:
+        state = _gate_state_from_metrics(
+            project_metrics=analysis.project_metrics,
+            coverage_join=analysis.coverage_join,
+            metrics_diff=metrics_diff,
+            clone_new_count=clone_new_count,
+            clone_total=clone_total,
+        )
+    result = _evaluate_gate_state(state=state, config=config)
+    return GatingResult(exit_code=result.exit_code, reasons=result.reasons)
diff --git a/codeclone/core/security_surfaces_payload.py b/codeclone/core/security_surfaces_payload.py
new file mode 100644
index 0000000..40d2a16
--- /dev/null
+++ b/codeclone/core/security_surfaces_payload.py
@@ -0,0 +1,104 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
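+"""Report-only payload summarizing detected security-relevant call surfaces."""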
+
+from __future__ import annotations
+
+from collections import Counter
+from collections.abc import Sequence
+
+from ..domain.source_scope import SOURCE_KIND_BREAKDOWN_KEYS
+from ..models import SecuritySurface
+from ..paths import classify_source_kind
+
+
+def _security_surface_source_kind(
+    surface: SecuritySurface,
+    *,
+    scan_root: str,
+) -> str:
+    return classify_source_kind(surface.filepath, scan_root=scan_root)
+
+
+def _security_surface_sort_key(
+    surface: SecuritySurface,
+    *,
+    scan_root: str,
+) -> tuple[str, int, int, str, str, str, str]:
+    source_kind = _security_surface_source_kind(surface, scan_root=scan_root)
+    return (
+        source_kind,
+        surface.start_line,
+        surface.end_line,
+        surface.filepath,
+        surface.qualname,
+        surface.category,
+        surface.capability,
+    )
+
+
+def build_security_surfaces_payload(
+    *,
+    scan_root: str,
+    surfaces: Sequence[SecuritySurface],
+) -> dict[str, object]:
+    sorted_surfaces = tuple(
+        sorted(
+            surfaces,
+            key=lambda surface: _security_surface_sort_key(
+                surface,
+                scan_root=scan_root,
+            ),
+        )
+    )
+    category_counts = Counter(surface.category for surface in sorted_surfaces)
+    source_kind_counts = Counter(
+        _security_surface_source_kind(surface, scan_root=scan_root)
+        for surface in sorted_surfaces
+    )
+    return {
+        "summary": {
+            "items": len(sorted_surfaces),
+            "modules": len({surface.module for surface in sorted_surfaces}),
+            "exact_items": len(sorted_surfaces),
+            "category_count": len(category_counts),
+            "categories": {
+                category: category_counts[category]
+                for category in sorted(category_counts)
+            },
+            "by_source_kind": {
+                kind: source_kind_counts.get(kind, 0)
+                for kind in SOURCE_KIND_BREAKDOWN_KEYS
+            },
+            "production": source_kind_counts.get("production", 0),
+            "tests": source_kind_counts.get("tests", 0),
+            "fixtures": source_kind_counts.get("fixtures", 0),
+            "other": source_kind_counts.get("other", 0),
+            "report_only": True,
+        },
+        "items": [
+            {
+                "category": surface.category,
+                "capability": surface.capability,
+                "module": surface.module,
+                "filepath": surface.filepath,
+                "qualname": surface.qualname,
+                "start_line": surface.start_line,
+                "end_line": surface.end_line,
+                "source_kind": _security_surface_source_kind(
+                    surface,
+                    scan_root=scan_root,
+                ),
+                "location_scope": surface.location_scope,
+                "classification_mode": surface.classification_mode,
+                "evidence_kind": surface.evidence_kind,
+                "evidence_symbol": surface.evidence_symbol,
+            }
+            for surface in sorted_surfaces
+        ],
+    }
+
+
+__all__ = ["build_security_surfaces_payload"]
diff --git a/codeclone/core/worker.py b/codeclone/core/worker.py
new file mode 100644
index 0000000..4cbd52c
--- /dev/null
+++ b/codeclone/core/worker.py
@@ -0,0 +1,171 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
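+"""Per-file analysis worker run inside a subprocess; _invoke_process_file
+tolerates process_file being replaced with a narrower signature."""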
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import inspect +import os +from collections.abc import Callable +from pathlib import Path + +from ..analysis.normalizer import NormalizationConfig +from ..analysis.units import extract_units_and_stats_from_source +from ..cache.entries import FileStat +from ..contracts import ( + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, +) +from ..scanner import module_name_from_path +from ._types import MAX_FILE_SIZE, FileProcessResult + + +def process_file( + filepath: str, + root: str, + cfg: NormalizationConfig, + min_loc: int, + min_stmt: int, + collect_structural_findings: bool = True, + collect_api_surface: bool = False, + api_include_private_modules: bool = False, + block_min_loc: int = DEFAULT_BLOCK_MIN_LOC, + block_min_stmt: int = DEFAULT_BLOCK_MIN_STMT, + segment_min_loc: int = DEFAULT_SEGMENT_MIN_LOC, + segment_min_stmt: int = DEFAULT_SEGMENT_MIN_STMT, +) -> FileProcessResult: + """Extract clone units, blocks, segments, and file metrics from one file; failures are reported via FileProcessResult, never raised.""" + try: + try: + stat_result = os.stat(filepath) + if stat_result.st_size > MAX_FILE_SIZE: + return FileProcessResult( + filepath=filepath, + success=False, + error=( + f"File too large: {stat_result.st_size} bytes " + f"(max {MAX_FILE_SIZE})" + ), + error_kind="file_too_large", + ) + except OSError as exc: + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Cannot stat file: {exc}", + error_kind="stat_error", + ) + stat: FileStat = { + "mtime_ns": stat_result.st_mtime_ns, + "size": stat_result.st_size, + } + try: + source = Path(filepath).read_text("utf-8") + except UnicodeDecodeError as exc: + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Encoding error: {exc}", + error_kind="source_read_error", + ) + except OSError as exc: + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Cannot read file: {exc}", + error_kind="source_read_error", + ) + module_name = module_name_from_path(root, filepath) + units, blocks, segments, source_stats, file_metrics, structural_findings = ( + extract_units_and_stats_from_source( + source=source, + filepath=filepath, + module_name=module_name, + cfg=cfg, + min_loc=min_loc, + min_stmt=min_stmt, + block_min_loc=block_min_loc, + block_min_stmt=block_min_stmt, + segment_min_loc=segment_min_loc, + segment_min_stmt=segment_min_stmt, + collect_structural_findings=collect_structural_findings, + collect_api_surface=collect_api_surface, + api_include_private_modules=api_include_private_modules, + ) + ) + return FileProcessResult( + filepath=filepath, + success=True, + units=units, + blocks=blocks, + segments=segments, + lines=source_stats.lines, + functions=source_stats.functions, + methods=source_stats.methods, + classes=source_stats.classes, + stat=stat, + file_metrics=file_metrics, + structural_findings=structural_findings, + ) + except Exception as exc: # pragma: no cover - defensive shell around workers + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Unexpected error: {type(exc).__name__}: {exc}", + error_kind="unexpected_error", + ) + + +def _invoke_process_file( + filepath: str, + root: str, + cfg: NormalizationConfig, + min_loc: int, + min_stmt: int, + *, + collect_structural_findings: bool, + collect_api_surface: bool, + api_include_private_modules: bool, + block_min_loc: int, + block_min_stmt: int, + segment_min_loc: int, + segment_min_stmt: int, +) -> FileProcessResult: + optional_kwargs: dict[str, object] = { + 
"collect_structural_findings": collect_structural_findings, + "collect_api_surface": collect_api_surface, + "api_include_private_modules": api_include_private_modules, + "block_min_loc": block_min_loc, + "block_min_stmt": block_min_stmt, + "segment_min_loc": segment_min_loc, + "segment_min_stmt": segment_min_stmt, + } + try: + signature = inspect.signature(process_file) + except (TypeError, ValueError): + supported_kwargs = optional_kwargs + else: + parameters = tuple(signature.parameters.values()) + if any( + parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in parameters + ): + supported_kwargs = optional_kwargs + else: + supported_names = {parameter.name for parameter in parameters} + supported_kwargs = { + key: value + for key, value in optional_kwargs.items() + if key in supported_names + } + process_callable: Callable[..., FileProcessResult] = process_file + return process_callable( + filepath, + root, + cfg, + min_loc, + min_stmt, + **supported_kwargs, + ) diff --git a/codeclone/domain/__init__.py b/codeclone/domain/__init__.py index 61cd04f..9135843 100644 --- a/codeclone/domain/__init__.py +++ b/codeclone/domain/__init__.py @@ -3,135 +3,3 @@ # file, You can obtain one at https://mozilla.org/MPL/2.0/. # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy - -from .findings import ( - CATEGORY_CLONE, - CATEGORY_COHESION, - CATEGORY_COMPLEXITY, - CATEGORY_COUPLING, - CATEGORY_DEAD_CODE, - CATEGORY_DEPENDENCY, - CATEGORY_STRUCTURAL, - CLONE_KIND_BLOCK, - CLONE_KIND_FUNCTION, - CLONE_KIND_SEGMENT, - CLONE_NOVELTY_KNOWN, - CLONE_NOVELTY_NEW, - FAMILY_CLONE, - FAMILY_CLONES, - FAMILY_DEAD_CODE, - FAMILY_DESIGN, - FAMILY_METRICS, - FAMILY_STRUCTURAL, - FINDING_KIND_CLASS_HOTSPOT, - FINDING_KIND_CLONE_GROUP, - FINDING_KIND_CYCLE, - FINDING_KIND_FUNCTION_HOTSPOT, - FINDING_KIND_UNUSED_SYMBOL, - STRUCTURAL_KIND_CLONE_COHORT_DRIFT, - STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE, - STRUCTURAL_KIND_DUPLICATED_BRANCHES, - SYMBOL_KIND_CLASS, - SYMBOL_KIND_FUNCTION, - SYMBOL_KIND_IMPORT, - SYMBOL_KIND_METHOD, -) -from .quality import ( - CONFIDENCE_HIGH, - CONFIDENCE_LOW, - CONFIDENCE_MEDIUM, - EFFORT_EASY, - EFFORT_HARD, - EFFORT_MODERATE, - EFFORT_WEIGHT, - HEALTH_GRADE_A, - HEALTH_GRADE_B, - HEALTH_GRADE_C, - HEALTH_GRADE_D, - HEALTH_GRADE_F, - HEALTH_GRADES, - RISK_HIGH, - RISK_LOW, - RISK_MEDIUM, - SEVERITY_CRITICAL, - SEVERITY_INFO, - SEVERITY_ORDER, - SEVERITY_RANK, - SEVERITY_WARNING, -) -from .source_scope import ( - IMPACT_SCOPE_MIXED, - IMPACT_SCOPE_NON_RUNTIME, - IMPACT_SCOPE_RUNTIME, - SOURCE_KIND_BREAKDOWN_KEYS, - SOURCE_KIND_FIXTURES, - SOURCE_KIND_MIXED, - SOURCE_KIND_ORDER, - SOURCE_KIND_OTHER, - SOURCE_KIND_PRODUCTION, - SOURCE_KIND_TESTS, -) - -__all__ = [ - "CATEGORY_CLONE", - "CATEGORY_COHESION", - "CATEGORY_COMPLEXITY", - "CATEGORY_COUPLING", - "CATEGORY_DEAD_CODE", - "CATEGORY_DEPENDENCY", - "CATEGORY_STRUCTURAL", - "CLONE_KIND_BLOCK", - "CLONE_KIND_FUNCTION", - "CLONE_KIND_SEGMENT", - "CLONE_NOVELTY_KNOWN", - "CLONE_NOVELTY_NEW", - "CONFIDENCE_HIGH", - "CONFIDENCE_LOW", - "CONFIDENCE_MEDIUM", - "EFFORT_EASY", - "EFFORT_HARD", - "EFFORT_MODERATE", - "EFFORT_WEIGHT", - "FAMILY_CLONE", - "FAMILY_CLONES", - "FAMILY_DEAD_CODE", - "FAMILY_DESIGN", - "FAMILY_METRICS", - "FAMILY_STRUCTURAL", - "FINDING_KIND_CLASS_HOTSPOT", - "FINDING_KIND_CLONE_GROUP", - "FINDING_KIND_CYCLE", - "FINDING_KIND_FUNCTION_HOTSPOT", - "FINDING_KIND_UNUSED_SYMBOL", - "HEALTH_GRADES", - "HEALTH_GRADE_A", - "HEALTH_GRADE_B", - "HEALTH_GRADE_C", - "HEALTH_GRADE_D", - 
"HEALTH_GRADE_F", - "IMPACT_SCOPE_MIXED", - "IMPACT_SCOPE_NON_RUNTIME", - "IMPACT_SCOPE_RUNTIME", - "RISK_HIGH", - "RISK_LOW", - "RISK_MEDIUM", - "SEVERITY_CRITICAL", - "SEVERITY_INFO", - "SEVERITY_ORDER", - "SEVERITY_RANK", - "SEVERITY_WARNING", - "SOURCE_KIND_BREAKDOWN_KEYS", - "SOURCE_KIND_FIXTURES", - "SOURCE_KIND_MIXED", - "SOURCE_KIND_ORDER", - "SOURCE_KIND_OTHER", - "SOURCE_KIND_PRODUCTION", - "SOURCE_KIND_TESTS", - "STRUCTURAL_KIND_CLONE_COHORT_DRIFT", - "STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE", - "STRUCTURAL_KIND_DUPLICATED_BRANCHES", - "SYMBOL_KIND_CLASS", - "SYMBOL_KIND_FUNCTION", - "SYMBOL_KIND_IMPORT", - "SYMBOL_KIND_METHOD", -] diff --git a/codeclone/extractor.py b/codeclone/extractor.py deleted file mode 100644 index bacbef4..0000000 --- a/codeclone/extractor.py +++ /dev/null @@ -1,1149 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import ast -import io -import math -import os -import signal -import tokenize -from contextlib import contextmanager -from dataclasses import dataclass, field -from hashlib import sha1 as _sha1 -from typing import TYPE_CHECKING, Literal, NamedTuple - -from . import qualnames as _qualnames -from .blocks import extract_blocks, extract_segments -from .cfg import CFGBuilder -from .errors import ParseError -from .fingerprint import bucket_loc, sha1 -from .metrics import ( - cohesion_risk, - compute_cbo, - compute_lcom4, - coupling_risk, - cyclomatic_complexity, - risk_level, -) -from .metrics.adoption import collect_module_adoption -from .metrics.api_surface import collect_module_api_surface -from .models import ( - BlockUnit, - ClassMetrics, - DeadCandidate, - FileMetrics, - ModuleDep, - SegmentUnit, - SourceStats, - StructuralFindingGroup, - Unit, -) -from .normalize import ( - AstNormalizer, - NormalizationConfig, - normalized_ast_dump_from_list, - stmt_hashes, -) -from .paths import is_test_filepath -from .structural_findings import scan_function_structure -from .suppressions import ( - DeclarationTarget, - bind_suppressions_to_declarations, - build_suppression_index, - extract_suppression_directives, - suppression_target_key, -) - -if TYPE_CHECKING: - from collections.abc import Iterator, Mapping - - from .suppressions import SuppressionTargetKey - -__all__ = [ - "Unit", - "extract_units_and_stats_from_source", -] - -# ========================= -# Helpers -# ========================= - -PARSE_TIMEOUT_SECONDS = 5 - - -class _ParseTimeoutError(Exception): - pass - - -# Any named declaration: function, async function, or class. -_NamedDeclarationNode = _qualnames.FunctionNode | ast.ClassDef -# Unique key for a declaration's token index: (start_line, end_line, qualname). 
-_DeclarationTokenIndexKey = tuple[int, int, str] -_DECLARATION_TOKEN_STRINGS = frozenset({"def", "async", "class"}) - - -def _consumed_cpu_seconds(resource_module: object) -> float: - """Return consumed CPU seconds for the current process.""" - try: - usage = resource_module.getrusage( # type: ignore[attr-defined] - resource_module.RUSAGE_SELF # type: ignore[attr-defined] - ) - return float(usage.ru_utime) + float(usage.ru_stime) - except Exception: - return 0.0 - - -@contextmanager -def _parse_limits(timeout_s: int) -> Iterator[None]: - if os.name != "posix" or timeout_s <= 0: - yield - return - - old_handler = signal.getsignal(signal.SIGALRM) - - def _timeout_handler(_signum: int, _frame: object) -> None: - raise _ParseTimeoutError("AST parsing timeout") - - old_limits: tuple[int, int] | None = None - try: - signal.signal(signal.SIGALRM, _timeout_handler) - signal.setitimer(signal.ITIMER_REAL, timeout_s) - - try: - import resource - - old_limits = resource.getrlimit(resource.RLIMIT_CPU) - soft, hard = old_limits - consumed_cpu_s = _consumed_cpu_seconds(resource) - desired_soft = max(1, timeout_s + math.ceil(consumed_cpu_s)) - if soft == resource.RLIM_INFINITY: - candidate_soft = desired_soft - else: - # Never reduce finite soft limits and avoid immediate SIGXCPU - # when the process already consumed more CPU than timeout_s. - candidate_soft = max(soft, desired_soft) - if hard == resource.RLIM_INFINITY: - new_soft = candidate_soft - else: - new_soft = min(max(1, hard), candidate_soft) - # Never lower hard limit: raising it back may be disallowed for - # unprivileged processes and can lead to process termination later. - resource.setrlimit(resource.RLIMIT_CPU, (new_soft, hard)) - except Exception: - # If resource is unavailable or cannot be set, rely on alarm only. 
- pass - - yield - finally: - signal.setitimer(signal.ITIMER_REAL, 0) - signal.signal(signal.SIGALRM, old_handler) - if old_limits is not None: - try: - import resource - - resource.setrlimit(resource.RLIMIT_CPU, old_limits) - except Exception: - pass - - -def _parse_with_limits(source: str, timeout_s: int) -> ast.AST: - try: - with _parse_limits(timeout_s): - return ast.parse(source) - except _ParseTimeoutError as e: - raise ParseError(str(e)) from e - - -def _stmt_count(node: ast.AST) -> int: - body = getattr(node, "body", None) - return len(body) if isinstance(body, list) else 0 - - -def _source_tokens(source: str) -> tuple[tokenize.TokenInfo, ...]: - try: - return tuple(tokenize.generate_tokens(io.StringIO(source).readline)) - except tokenize.TokenError: - return () - - -def _declaration_token_name(node: ast.AST) -> str: - if isinstance(node, ast.ClassDef): - return "class" - if isinstance(node, ast.AsyncFunctionDef): - return "async" - return "def" - - -def _declaration_token_index( - *, - source_tokens: tuple[tokenize.TokenInfo, ...], - start_line: int, - start_col: int, - declaration_token: str, - source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, -) -> int | None: - if source_token_index is not None: - return source_token_index.get((start_line, start_col, declaration_token)) - for idx, token in enumerate(source_tokens): - if token.start != (start_line, start_col): - continue - if token.type == tokenize.NAME and token.string == declaration_token: - return idx - return None - - -def _build_declaration_token_index( - source_tokens: tuple[tokenize.TokenInfo, ...], -) -> Mapping[_DeclarationTokenIndexKey, int]: - indexed: dict[_DeclarationTokenIndexKey, int] = {} - for idx, token in enumerate(source_tokens): - if token.type == tokenize.NAME and token.string in _DECLARATION_TOKEN_STRINGS: - indexed[(token.start[0], token.start[1], token.string)] = idx - return indexed - - -def _scan_declaration_colon_line( - *, - source_tokens: tuple[tokenize.TokenInfo, ...], - start_index: int, -) -> int | None: - nesting = 0 - for token in source_tokens[start_index + 1 :]: - if token.type == tokenize.OP: - if token.string in "([{": - nesting += 1 - continue - if token.string in ")]}": - if nesting > 0: - nesting -= 1 - continue - if token.string == ":" and nesting == 0: - return token.start[0] - if token.type == tokenize.NEWLINE and nesting == 0: - return None - return None - - -def _fallback_declaration_end_line(node: ast.AST, *, start_line: int) -> int: - body = getattr(node, "body", None) - if not isinstance(body, list) or not body: - return start_line - - first_body_line = int(getattr(body[0], "lineno", 0)) - if first_body_line <= 0 or first_body_line == start_line: - return start_line - return max(start_line, first_body_line - 1) - - -def _declaration_end_line( - node: ast.AST, - *, - source_tokens: tuple[tokenize.TokenInfo, ...], - source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, -) -> int: - start_line = int(getattr(node, "lineno", 0)) - start_col = int(getattr(node, "col_offset", 0)) - if start_line <= 0: - return 0 - - declaration_token = _declaration_token_name(node) - start_index = _declaration_token_index( - source_tokens=source_tokens, - start_line=start_line, - start_col=start_col, - declaration_token=declaration_token, - source_token_index=source_token_index, - ) - if start_index is None: - return _fallback_declaration_end_line(node, start_line=start_line) - - colon_line = _scan_declaration_colon_line( - source_tokens=source_tokens, - 
start_index=start_index, - ) - if colon_line is not None: - return colon_line - return _fallback_declaration_end_line(node, start_line=start_line) - - -# ========================= -# CFG fingerprinting -# ========================= - - -def _cfg_fingerprint_and_complexity( - node: _qualnames.FunctionNode, - cfg: NormalizationConfig, - qualname: str, -) -> tuple[str, int]: - """ - Generate a structural fingerprint for a function using CFG analysis. - - The fingerprint is computed by: - 1. Building a Control Flow Graph (CFG) from the function - 2. Normalizing each CFG block's statements (variable names, constants, etc.) - 3. Creating a canonical representation of the CFG structure - 4. Hashing the representation with SHA-1 - - Functions with identical control flow and normalized statements will - produce the same fingerprint, even if they differ in variable names, - constants, or type annotations. - - Args: - node: Function AST node to fingerprint - cfg: Normalization configuration (what to ignore) - qualname: Qualified name for logging/debugging - - Returns: - 40-character hex SHA-1 hash of the normalized CFG - """ - builder = CFGBuilder() - graph = builder.build(qualname, node) - cfg_normalizer = AstNormalizer(cfg) - - # Use generator to avoid building large list of strings - parts: list[str] = [] - for block in sorted(graph.blocks, key=lambda b: b.id): - succ_ids = ",".join( - str(s.id) for s in sorted(block.successors, key=lambda s: s.id) - ) - block_dump = normalized_ast_dump_from_list( - block.statements, - cfg, - normalizer=cfg_normalizer, - ) - parts.append(f"BLOCK[{block.id}]:{block_dump}|SUCCESSORS:{succ_ids}") - return sha1("|".join(parts)), cyclomatic_complexity(graph) - - -def _raw_source_hash_for_range( - source_lines: list[str], - start_line: int, - end_line: int, -) -> str: - window = "".join(source_lines[start_line - 1 : end_line]).strip() - no_space = "".join(window.split()) - return _sha1(no_space.encode("utf-8")).hexdigest() - - -def _resolve_import_target( - module_name: str, - import_node: ast.ImportFrom, -) -> str: - if import_node.level <= 0: - return import_node.module or "" - - parent_parts = module_name.split(".") - keep = max(0, len(parent_parts) - import_node.level) - prefix = parent_parts[:keep] - if import_node.module: - return ".".join([*prefix, import_node.module]) - return ".".join(prefix) - - -_PROTOCOL_MODULE_NAMES = frozenset({"typing", "typing_extensions"}) - - -@dataclass(slots=True) -class _ModuleWalkState: - import_names: set[str] = field(default_factory=set) - deps: list[ModuleDep] = field(default_factory=list) - referenced_names: set[str] = field(default_factory=set) - imported_symbol_bindings: dict[str, set[str]] = field(default_factory=dict) - imported_module_aliases: dict[str, str] = field(default_factory=dict) - name_nodes: list[ast.Name] = field(default_factory=list) - attr_nodes: list[ast.Attribute] = field(default_factory=list) - protocol_symbol_aliases: set[str] = field(default_factory=lambda: {"Protocol"}) - protocol_module_aliases: set[str] = field( - default_factory=lambda: set(_PROTOCOL_MODULE_NAMES) - ) - - -def _append_module_dep( - *, - module_name: str, - target: str, - import_type: Literal["import", "from_import"], - line: int, - state: _ModuleWalkState, -) -> None: - state.deps.append( - ModuleDep( - source=module_name, - target=target, - import_type=import_type, - line=line, - ) - ) - - -def _collect_import_node( - *, - node: ast.Import, - module_name: str, - state: _ModuleWalkState, - collect_referenced_names: bool, -) -> None: - 
line = int(getattr(node, "lineno", 0)) - for alias in node.names: - alias_name = alias.asname or alias.name.split(".", 1)[0] - state.import_names.add(alias_name) - _append_module_dep( - module_name=module_name, - target=alias.name, - import_type="import", - line=line, - state=state, - ) - if collect_referenced_names: - state.imported_module_aliases[alias_name] = alias.name - if alias.name in _PROTOCOL_MODULE_NAMES: - state.protocol_module_aliases.add(alias_name) - - -def _dotted_expr_name(expr: ast.expr) -> str | None: - if isinstance(expr, ast.Name): - return expr.id - if isinstance(expr, ast.Attribute): - prefix = _dotted_expr_name(expr.value) - if prefix is None: - return None - return f"{prefix}.{expr.attr}" - return None - - -def _collect_import_from_node( - *, - node: ast.ImportFrom, - module_name: str, - state: _ModuleWalkState, - collect_referenced_names: bool, -) -> None: - target = _resolve_import_target(module_name, node) - if target: - state.import_names.add(target.split(".", 1)[0]) - _append_module_dep( - module_name=module_name, - target=target, - import_type="from_import", - line=int(getattr(node, "lineno", 0)), - state=state, - ) - - if node.module in _PROTOCOL_MODULE_NAMES: - for alias in node.names: - if alias.name == "Protocol": - state.protocol_symbol_aliases.add(alias.asname or alias.name) - - if not collect_referenced_names or not target: - return - - for alias in node.names: - if alias.name == "*": - continue - alias_name = alias.asname or alias.name - state.imported_symbol_bindings.setdefault(alias_name, set()).add( - f"{target}:{alias.name}" - ) - - -def _is_protocol_class( - class_node: ast.ClassDef, - *, - protocol_symbol_aliases: frozenset[str], - protocol_module_aliases: frozenset[str], -) -> bool: - for base in class_node.bases: - base_name = _dotted_expr_name(base) - if base_name is None: - continue - if base_name in protocol_symbol_aliases: - return True - if "." 
in base_name and base_name.rsplit(".", 1)[-1] == "Protocol": - module_alias = base_name.rsplit(".", 1)[0] - if module_alias in protocol_module_aliases: - return True - return False - - -def _is_non_runtime_candidate(node: _qualnames.FunctionNode) -> bool: - for decorator in node.decorator_list: - name = _dotted_expr_name(decorator) - if name is None: - continue - terminal = name.rsplit(".", 1)[-1] - if terminal in {"overload", "abstractmethod"}: - return True - return False - - -def _node_line_span(node: ast.AST) -> tuple[int, int] | None: - start = int(getattr(node, "lineno", 0)) - end = int(getattr(node, "end_lineno", 0)) - if start <= 0 or end <= 0: - return None - return start, end - - -def _eligible_unit_shape( - node: _qualnames.FunctionNode, - *, - min_loc: int, - min_stmt: int, -) -> tuple[int, int, int, int] | None: - span = _node_line_span(node) - if span is None: - return None - start, end = span - if end < start: - return None - loc = end - start + 1 - stmt_count = _stmt_count(node) - if loc < min_loc or stmt_count < min_stmt: - return None - return start, end, loc, stmt_count - - -def _class_metrics_for_node( - *, - module_name: str, - class_qualname: str, - class_node: ast.ClassDef, - filepath: str, - module_import_names: set[str], - module_class_names: set[str], -) -> ClassMetrics | None: - span = _node_line_span(class_node) - if span is None: - return None - start, end = span - cbo, coupled_classes = compute_cbo( - class_node, - module_import_names=module_import_names, - module_class_names=module_class_names, - ) - lcom4, method_count, instance_var_count = compute_lcom4(class_node) - return ClassMetrics( - qualname=f"{module_name}:{class_qualname}", - filepath=filepath, - start_line=start, - end_line=end, - cbo=cbo, - lcom4=lcom4, - method_count=method_count, - instance_var_count=instance_var_count, - risk_coupling=coupling_risk(cbo), - risk_cohesion=cohesion_risk(lcom4), - coupled_classes=coupled_classes, - ) - - -def _dead_candidate_kind(local_name: str) -> Literal["function", "method"]: - return "method" if "." in local_name else "function" - - -def _should_skip_dead_candidate( - local_name: str, - node: _qualnames.FunctionNode, - *, - protocol_class_qualnames: set[str], -) -> bool: - if _is_non_runtime_candidate(node): - return True - if "." 
not in local_name: - return False - owner_qualname = local_name.rsplit(".", 1)[0] - return owner_qualname in protocol_class_qualnames - - -def _build_dead_candidate( - *, - module_name: str, - local_name: str, - node: _NamedDeclarationNode, - filepath: str, - kind: Literal["class", "function", "method"], - suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]], - start_line: int, - end_line: int, -) -> DeadCandidate: - qualname = f"{module_name}:{local_name}" - return DeadCandidate( - qualname=qualname, - local_name=node.name, - filepath=filepath, - start_line=start_line, - end_line=end_line, - kind=kind, - suppressed_rules=suppression_index.get( - suppression_target_key( - filepath=filepath, - qualname=qualname, - start_line=start_line, - end_line=end_line, - kind=kind, - ), - (), - ), - ) - - -def _dead_candidate_for_unit( - *, - module_name: str, - local_name: str, - node: _qualnames.FunctionNode, - filepath: str, - suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]], - protocol_class_qualnames: set[str], -) -> DeadCandidate | None: - span = _node_line_span(node) - if span is None: - return None - if _should_skip_dead_candidate( - local_name, - node, - protocol_class_qualnames=protocol_class_qualnames, - ): - return None - start, end = span - return _build_dead_candidate( - module_name=module_name, - local_name=local_name, - node=node, - filepath=filepath, - kind=_dead_candidate_kind(local_name), - suppression_index=suppression_index, - start_line=start, - end_line=end, - ) - - -def _collect_load_reference_node( - *, - node: ast.AST, - state: _ModuleWalkState, -) -> None: - if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load): - state.referenced_names.add(node.id) - state.name_nodes.append(node) - return - if isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load): - state.referenced_names.add(node.attr) - state.attr_nodes.append(node) - - -def _resolve_referenced_qualnames( - *, - module_name: str, - collector: _qualnames.QualnameCollector, - state: _ModuleWalkState, -) -> frozenset[str]: - top_level_class_by_name = { - class_qualname: class_qualname - for class_qualname, _class_node in collector.class_nodes - if "." not in class_qualname - } - local_method_qualnames = frozenset( - f"{module_name}:{local_name}" - for local_name, _node in collector.units - if "." in local_name - ) - - resolved: set[str] = set() - for name_node in state.name_nodes: - for qualname in state.imported_symbol_bindings.get(name_node.id, ()): - resolved.add(qualname) - - for attr_node in state.attr_nodes: - base = attr_node.value - if isinstance(base, ast.Name): - imported_module = state.imported_module_aliases.get(base.id) - if imported_module is not None: - resolved.add(f"{imported_module}:{attr_node.attr}") - else: - class_qualname = top_level_class_by_name.get(base.id) - if class_qualname is not None: - local_method_qualname = ( - f"{module_name}:{class_qualname}.{attr_node.attr}" - ) - if local_method_qualname in local_method_qualnames: - resolved.add(local_method_qualname) - - return frozenset(resolved) - - -class _ModuleWalkResult(NamedTuple): - import_names: frozenset[str] - module_deps: tuple[ModuleDep, ...] 
- referenced_names: frozenset[str] - referenced_qualnames: frozenset[str] - protocol_symbol_aliases: frozenset[str] - protocol_module_aliases: frozenset[str] - - -def _collect_module_walk_data( - *, - tree: ast.AST, - module_name: str, - collector: _qualnames.QualnameCollector, - collect_referenced_names: bool, -) -> _ModuleWalkResult: - """Single ast.walk that collects imports, deps, names, qualnames & protocol aliases. - - Reduces the hot path to one tree walk plus one local qualname resolution phase. - """ - state = _ModuleWalkState() - for node in ast.walk(tree): - if isinstance(node, ast.Import): - _collect_import_node( - node=node, - module_name=module_name, - state=state, - collect_referenced_names=collect_referenced_names, - ) - elif isinstance(node, ast.ImportFrom): - _collect_import_from_node( - node=node, - module_name=module_name, - state=state, - collect_referenced_names=collect_referenced_names, - ) - elif collect_referenced_names: - _collect_load_reference_node(node=node, state=state) - - deps_sorted = tuple( - sorted( - state.deps, - key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line), - ) - ) - resolved = ( - _resolve_referenced_qualnames( - module_name=module_name, - collector=collector, - state=state, - ) - if collect_referenced_names - else frozenset() - ) - - return _ModuleWalkResult( - import_names=frozenset(state.import_names), - module_deps=deps_sorted, - referenced_names=frozenset(state.referenced_names), - referenced_qualnames=resolved, - protocol_symbol_aliases=frozenset(state.protocol_symbol_aliases), - protocol_module_aliases=frozenset(state.protocol_module_aliases), - ) - - -def _collect_dead_candidates( - *, - filepath: str, - module_name: str, - collector: _qualnames.QualnameCollector, - protocol_symbol_aliases: frozenset[str] = frozenset({"Protocol"}), - protocol_module_aliases: frozenset[str] = frozenset( - {"typing", "typing_extensions"} - ), - suppression_rules_by_target: Mapping[SuppressionTargetKey, tuple[str, ...]] - | None = None, -) -> tuple[DeadCandidate, ...]: - protocol_class_qualnames = { - class_qualname - for class_qualname, class_node in collector.class_nodes - if _is_protocol_class( - class_node, - protocol_symbol_aliases=protocol_symbol_aliases, - protocol_module_aliases=protocol_module_aliases, - ) - } - - candidates: list[DeadCandidate] = [] - suppression_index = ( - suppression_rules_by_target if suppression_rules_by_target is not None else {} - ) - for local_name, node in collector.units: - candidate = _dead_candidate_for_unit( - module_name=module_name, - local_name=local_name, - node=node, - filepath=filepath, - suppression_index=suppression_index, - protocol_class_qualnames=protocol_class_qualnames, - ) - if candidate is not None: - candidates.append(candidate) - - for class_qualname, class_node in collector.class_nodes: - span = _node_line_span(class_node) - if span is not None: - start, end = span - candidates.append( - _build_dead_candidate( - module_name=module_name, - local_name=class_qualname, - node=class_node, - filepath=filepath, - kind="class", - suppression_index=suppression_index, - start_line=start, - end_line=end, - ) - ) - - return tuple( - sorted( - candidates, - key=lambda item: ( - item.filepath, - item.start_line, - item.end_line, - item.qualname, - ), - ) - ) - - -def _collect_declaration_targets( - *, - filepath: str, - module_name: str, - collector: _qualnames.QualnameCollector, - source_tokens: tuple[tokenize.TokenInfo, ...] 
= (), - source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, - include_inline_lines: bool = False, -) -> tuple[DeclarationTarget, ...]: - declarations: list[DeclarationTarget] = [] - declaration_specs: list[ - tuple[str, ast.AST, Literal["function", "method", "class"]] - ] = [ - ( - local_name, - node, - "method" if "." in local_name else "function", - ) - for local_name, node in collector.units - ] - declaration_specs.extend( - (class_qualname, class_node, "class") - for class_qualname, class_node in collector.class_nodes - ) - - for qualname_suffix, node, kind in declaration_specs: - start = int(getattr(node, "lineno", 0)) - end = int(getattr(node, "end_lineno", 0)) - if start > 0 and end > 0: - declaration_end_line = ( - _declaration_end_line( - node, - source_tokens=source_tokens, - source_token_index=source_token_index, - ) - if include_inline_lines - else None - ) - declarations.append( - DeclarationTarget( - filepath=filepath, - qualname=f"{module_name}:{qualname_suffix}", - start_line=start, - end_line=end, - kind=kind, - declaration_end_line=declaration_end_line, - ) - ) - - return tuple( - sorted( - declarations, - key=lambda item: ( - item.filepath, - item.start_line, - item.end_line, - item.qualname, - item.kind, - ), - ) - ) - - -def _build_suppression_index_for_source( - *, - source: str, - filepath: str, - module_name: str, - collector: _qualnames.QualnameCollector, -) -> Mapping[SuppressionTargetKey, tuple[str, ...]]: - suppression_directives = extract_suppression_directives(source) - if not suppression_directives: - return {} - - needs_inline_binding = any( - directive.binding == "inline" for directive in suppression_directives - ) - source_tokens: tuple[tokenize.TokenInfo, ...] = () - source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None - if needs_inline_binding: - source_tokens = _source_tokens(source) - if source_tokens: - source_token_index = _build_declaration_token_index(source_tokens) - - declaration_targets = _collect_declaration_targets( - filepath=filepath, - module_name=module_name, - collector=collector, - source_tokens=source_tokens, - source_token_index=source_token_index, - include_inline_lines=needs_inline_binding, - ) - suppression_bindings = bind_suppressions_to_declarations( - directives=suppression_directives, - declarations=declaration_targets, - ) - return build_suppression_index(suppression_bindings) - - -# ========================= -# Public API -# ========================= - - -def extract_units_and_stats_from_source( - source: str, - filepath: str, - module_name: str, - cfg: NormalizationConfig, - min_loc: int, - min_stmt: int, - *, - block_min_loc: int = 20, - block_min_stmt: int = 8, - segment_min_loc: int = 20, - segment_min_stmt: int = 10, - collect_structural_findings: bool = True, - collect_api_surface: bool = False, - api_include_private_modules: bool = False, -) -> tuple[ - list[Unit], - list[BlockUnit], - list[SegmentUnit], - SourceStats, - FileMetrics, - list[StructuralFindingGroup], -]: - try: - tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS) - except SyntaxError as e: - raise ParseError(f"Failed to parse {filepath}: {e}") from e - if not isinstance(tree, ast.Module): - raise ParseError(f"Failed to parse {filepath}: expected module AST root") - - collector = _qualnames.QualnameCollector() - collector.visit(tree) - source_lines = source.splitlines() - source_line_count = len(source_lines) - - is_test_file = is_test_filepath(filepath) - - # Single-pass AST walk replaces 3 separate 
functions / 4 walks. - _walk = _collect_module_walk_data( - tree=tree, - module_name=module_name, - collector=collector, - collect_referenced_names=not is_test_file, - ) - import_names = _walk.import_names - module_deps = _walk.module_deps - referenced_names = _walk.referenced_names - referenced_qualnames = _walk.referenced_qualnames - protocol_symbol_aliases = _walk.protocol_symbol_aliases - protocol_module_aliases = _walk.protocol_module_aliases - - suppression_index = _build_suppression_index_for_source( - source=source, - filepath=filepath, - module_name=module_name, - collector=collector, - ) - class_names = frozenset(class_node.name for _, class_node in collector.class_nodes) - module_import_names = set(import_names) - module_class_names = set(class_names) - class_metrics: list[ClassMetrics] = [] - - units: list[Unit] = [] - block_units: list[BlockUnit] = [] - segment_units: list[SegmentUnit] = [] - structural_findings: list[StructuralFindingGroup] = [] - - for local_name, node in collector.units: - unit_shape = _eligible_unit_shape( - node, - min_loc=min_loc, - min_stmt=min_stmt, - ) - if unit_shape is None: - continue - start, end, loc, stmt_count = unit_shape - - qualname = f"{module_name}:{local_name}" - fingerprint, complexity = _cfg_fingerprint_and_complexity(node, cfg, qualname) - structure_facts = scan_function_structure( - node, - filepath, - qualname, - collect_findings=collect_structural_findings, - ) - depth = structure_facts.nesting_depth - risk = risk_level(complexity) - raw_hash = _raw_source_hash_for_range(source_lines, start, end) - - units.append( - Unit( - qualname=qualname, - filepath=filepath, - start_line=start, - end_line=end, - loc=loc, - stmt_count=stmt_count, - fingerprint=fingerprint, - loc_bucket=bucket_loc(loc), - cyclomatic_complexity=complexity, - nesting_depth=depth, - risk=risk, - raw_hash=raw_hash, - entry_guard_count=structure_facts.entry_guard_count, - entry_guard_terminal_profile=( - structure_facts.entry_guard_terminal_profile - ), - entry_guard_has_side_effect_before=( - structure_facts.entry_guard_has_side_effect_before - ), - terminal_kind=structure_facts.terminal_kind, - try_finally_profile=structure_facts.try_finally_profile, - side_effect_order_profile=structure_facts.side_effect_order_profile, - ) - ) - - needs_blocks = ( - not local_name.endswith("__init__") - and loc >= block_min_loc - and stmt_count >= block_min_stmt - ) - needs_segments = loc >= segment_min_loc and stmt_count >= segment_min_stmt - - if needs_blocks or needs_segments: - body = getattr(node, "body", None) - hashes: list[str] | None = None - if isinstance(body, list): - hashes = stmt_hashes(body, cfg) - - if needs_blocks: - block_units.extend( - extract_blocks( - node, - filepath=filepath, - qualname=qualname, - cfg=cfg, - block_size=4, - max_blocks=15, - precomputed_hashes=hashes, - ) - ) - - if needs_segments: - segment_units.extend( - extract_segments( - node, - filepath=filepath, - qualname=qualname, - cfg=cfg, - window_size=6, - max_segments=60, - precomputed_hashes=hashes, - ) - ) - - if collect_structural_findings: - structural_findings.extend(structure_facts.structural_findings) - - for class_qualname, class_node in collector.class_nodes: - class_metric = _class_metrics_for_node( - module_name=module_name, - class_qualname=class_qualname, - class_node=class_node, - filepath=filepath, - module_import_names=module_import_names, - module_class_names=module_class_names, - ) - if class_metric is not None: - class_metrics.append(class_metric) - - dead_candidates = 
_collect_dead_candidates( - filepath=filepath, - module_name=module_name, - collector=collector, - protocol_symbol_aliases=protocol_symbol_aliases, - protocol_module_aliases=protocol_module_aliases, - suppression_rules_by_target=suppression_index, - ) - - sorted_class_metrics = tuple( - sorted( - class_metrics, - key=lambda item: ( - item.filepath, - item.start_line, - item.end_line, - item.qualname, - ), - ) - ) - typing_coverage, docstring_coverage = collect_module_adoption( - tree=tree, - module_name=module_name, - filepath=filepath, - collector=collector, - imported_names=import_names, - ) - api_surface = None - if collect_api_surface: - api_surface = collect_module_api_surface( - tree=tree, - module_name=module_name, - filepath=filepath, - collector=collector, - imported_names=import_names, - include_private_modules=api_include_private_modules, - ) - - return ( - units, - block_units, - segment_units, - SourceStats( - lines=source_line_count, - functions=collector.function_count, - methods=collector.method_count, - classes=collector.class_count, - ), - FileMetrics( - class_metrics=sorted_class_metrics, - module_deps=module_deps, - dead_candidates=dead_candidates, - referenced_names=referenced_names, - import_names=import_names, - class_names=class_names, - referenced_qualnames=referenced_qualnames, - typing_coverage=typing_coverage, - docstring_coverage=docstring_coverage, - api_surface=api_surface, - ), - structural_findings, - ) diff --git a/codeclone/findings/__init__.py b/codeclone/findings/__init__.py new file mode 100644 index 0000000..9135843 --- /dev/null +++ b/codeclone/findings/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/findings/clones/__init__.py b/codeclone/findings/clones/__init__.py new file mode 100644 index 0000000..9135843 --- /dev/null +++ b/codeclone/findings/clones/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/golden_fixtures.py b/codeclone/findings/clones/golden_fixtures.py similarity index 97% rename from codeclone/golden_fixtures.py rename to codeclone/findings/clones/golden_fixtures.py index 3b6fe47..b60caaa 100644 --- a/codeclone/golden_fixtures.py +++ b/codeclone/findings/clones/golden_fixtures.py @@ -11,15 +11,15 @@ from pathlib import PurePosixPath from typing import Literal -from .domain.source_scope import SOURCE_KIND_FIXTURES, SOURCE_KIND_TESTS -from .models import ( +from ...domain.source_scope import SOURCE_KIND_FIXTURES, SOURCE_KIND_TESTS +from ...models import ( GroupItem, GroupItemLike, GroupMap, GroupMapLike, SuppressedCloneGroup, ) -from .paths import classify_source_kind, normalize_repo_path, relative_repo_path +from ...paths import classify_source_kind, normalize_repo_path, relative_repo_path CloneGroupKind = Literal["function", "block", "segment"] diff --git a/codeclone/grouping.py b/codeclone/findings/clones/grouping.py similarity index 98% rename from codeclone/grouping.py rename to codeclone/findings/clones/grouping.py index c4590b3..7aa37dc 100644 --- a/codeclone/grouping.py +++ b/codeclone/findings/clones/grouping.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from .models import GroupItemsLike, GroupMap + from ...models import GroupItemsLike, GroupMap def _group_items_by_key( diff --git a/codeclone/findings/ids.py b/codeclone/findings/ids.py new file mode 100644 index 0000000..d09cf0d --- /dev/null +++ b/codeclone/findings/ids.py @@ -0,0 +1,31 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + + +def clone_group_id(kind: str, group_key: str) -> str: + return f"clone:{kind}:{group_key}" + + +def structural_group_id(finding_kind: str, finding_key: str) -> str: + return f"structural:{finding_kind}:{finding_key}" + + +def dead_code_group_id(subject_key: str) -> str: + return f"dead_code:{subject_key}" + + +def design_group_id(category: str, subject_key: str) -> str: + return f"design:{category}:{subject_key}" + + +__all__ = [ + "clone_group_id", + "dead_code_group_id", + "design_group_id", + "structural_group_id", +] diff --git a/codeclone/findings/structural/__init__.py b/codeclone/findings/structural/__init__.py new file mode 100644 index 0000000..9135843 --- /dev/null +++ b/codeclone/findings/structural/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/structural_findings.py b/codeclone/findings/structural/detectors.py similarity index 99% rename from codeclone/structural_findings.py rename to codeclone/findings/structural/detectors.py index 2d805d7..0ebe138 100644 --- a/codeclone/structural_findings.py +++ b/codeclone/findings/structural/detectors.py @@ -21,13 +21,13 @@ from hashlib import sha1 from typing import TYPE_CHECKING, overload -from ._coerce import as_int, as_str -from .domain.findings import ( +from ...domain.findings import ( STRUCTURAL_KIND_CLONE_COHORT_DRIFT, STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE, STRUCTURAL_KIND_DUPLICATED_BRANCHES, ) -from .models import GroupItemLike, StructuralFindingGroup, StructuralFindingOccurrence +from ...models import GroupItemLike, StructuralFindingGroup, StructuralFindingOccurrence +from ...utils.coerce import as_int, as_str if TYPE_CHECKING: from collections.abc import Mapping, Sequence diff --git a/codeclone/fingerprint.py b/codeclone/fingerprint.py deleted file mode 100644 index 72adaee..0000000 --- a/codeclone/fingerprint.py +++ /dev/null @@ -1,24 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import hashlib - - -def sha1(s: str) -> str: - return hashlib.sha1(s.encode("utf-8")).hexdigest() - - -def bucket_loc(loc: int) -> str: - # Helps avoid grouping wildly different sizes if desired - if loc < 20: - return "0-19" - if loc < 50: - return "20-49" - if loc < 100: - return "50-99" - return "100+" diff --git a/codeclone/html_report.py b/codeclone/html_report.py deleted file mode 100644 index 16ceab5..0000000 --- a/codeclone/html_report.py +++ /dev/null @@ -1,29 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -"""Public facade for HTML report generation. - -Re-exports build_html_report from the new _html_report package and -keeps backward-compatible imports that tests and downstream code rely on. -""" - -from __future__ import annotations - -from ._html_report import build_html_report -from ._html_snippets import ( - _FileCache, - _pygments_css, - _render_code_block, - _try_pygments, -) - -__all__ = [ - "_FileCache", - "_pygments_css", - "_render_code_block", - "_try_pygments", - "build_html_report", -] diff --git a/codeclone/main.py b/codeclone/main.py new file mode 100644 index 0000000..7e17b85 --- /dev/null +++ b/codeclone/main.py @@ -0,0 +1,15 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from .surfaces.cli.workflow import main + +__all__ = ["main"] + + +if __name__ == "__main__": + main() diff --git a/codeclone/mcp_service.py b/codeclone/mcp_service.py deleted file mode 100644 index cbed02b..0000000 --- a/codeclone/mcp_service.py +++ /dev/null @@ -1,4727 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import hashlib -import subprocess -from argparse import Namespace -from collections import OrderedDict -from collections.abc import Iterable, Mapping, Sequence -from dataclasses import dataclass -from json import JSONDecodeError -from pathlib import Path -from threading import RLock -from typing import Final, Literal, cast - -import orjson - -from . import __version__ -from ._cli_args import ( - DEFAULT_BASELINE_PATH, - DEFAULT_BLOCK_MIN_LOC, - DEFAULT_BLOCK_MIN_STMT, - DEFAULT_MAX_BASELINE_SIZE_MB, - DEFAULT_MAX_CACHE_SIZE_MB, - DEFAULT_MIN_LOC, - DEFAULT_MIN_STMT, - DEFAULT_SEGMENT_MIN_LOC, - DEFAULT_SEGMENT_MIN_STMT, -) -from ._cli_baselines import ( - CloneBaselineState, - MetricsBaselineState, - probe_metrics_baseline_section, - resolve_clone_baseline_state, - resolve_metrics_baseline_state, -) -from ._cli_config import ConfigValidationError, load_pyproject_config -from ._cli_meta import _build_report_meta, _current_report_timestamp_utc -from ._cli_runtime import ( - resolve_cache_path, - resolve_cache_status, - validate_numeric_args, -) -from ._coerce import as_float as _as_float -from ._coerce import as_int as _as_int -from ._git_diff import validate_git_diff_ref -from .baseline import Baseline -from .cache import Cache, CacheStatus -from .contracts import ( - DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - DOCS_URL, - REPORT_SCHEMA_VERSION, - ExitCode, -) -from .domain.findings import ( - CATEGORY_CLONE, - CATEGORY_COHESION, - CATEGORY_COMPLEXITY, - CATEGORY_COUPLING, - CATEGORY_DEAD_CODE, - CATEGORY_DEPENDENCY, - CATEGORY_STRUCTURAL, - CLONE_KIND_SEGMENT, - FAMILY_CLONE, - FAMILY_CLONES, - FAMILY_DEAD_CODE, - FAMILY_DESIGN, - FAMILY_STRUCTURAL, -) -from .domain.quality import ( - CONFIDENCE_HIGH, - CONFIDENCE_LOW, - CONFIDENCE_MEDIUM, - EFFORT_EASY, - EFFORT_HARD, - EFFORT_MODERATE, - SEVERITY_CRITICAL, - SEVERITY_INFO, - SEVERITY_WARNING, -) -from .domain.source_scope import ( - SOURCE_KIND_FIXTURES, - SOURCE_KIND_MIXED, - SOURCE_KIND_ORDER, - SOURCE_KIND_OTHER, - SOURCE_KIND_PRODUCTION, - SOURCE_KIND_TESTS, -) -from .models import CoverageJoinResult, MetricsDiff, ProjectMetrics, Suggestion -from .pipeline import ( - GatingResult, - MetricGateConfig, - OutputPaths, - analyze, - bootstrap, - discover, - metric_gate_reasons, - process, - report, -) -from .report.json_contract import ( - clone_group_id, - dead_code_group_id, - design_group_id, - structural_group_id, -) - -AnalysisMode = Literal["full", "clones_only"] -CachePolicy = Literal["reuse", "refresh", "off"] -FreshnessKind = Literal["fresh", "mixed", "reused"] -HotlistKind = Literal[ - "most_actionable", - "highest_spread", - "highest_priority", - "production_hotspots", - "test_fixture_hotspots", -] -FindingFamilyFilter = Literal["all", 
"clone", "structural", "dead_code", "design"] -FindingNoveltyFilter = Literal["all", "new", "known"] -FindingSort = Literal["default", "priority", "severity", "spread"] -DetailLevel = Literal["summary", "normal", "full"] -ComparisonFocus = Literal["all", "clones", "structural", "metrics"] -PRSummaryFormat = Literal["markdown", "json"] -HelpTopic = Literal[ - "workflow", - "analysis_profile", - "suppressions", - "baseline", - "coverage", - "latest_runs", - "review_state", - "changed_scope", -] -HelpDetail = Literal["compact", "normal"] -MetricsDetailFamily = Literal[ - "complexity", - "coupling", - "cohesion", - "coverage_adoption", - "coverage_join", - "dependencies", - "dead_code", - "api_surface", - "god_modules", - "overloaded_modules", - "health", -] -ReportSection = Literal[ - "all", - "meta", - "inventory", - "findings", - "metrics", - "metrics_detail", - "derived", - "changed", - "integrity", -] -HealthScope = Literal["repository"] -SummaryFocus = Literal["repository", "production", "changed_paths"] - -_LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser() -_REPORT_DUMMY_PATH = Path(".cache/codeclone/report.json") -_HEALTH_SCOPE_REPOSITORY: Final[HealthScope] = "repository" -_FOCUS_REPOSITORY: Final[SummaryFocus] = "repository" -_FOCUS_PRODUCTION: Final[SummaryFocus] = "production" -_FOCUS_CHANGED_PATHS: Final[SummaryFocus] = "changed_paths" -_MCP_CONFIG_KEYS = frozenset( - { - "min_loc", - "min_stmt", - "block_min_loc", - "block_min_stmt", - "segment_min_loc", - "segment_min_stmt", - "processes", - "cache_path", - "max_cache_size_mb", - "baseline", - "max_baseline_size_mb", - "metrics_baseline", - "api_surface", - "coverage_xml", - "coverage_min", - "golden_fixture_paths", - } -) -_RESOURCE_SECTION_MAP: Final[dict[str, ReportSection]] = { - "report.json": "all", - "summary": "meta", - "health": "metrics", - "changed": "changed", - "overview": "derived", -} -_SEVERITY_WEIGHT: Final[dict[str, float]] = { - SEVERITY_CRITICAL: 1.0, - SEVERITY_WARNING: 0.6, - SEVERITY_INFO: 0.2, -} -_EFFORT_WEIGHT: Final[dict[str, float]] = { - EFFORT_EASY: 1.0, - EFFORT_MODERATE: 0.6, - EFFORT_HARD: 0.3, -} -_NOVELTY_WEIGHT: Final[dict[str, float]] = {"new": 1.0, "known": 0.5} -_RUNTIME_WEIGHT: Final[dict[str, float]] = { - "production": 1.0, - "mixed": 0.8, - "tests": 0.4, - "fixtures": 0.2, - "other": 0.5, -} -_CONFIDENCE_WEIGHT: Final[dict[str, float]] = { - CONFIDENCE_HIGH: 1.0, - CONFIDENCE_MEDIUM: 0.7, - CONFIDENCE_LOW: 0.3, -} -# Canonical report groups use FAMILY_CLONES ("clones"), while individual finding -# payloads use FAMILY_CLONE ("clone"). 
-_VALID_ANALYSIS_MODES = frozenset({"full", "clones_only"}) -_VALID_CACHE_POLICIES = frozenset({"reuse", "refresh", "off"}) -_VALID_FINDING_FAMILIES = frozenset( - {"all", "clone", "structural", "dead_code", "design"} -) -_VALID_FINDING_NOVELTY = frozenset({"all", "new", "known"}) -_VALID_FINDING_SORT = frozenset({"default", "priority", "severity", "spread"}) -_VALID_DETAIL_LEVELS = frozenset({"summary", "normal", "full"}) -_VALID_COMPARISON_FOCUS = frozenset({"all", "clones", "structural", "metrics"}) -_VALID_PR_SUMMARY_FORMATS = frozenset({"markdown", "json"}) -_VALID_HELP_TOPICS = frozenset( - { - "workflow", - "analysis_profile", - "suppressions", - "baseline", - "coverage", - "latest_runs", - "review_state", - "changed_scope", - } -) -_VALID_HELP_DETAILS = frozenset({"compact", "normal"}) -DEFAULT_MCP_HISTORY_LIMIT = 4 -MAX_MCP_HISTORY_LIMIT = 10 -_VALID_REPORT_SECTIONS = frozenset( - { - "all", - "meta", - "inventory", - "findings", - "metrics", - "metrics_detail", - "derived", - "changed", - "integrity", - } -) -_VALID_HOTLIST_KINDS = frozenset( - { - "most_actionable", - "highest_spread", - "highest_priority", - "production_hotspots", - "test_fixture_hotspots", - } -) -_VALID_SEVERITIES = frozenset({SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO}) -_SOURCE_KIND_BREAKDOWN_ORDER: Final[tuple[str, ...]] = ( - SOURCE_KIND_PRODUCTION, - SOURCE_KIND_TESTS, - SOURCE_KIND_FIXTURES, - SOURCE_KIND_MIXED, - SOURCE_KIND_OTHER, -) -_COMPACT_ITEM_PATH_KEYS: Final[frozenset[str]] = frozenset( - {"relative_path", "path", "filepath", "file"} -) -_COMPACT_ITEM_EMPTY_VALUES: Final[tuple[object, ...]] = ("", None, [], {}, ()) -_HOTLIST_REPORT_KEYS: Final[dict[str, str]] = { - "most_actionable": "most_actionable_ids", - "highest_spread": "highest_spread_ids", - "production_hotspots": "production_hotspot_ids", - "test_fixture_hotspots": "test_fixture_hotspot_ids", -} -_CHECK_TO_DIMENSION: Final[dict[str, str]] = { - "cohesion": "cohesion", - "coupling": "coupling", - "dead_code": "dead_code", - "complexity": "complexity", - "clones": "clones", -} -_DESIGN_CHECK_CONTEXT: Final[dict[str, dict[str, object]]] = { - "complexity": { - "category": CATEGORY_COMPLEXITY, - "metric": "cyclomatic_complexity", - "operator": ">", - "default_threshold": DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - }, - "coupling": { - "category": CATEGORY_COUPLING, - "metric": "cbo", - "operator": ">", - "default_threshold": DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - }, - "cohesion": { - "category": CATEGORY_COHESION, - "metric": "lcom4", - "operator": ">=", - "default_threshold": DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - }, -} -_VALID_METRICS_DETAIL_FAMILIES = frozenset( - { - "complexity", - "coupling", - "cohesion", - "coverage_adoption", - "coverage_join", - "dependencies", - "dead_code", - "api_surface", - "god_modules", - "overloaded_modules", - "health", - } -) -_METRICS_DETAIL_FAMILY_ALIASES: Final[dict[str, str]] = { - "god_modules": "overloaded_modules", -} -_SHORT_RUN_ID_LENGTH = 8 -_SHORT_HASH_ID_LENGTH = 6 - - -@dataclass(frozen=True) -class MCPHelpTopicSpec: - summary: str - key_points: tuple[str, ...] - recommended_tools: tuple[str, ...] - doc_links: tuple[tuple[str, str], ...] - warnings: tuple[str, ...] = () - anti_patterns: tuple[str, ...] 
-
-
-@dataclass(frozen=True)
-class MCPHelpTopicSpec:
-    summary: str
-    key_points: tuple[str, ...]
-    recommended_tools: tuple[str, ...]
-    doc_links: tuple[tuple[str, str], ...]
-    warnings: tuple[str, ...] = ()
-    anti_patterns: tuple[str, ...] = ()
-
-
-_MCP_BOOK_URL: Final = f"{DOCS_URL}book/"
-_MCP_GUIDE_URL: Final = f"{DOCS_URL}mcp/"
-_MCP_INTERFACE_DOC_LINK: Final[tuple[str, str]] = (
-    "MCP interface contract",
-    f"{_MCP_BOOK_URL}20-mcp-interface/",
-)
-_BASELINE_DOC_LINK: Final[tuple[str, str]] = (
-    "Baseline contract",
-    f"{_MCP_BOOK_URL}06-baseline/",
-)
-_CONFIG_DOC_LINK: Final[tuple[str, str]] = (
-    "Config and defaults",
-    f"{_MCP_BOOK_URL}04-config-and-defaults/",
-)
-_REPORT_DOC_LINK: Final[tuple[str, str]] = (
-    "Report contract",
-    f"{_MCP_BOOK_URL}08-report/",
-)
-_CLI_DOC_LINK: Final[tuple[str, str]] = (
-    "CLI contract",
-    f"{_MCP_BOOK_URL}09-cli/",
-)
-_PIPELINE_DOC_LINK: Final[tuple[str, str]] = (
-    "Core pipeline",
-    f"{_MCP_BOOK_URL}05-core-pipeline/",
-)
-_SUPPRESSIONS_DOC_LINK: Final[tuple[str, str]] = (
-    "Inline suppressions contract",
-    f"{_MCP_BOOK_URL}19-inline-suppressions/",
-)
-_MCP_GUIDE_DOC_LINK: Final[tuple[str, str]] = ("MCP usage guide", _MCP_GUIDE_URL)
-_HELP_TOPIC_SPECS: Final[dict[str, MCPHelpTopicSpec]] = {
-    "workflow": MCPHelpTopicSpec(
-        summary=(
-            "CodeClone MCP is triage-first and budget-aware. Start with a "
-            "summary or production triage, then narrow through hotspots or "
-            "focused checks before opening one finding in detail."
-        ),
-        key_points=(
-            "Recommended first pass: analyze_repository or analyze_changed_paths.",
-            (
-                "Start with default or pyproject-resolved thresholds; lower them "
-                "only for an explicit higher-sensitivity follow-up pass."
-            ),
-            (
-                "Use get_run_summary or get_production_triage before broad "
-                "finding listing."
-            ),
-            (
-                "Prefer list_hotspots or focused check_* tools over "
-                "list_findings on noisy repositories."
-            ),
-            ("Use get_finding and get_remediation only after selecting an issue."),
-            (
-                "get_report_section(section='all') is an exception path, not "
-                "a default first step."
-            ),
-        ),
-        recommended_tools=(
-            "analyze_repository",
-            "analyze_changed_paths",
-            "get_run_summary",
-            "get_production_triage",
-            "list_hotspots",
-            "check_clones",
-            "check_dead_code",
-            "get_finding",
-            "get_remediation",
-        ),
-        doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
-        warnings=(
-            (
-                "Broad list_findings calls burn context quickly on large or "
-                "noisy repositories."
-            ),
-            (
-                "Prefer generate_pr_summary(format='markdown') unless machine "
-                "JSON is explicitly required."
-            ),
-        ),
-        anti_patterns=(
-            "Starting exploration with list_findings on a noisy repository.",
-            "Using get_report_section(section='all') as the default first step.",
-            (
-                "Escalating detail on larger lists instead of opening one "
-                "finding with get_finding."
-            ),
-        ),
-    ),
-    "analysis_profile": MCPHelpTopicSpec(
-        summary=(
-            "CodeClone default analysis is intentionally conservative: stable "
-            "first-pass review, baseline-aware governance, and CI-friendly "
-            "signal over maximum local sensitivity."
-        ),
-        key_points=(
-            (
-                "Default thresholds are intentionally conservative and "
-                "production-friendly."
-            ),
-            (
-                "A clean default run does not rule out smaller local "
-                "duplication or repetition."
-            ),
-            (
-                "Lowering thresholds increases sensitivity and can surface "
-                "smaller functions, tighter windows, and finer local signals."
-            ),
-            (
-                "Lower-threshold runs are best for exploratory local review, "
-                "not as a silent replacement for the default governance profile."
-            ),
-            "Interpret results in the context of the active threshold profile.",
-        ),
-        recommended_tools=(
-            "analyze_repository",
-            "analyze_changed_paths",
-            "get_run_summary",
-            "compare_runs",
-        ),
-        doc_links=(
-            _CONFIG_DOC_LINK,
-            _PIPELINE_DOC_LINK,
-            _MCP_INTERFACE_DOC_LINK,
-        ),
-        warnings=(
-            (
-                "Do not treat a default-threshold run as proof that no smaller "
-                "local clone or repetition exists."
-            ),
-            (
-                "Lower-threshold runs usually increase noise and should be read "
-                "as higher-sensitivity exploratory passes."
-            ),
-            "Run comparisons are most meaningful when profiles are aligned.",
-        ),
-        anti_patterns=(
-            (
-                "Assuming a clean default pass means no finer-grained "
-                "duplication exists anywhere in the repository."
-            ),
-            (
-                "Lowering thresholds for exploration and then interpreting the "
-                "result as if it had the same meaning as the conservative "
-                "default pass."
-            ),
-            (
-                "Mixing low-threshold exploratory output into baseline or CI "
-                "reasoning without acknowledging the profile change."
-            ),
-        ),
-    ),
-    "suppressions": MCPHelpTopicSpec(
-        summary=(
-            "CodeClone supports explicit inline suppressions for selected "
-            "findings. They are local policy, not analysis truth, and should "
-            "stay narrow and declaration-scoped."
-        ),
-        key_points=(
-            "Current syntax uses codeclone: ignore[rule-id,...].",
-            "Binding is declaration-scoped: def, async def, or class.",
-            (
-                "Supported placement is the previous line or inline on the "
-                "declaration or header line."
-            ),
-            (
-                "Suppressions are target-specific and do not imply file-wide "
-                "or cascading scope."
-            ),
-            (
-                "Use suppressions for accepted dynamic or runtime false "
-                "positives, not to hide broad classes of debt."
-            ),
-        ),
-        recommended_tools=("get_finding", "get_remediation"),
-        doc_links=(_SUPPRESSIONS_DOC_LINK, _MCP_INTERFACE_DOC_LINK),
-        warnings=(
-            (
-                "MCP explains suppression semantics but never creates or "
-                "updates suppressions."
-            ),
-        ),
-        anti_patterns=(
-            "Treating suppressions as file-wide or inherited state.",
-            (
-                "Using suppressions to hide broad structural debt instead of "
-                "accepted false positives."
-            ),
-        ),
-    ),
-    "baseline": MCPHelpTopicSpec(
-        summary=(
-            "A baseline is CodeClone's accepted comparison snapshot for clones "
-            "and optional metrics. It separates known debt from new regressions "
-            "and is trust-checked before use."
-        ),
-        key_points=(
-            (
-                "Canonical baseline schema is v2.0 with meta and clone keys; "
-                "metrics may be embedded for unified flows."
-            ),
-            (
-                "Compatibility depends on generator identity, supported "
-                "schema version, fingerprint version, python tag, and payload "
-                "integrity."
-            ),
-            (
-                "Known means already present in the trusted baseline; new "
-                "means not accepted by baseline."
-            ),
-            (
-                "In CI and gating contexts, untrusted baseline states are "
-                "contract errors rather than soft warnings."
-            ),
-            "MCP is read-only and does not update or rewrite baselines.",
-        ),
-        recommended_tools=("get_run_summary", "evaluate_gates", "compare_runs"),
-        doc_links=(_BASELINE_DOC_LINK,),
-        warnings=(
-            "Baseline trust semantics directly affect new-vs-known classification.",
-        ),
-        anti_patterns=(
-            "Treating baseline as mutable MCP session state.",
-            "Assuming an untrusted baseline is only cosmetic in CI contexts.",
-        ),
-    ),
-    "coverage": MCPHelpTopicSpec(
-        summary=(
-            "Coverage join is an external current-run signal: CodeClone reads "
-            "an existing Cobertura XML report and joins line hits to risky "
-            "function spans."
-        ),
-        key_points=(
-            "Use Cobertura XML such as `coverage xml` output from coverage.py.",
-            "Coverage join does not become baseline truth and does not affect health.",
-            (
-                "Coverage hotspot gating is current-run only and focuses on "
-                "medium/high-risk functions measured below the configured "
-                "threshold."
-            ),
-            (
-                "Functions missing from the supplied coverage.xml are surfaced "
-                "as scope gaps, not labeled as untested."
-            ),
-            "Use metrics_detail(family='coverage_join') for bounded drill-down.",
-        ),
-        recommended_tools=(
-            "analyze_repository",
-            "analyze_changed_paths",
-            "get_run_summary",
-            "get_report_section",
-            "evaluate_gates",
-        ),
-        doc_links=(
-            _MCP_INTERFACE_DOC_LINK,
-            _CLI_DOC_LINK,
-            _REPORT_DOC_LINK,
-        ),
-        warnings=(
-            "Coverage join is only as accurate as the external XML path mapping.",
-            "It does not infer branch coverage and does not execute tests.",
-            "Use fail-on-untested-hotspots only with a valid joined coverage input.",
-        ),
-        anti_patterns=(
-            "Treating missing coverage XML as zero coverage without stating it.",
-            "Reading coverage join as a baseline-aware trend signal.",
-            "Assuming dynamic runtime dispatch is visible through a static line join.",
-        ),
-    ),
-    "latest_runs": MCPHelpTopicSpec(
-        summary=(
-            "latest/* resources point to the most recent analysis run in the "
-            "current MCP session. They are convenience handles, not persistent "
-            "truth anchors."
-        ),
-        key_points=(
-            "Run history is in-memory only and bounded by history-limit.",
-            "The latest pointer moves when a newer analyze_* call registers a run.",
-            "A fresh repository state requires a fresh analyze run.",
-            (
-                "Short run ids are convenience handles derived from canonical "
-                "run identity."
-            ),
-            (
-                "Do not assume latest/* is globally current outside the "
-                "active MCP session."
-            ),
-        ),
-        recommended_tools=(
-            "analyze_repository",
-            "analyze_changed_paths",
-            "get_run_summary",
-            "compare_runs",
-        ),
-        doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
-        warnings=(
-            (
-                "latest/* can point at a different repository after a later "
-                "analyze call in the same session."
-            ),
-        ),
-        anti_patterns=(
-            (
-                "Assuming latest/* remains tied to one repository across the "
-                "whole client session."
-            ),
-            (
-                "Using latest/* as a substitute for starting a fresh run when "
-                "freshness matters."
-            ),
-        ),
-    ),
-    "review_state": MCPHelpTopicSpec(
-        summary=(
-            "Reviewed state in MCP is session-local workflow state. It helps "
-            "long sessions track review progress without modifying canonical "
-            "findings, baseline, or persisted artifacts."
-        ),
-        key_points=(
-            "Review markers are in-memory only.",
-            "They do not change report truth, finding identity, or CI semantics.",
-            "They are useful for triage workflows across long sessions.",
-            (
-                "They should not be interpreted as acceptance, suppression, "
-                "or baseline update."
-            ),
-        ),
-        recommended_tools=(
-            "list_hotspots",
-            "get_finding",
-            "mark_finding_reviewed",
-            "list_reviewed_findings",
-        ),
-        doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
-        warnings=(
-            "Reviewed markers disappear when the MCP session is cleared or restarted.",
-        ),
-        anti_patterns=(
-            "Treating reviewed state as a persistent acceptance signal.",
-            "Assuming reviewed findings are removed from canonical report truth.",
-        ),
-    ),
-    "changed_scope": MCPHelpTopicSpec(
-        summary=(
-            "Changed-scope analysis narrows review to findings that touch a "
-            "selected change set. It is for PR and patch review, not a "
-            "replacement for full canonical analysis."
-        ),
-        key_points=(
-            (
-                "Use analyze_changed_paths with explicit changed_paths or "
-                "git_diff_ref for review-focused runs."
-            ),
-            (
-                "Start with the same conservative profile as the default "
-                "review, then lower thresholds only when you explicitly want "
-                "a higher-sensitivity changed-files pass."
-            ),
-            (
-                "Changed-scope is best for asking what new issues touch "
-                "modified files and whether anything should block CI."
-            ),
-            "Prefer production triage and hotspot views before broad listing.",
-            "If repository-wide truth is needed, run full analysis first.",
-        ),
-        recommended_tools=(
-            "analyze_changed_paths",
-            "get_run_summary",
-            "get_production_triage",
-            "evaluate_gates",
-            "generate_pr_summary",
-        ),
-        doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK),
-        warnings=(
-            (
-                "Changed-scope narrows review focus; it does not replace the "
-                "full canonical report for repository-wide truth."
-            ),
-        ),
-        anti_patterns=(
-            "Using changed-scope as if it were the only source of repository truth.",
-            (
-                "Starting changed-files review with broad listing instead of "
-                "compact triage."
-            ),
-        ),
-    ),
-}
-
-
-def _suggestion_finding_id_payload(suggestion: object) -> str:
-    if not hasattr(suggestion, "finding_family"):
-        return ""
-    family = str(getattr(suggestion, "finding_family", "")).strip()
-    if family == FAMILY_CLONES:
-        kind = str(getattr(suggestion, "finding_kind", "")).strip()
-        subject_key = str(getattr(suggestion, "subject_key", "")).strip()
-        return clone_group_id(kind or CLONE_KIND_SEGMENT, subject_key)
-    if family == FAMILY_STRUCTURAL:
-        return structural_group_id(
-            str(getattr(suggestion, "finding_kind", "")).strip() or CATEGORY_STRUCTURAL,
-            str(getattr(suggestion, "subject_key", "")).strip(),
-        )
-    category = str(getattr(suggestion, "category", "")).strip()
-    subject_key = str(getattr(suggestion, "subject_key", "")).strip()
-    if category == CATEGORY_DEAD_CODE:
-        return dead_code_group_id(subject_key)
-    return design_group_id(
-        category,
-        subject_key or str(getattr(suggestion, "title", "")),
-    )
-
-
-@dataclass(frozen=True, slots=True)
-class _CloneShortIdEntry:
-    canonical_id: str
-    alias: str
-    token: str
-    suffix: str
-
-    def render(self, prefix_length: int) -> str:
-        if prefix_length <= 0:
-            prefix_length = len(self.token)
-        return f"{self.alias}:{self.token[:prefix_length]}{self.suffix}"
-
-
-def _partitioned_short_id(alias: str, remainder: str) -> str:
-    first, _, rest = remainder.partition(":")
-    return f"{alias}:{first}:{rest}" if rest else f"{alias}:{first}"
-
-
-def _clone_short_id_entry_payload(canonical_id: str) -> _CloneShortIdEntry:
-    _prefix, _, remainder = canonical_id.partition(":")
-    clone_kind, _, group_key = remainder.partition(":")
-    hashes = [part for part in group_key.split("|") if part]
-    if clone_kind == "function":
-        fingerprint = hashes[0] if hashes else group_key
-        bucket = ""
-        if "|" in group_key:
-            bucket = "|" + group_key.split("|")[-1]
-        return _CloneShortIdEntry(
-            canonical_id=canonical_id,
-            alias="fn",
-            token=fingerprint,
-            suffix=bucket,
-        )
-    alias = {"block": "blk", "segment": "seg"}.get(clone_kind, "clone")
-    combined = "|".join(hashes) if hashes else group_key
-    token = hashlib.sha256(combined.encode()).hexdigest()
-    return _CloneShortIdEntry(
-        canonical_id=canonical_id,
-        alias=alias,
-        token=token,
-        suffix=f"|x{len(hashes) or 1}",
-    )
-
-
-def _disambiguated_clone_short_ids_payload(
-    canonical_ids: Sequence[str],
-) -> dict[str, str]:
-    clone_entries = [
-        _clone_short_id_entry_payload(canonical_id) for canonical_id in canonical_ids
-    ]
-    max_token_length = max((len(entry.token) for entry in clone_entries), default=0)
-    for prefix_length in range(_SHORT_HASH_ID_LENGTH + 2, max_token_length + 1, 2):
-        candidates = {
-            entry.canonical_id: entry.render(prefix_length) for entry in clone_entries
-        }
-        if len(set(candidates.values())) == len(candidates):
-            return candidates
-    return {
-        entry.canonical_id: entry.render(max_token_length) for entry in clone_entries
-    }
-
-
-def _leaf_symbol_name_payload(value: object) -> str:
-    text = str(value).strip()
-    if not text:
-        return ""
-    if ":" in text:
-        text = text.rsplit(":", maxsplit=1)[-1]
-    if "." in text:
-        text = text.rsplit(".", maxsplit=1)[-1]
-    return text
-
-
-def _base_short_finding_id_payload(canonical_id: str) -> str:
-    prefix, _, remainder = canonical_id.partition(":")
-    if prefix == "clone":
-        return _clone_short_id_entry_payload(canonical_id).render(_SHORT_HASH_ID_LENGTH)
-    if prefix == "structural":
-        finding_kind, _, finding_key = remainder.partition(":")
-        return f"struct:{finding_kind}:{finding_key[:_SHORT_HASH_ID_LENGTH]}"
-    if prefix == "dead_code":
-        return f"dead:{_leaf_symbol_name_payload(remainder)}"
-    if prefix == "design":
-        category, _, subject_key = remainder.partition(":")
-        return f"design:{category}:{_leaf_symbol_name_payload(subject_key)}"
-    return canonical_id
-
-
-def _disambiguated_short_finding_id_payload(canonical_id: str) -> str:
-    prefix, _, remainder = canonical_id.partition(":")
-    if prefix == "clone":
-        return _clone_short_id_entry_payload(canonical_id).render(0)
-    if prefix == "structural":
-        return _partitioned_short_id("struct", remainder)
-    if prefix == "dead_code":
-        return f"dead:{remainder}"
-    if prefix == "design":
-        return _partitioned_short_id("design", remainder)
-    return canonical_id
-
-
-def _json_text_payload(
-    payload: object,
-    *,
-    sort_keys: bool = True,
-) -> str:
-    options = orjson.OPT_INDENT_2
-    if sort_keys:
-        options |= orjson.OPT_SORT_KEYS
-    return orjson.dumps(payload, option=options).decode("utf-8")
-
-
-def _git_diff_lines_payload(
-    *,
-    root_path: Path,
-    git_diff_ref: str,
-) -> tuple[str, ...]:
-    try:
-        validated_ref = validate_git_diff_ref(git_diff_ref)
-    except ValueError as exc:
-        raise MCPGitDiffError(str(exc)) from exc
-    try:
-        completed = subprocess.run(
-            ["git", "diff", "--name-only", validated_ref, "--"],
-            cwd=root_path,
-            check=True,
-            capture_output=True,
-            text=True,
-            timeout=30,
-        )
-    except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
-        raise MCPGitDiffError(
-            f"Unable to resolve changed paths from git diff ref '{validated_ref}'."
-        ) from exc
-    return tuple(
-        sorted({line.strip() for line in completed.stdout.splitlines() if line.strip()})
-    )
-
-
-def _load_report_document_payload(report_json: str) -> dict[str, object]:
-    try:
-        payload = orjson.loads(report_json)
-    except JSONDecodeError as exc:
-        raise MCPServiceError(
-            f"Generated canonical report is not valid JSON: {exc}"
-        ) from exc
-    if not isinstance(payload, dict):
-        raise MCPServiceError("Generated canonical report must be a JSON object.")
-    return dict(payload)
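# Illustrative sketch (hypothetical ids, using only the helpers above): how
# canonical finding ids compress into short handles.
assert _leaf_symbol_name_payload("pkg.module:Class.method") == "method"
assert _base_short_finding_id_payload("dead_code:pkg.mod:Class.helper") == "dead:helper"
assert _partitioned_short_id("struct", "cycle:pkg.a|pkg.b") == "struct:cycle:pkg.a|pkg.b"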
-
-
-def _validated_history_limit(history_limit: int) -> int:
-    if not 1 <= history_limit <= MAX_MCP_HISTORY_LIMIT:
-        raise ValueError(
-            f"history_limit must be between 1 and {MAX_MCP_HISTORY_LIMIT}."
-        )
-    return history_limit
-
-
-class MCPServiceError(RuntimeError):
-    """Base class for CodeClone MCP service errors."""
-
-
-class MCPServiceContractError(MCPServiceError):
-    """Raised when an MCP request violates the CodeClone service contract."""
-
-
-class MCPRunNotFoundError(MCPServiceError):
-    """Raised when a requested MCP run is not available in the in-memory registry."""
-
-
-class MCPFindingNotFoundError(MCPServiceError):
-    """Raised when a requested finding id is not present in the selected run."""
-
-
-class MCPGitDiffError(MCPServiceError):
-    """Raised when changed paths cannot be resolved from a git ref."""
-
-
-class _BufferConsole:
-    def __init__(self) -> None:
-        self.messages: list[str] = []
-
-    def print(self, *objects: object, **_kwargs: object) -> None:
-        text = " ".join(str(obj) for obj in objects).strip()
-        if text:
-            self.messages.append(text)
-
-
-@dataclass(frozen=True, slots=True)
-class MCPAnalysisRequest:
-    root: str | None = None
-    analysis_mode: AnalysisMode = "full"
-    respect_pyproject: bool = True
-    changed_paths: tuple[str, ...] = ()
-    git_diff_ref: str | None = None
-    processes: int | None = None
-    min_loc: int | None = None
-    min_stmt: int | None = None
-    block_min_loc: int | None = None
-    block_min_stmt: int | None = None
-    segment_min_loc: int | None = None
-    segment_min_stmt: int | None = None
-    api_surface: bool | None = None
-    coverage_xml: str | None = None
-    coverage_min: int | None = None
-    complexity_threshold: int | None = None
-    coupling_threshold: int | None = None
-    cohesion_threshold: int | None = None
-    baseline_path: str | None = None
-    metrics_baseline_path: str | None = None
-    max_baseline_size_mb: int | None = None
-    cache_policy: CachePolicy = "reuse"
-    cache_path: str | None = None
-    max_cache_size_mb: int | None = None
-
-
-@dataclass(frozen=True, slots=True)
-class MCPGateRequest:
-    run_id: str | None = None
-    fail_on_new: bool = False
-    fail_threshold: int = -1
-    fail_complexity: int = -1
-    fail_coupling: int = -1
-    fail_cohesion: int = -1
-    fail_cycles: bool = False
-    fail_dead_code: bool = False
-    fail_health: int = -1
-    fail_on_new_metrics: bool = False
-    fail_on_typing_regression: bool = False
-    fail_on_docstring_regression: bool = False
-    fail_on_api_break: bool = False
-    fail_on_untested_hotspots: bool = False
-    min_typing_coverage: int = -1
-    min_docstring_coverage: int = -1
-    coverage_min: int = 50
-
-
-@dataclass(frozen=True, slots=True)
-class MCPRunRecord:
-    run_id: str
-    root: Path
-    request: MCPAnalysisRequest
-    comparison_settings: tuple[object, ...]
-    report_document: dict[str, object]
-    summary: dict[str, object]
-    changed_paths: tuple[str, ...]
-    changed_projection: dict[str, object] | None
-    warnings: tuple[str, ...]
-    failures: tuple[str, ...]
-    func_clones_count: int
-    block_clones_count: int
-    project_metrics: ProjectMetrics | None
-    coverage_join: CoverageJoinResult | None
-    suggestions: tuple[Suggestion, ...]
-    new_func: frozenset[str]
-    new_block: frozenset[str]
-    metrics_diff: MetricsDiff | None
-
-
-class CodeCloneMCPRunStore:
-    def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None:
-        self._history_limit = _validated_history_limit(history_limit)
-        self._lock = RLock()
-        self._records: OrderedDict[str, MCPRunRecord] = OrderedDict()
-        self._latest_run_id: str | None = None
-
-    def register(self, record: MCPRunRecord) -> MCPRunRecord:
-        with self._lock:
-            self._records.pop(record.run_id, None)
-            self._records[record.run_id] = record
-            self._records.move_to_end(record.run_id)
-            self._latest_run_id = record.run_id
-            while len(self._records) > self._history_limit:
-                self._records.popitem(last=False)
-            return record
-
-    def get(self, run_id: str | None = None) -> MCPRunRecord:
-        with self._lock:
-            resolved_run_id = self._resolve_run_id(run_id)
-            if resolved_run_id is None:
-                raise MCPRunNotFoundError("No matching MCP analysis run is available.")
-            return self._records[resolved_run_id]
-
-    def _resolve_run_id(self, run_id: str | None) -> str | None:
-        if run_id is None:
-            return self._latest_run_id
-        if run_id in self._records:
-            return run_id
-        matches = [
-            candidate for candidate in self._records if candidate.startswith(run_id)
-        ]
-        if len(matches) == 1:
-            return matches[0]
-        if len(matches) > 1:
-            raise MCPServiceContractError(
-                f"Run id '{run_id}' is ambiguous in this MCP session."
-            )
-        return None
-
-    def records(self) -> tuple[MCPRunRecord, ...]:
-        with self._lock:
-            return tuple(self._records.values())
-
-    def clear(self) -> tuple[str, ...]:
-        with self._lock:
-            removed_run_ids = tuple(self._records.keys())
-            self._records.clear()
-            self._latest_run_id = None
-            return removed_run_ids
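# Usage sketch (hypothetical run ids, not from the module above): the store is
# a bounded LRU-style registry with prefix lookup. With history_limit=2,
# registering runs "aaaa1111", "bbbb2222", "cccc3333" evicts "aaaa1111";
# get("bbbb") resolves a unique prefix, while a prefix matching several runs
# raises MCPServiceContractError instead of guessing.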
-
-
-class CodeCloneMCPService:
-    def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None:
-        self._runs = CodeCloneMCPRunStore(history_limit=history_limit)
-        self._state_lock = RLock()
-        self._review_state: dict[str, OrderedDict[str, str | None]] = {}
-        self._last_gate_results: dict[str, dict[str, object]] = {}
-        self._spread_max_cache: dict[str, int] = {}
-
-    def analyze_repository(self, request: MCPAnalysisRequest) -> dict[str, object]:
-        self._validate_analysis_request(request)
-        root_path = self._resolve_root(request.root)
-        analysis_started_at_utc = _current_report_timestamp_utc()
-        changed_paths = self._resolve_request_changed_paths(
-            root_path=root_path,
-            changed_paths=request.changed_paths,
-            git_diff_ref=request.git_diff_ref,
-        )
-        args = self._build_args(root_path=root_path, request=request)
-        (
-            baseline_path,
-            baseline_exists,
-            metrics_baseline_path,
-            metrics_baseline_exists,
-            shared_baseline_payload,
-        ) = self._resolve_baseline_inputs(root_path=root_path, args=args)
-        cache_path = self._resolve_cache_path(root_path=root_path, args=args)
-        cache = self._build_cache(
-            root_path=root_path,
-            args=args,
-            cache_path=cache_path,
-            policy=request.cache_policy,
-        )
-        console = _BufferConsole()
-
-        boot = bootstrap(
-            args=args,
-            root=root_path,
-            output_paths=OutputPaths(json=_REPORT_DUMMY_PATH),
-            cache_path=cache_path,
-        )
-        discovery_result = discover(boot=boot, cache=cache)
-        processing_result = process(boot=boot, discovery=discovery_result, cache=cache)
-        analysis_result = analyze(
-            boot=boot,
-            discovery=discovery_result,
-            processing=processing_result,
-        )
-
-        clone_baseline_state = resolve_clone_baseline_state(
-            args=args,
-            baseline_path=baseline_path,
-            baseline_exists=baseline_exists,
-            func_groups=analysis_result.func_groups,
-            block_groups=analysis_result.block_groups,
-            codeclone_version=__version__,
-            console=console,
-            shared_baseline_payload=(
-                shared_baseline_payload
-                if metrics_baseline_path == baseline_path
-                else None
-            ),
-        )
-        metrics_baseline_state = resolve_metrics_baseline_state(
-            args=args,
-            metrics_baseline_path=metrics_baseline_path,
-            metrics_baseline_exists=metrics_baseline_exists,
-            baseline_updated_path=clone_baseline_state.updated_path,
-            project_metrics=analysis_result.project_metrics,
-            console=console,
-            shared_baseline_payload=(
-                shared_baseline_payload
-                if metrics_baseline_path == baseline_path
-                else None
-            ),
-        )
-
-        cache_status, cache_schema_version = resolve_cache_status(cache)
-        report_meta = _build_report_meta(
-            codeclone_version=__version__,
-            scan_root=root_path,
-            baseline_path=baseline_path,
-            baseline=clone_baseline_state.baseline,
-            baseline_loaded=clone_baseline_state.loaded,
-            baseline_status=clone_baseline_state.status.value,
-            cache_path=cache_path,
-            cache_used=cache_status == CacheStatus.OK,
-            cache_status=cache_status.value,
-            cache_schema_version=cache_schema_version,
-            files_skipped_source_io=len(processing_result.source_read_failures),
-            metrics_baseline_path=metrics_baseline_path,
-            metrics_baseline=metrics_baseline_state.baseline,
-            metrics_baseline_loaded=metrics_baseline_state.loaded,
-            metrics_baseline_status=metrics_baseline_state.status.value,
-            health_score=(
-                analysis_result.project_metrics.health.total
-                if analysis_result.project_metrics is not None
-                else None
-            ),
-            health_grade=(
-                analysis_result.project_metrics.health.grade
-                if analysis_result.project_metrics is not None
-                else None
-            ),
-            analysis_mode=request.analysis_mode,
-            metrics_computed=self._metrics_computed(request.analysis_mode),
-            min_loc=_as_int(args.min_loc, DEFAULT_MIN_LOC),
-            min_stmt=_as_int(args.min_stmt, DEFAULT_MIN_STMT),
-            block_min_loc=_as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC),
-            block_min_stmt=_as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT),
-            segment_min_loc=_as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC),
-            segment_min_stmt=_as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT),
-            design_complexity_threshold=_as_int(
-                getattr(
-                    args,
-                    "design_complexity_threshold",
-                    DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
-                ),
-                DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
-            ),
-            design_coupling_threshold=_as_int(
-                getattr(
-                    args,
-                    "design_coupling_threshold",
-                    DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
-                ),
-                DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
-            ),
-            design_cohesion_threshold=_as_int(
-                getattr(
-                    args,
-                    "design_cohesion_threshold",
-                    DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
-                ),
-                DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
-            ),
-            analysis_started_at_utc=analysis_started_at_utc,
-            report_generated_at_utc=_current_report_timestamp_utc(),
-        )
-
-        baseline_for_diff = (
-            clone_baseline_state.baseline
-            if clone_baseline_state.trusted_for_diff
-            else Baseline(baseline_path)
-        )
-        new_func, new_block = baseline_for_diff.diff(
-            analysis_result.func_groups,
-            analysis_result.block_groups,
-        )
-        metrics_diff = None
-        if (
-            analysis_result.project_metrics is not None
-            and metrics_baseline_state.trusted_for_diff
-        ):
-            metrics_diff = metrics_baseline_state.baseline.diff(
-                analysis_result.project_metrics
-            )
-
-        report_artifacts = report(
-            boot=boot,
-            discovery=discovery_result,
-            processing=processing_result,
-            analysis=analysis_result,
-            report_meta=report_meta,
-            new_func=new_func,
-            new_block=new_block,
-            metrics_diff=metrics_diff,
-        )
-
-        report_json = report_artifacts.json
-        if report_json is None:
-            raise MCPServiceError("CodeClone MCP expected a canonical JSON report.")
-        report_document = self._load_report_document(report_json)
-        run_id = self._report_digest(report_document)
-
-        warning_items = set(console.messages)
-        if cache.load_warning:
-            warning_items.add(cache.load_warning)
-        warning_items.update(discovery_result.skipped_warnings)
-        warnings = tuple(sorted(warning_items))
-        failures = tuple(
-            sorted(
-                {
-                    *processing_result.failed_files,
-                    *processing_result.source_read_failures,
-                }
-            )
-        )
-
-        base_summary = self._build_run_summary_payload(
-            run_id=run_id,
-            root_path=root_path,
-            request=request,
-            report_document=report_document,
-            baseline_state=clone_baseline_state,
-            metrics_baseline_state=metrics_baseline_state,
-            cache_status=cache_status,
-            new_func=new_func,
-            new_block=new_block,
-            metrics_diff=metrics_diff,
-            warnings=warnings,
-            failures=failures,
-        )
-        provisional_record = MCPRunRecord(
-            run_id=run_id,
-            root=root_path,
-            request=request,
-            comparison_settings=self._comparison_settings(args=args, request=request),
-            report_document=report_document,
-            summary=base_summary,
-            changed_paths=changed_paths,
-            changed_projection=None,
-            warnings=warnings,
-            failures=failures,
-            func_clones_count=analysis_result.func_clones_count,
-            block_clones_count=analysis_result.block_clones_count,
-            project_metrics=analysis_result.project_metrics,
-            coverage_join=analysis_result.coverage_join,
-            suggestions=analysis_result.suggestions,
-            new_func=frozenset(new_func),
-            new_block=frozenset(new_block),
-            metrics_diff=metrics_diff,
-        )
-        changed_projection = self._build_changed_projection(provisional_record)
-        summary = self._augment_summary_with_changed(
-            summary=base_summary,
-            changed_paths=changed_paths,
-            changed_projection=changed_projection,
-        )
-        record = MCPRunRecord(
-            run_id=run_id,
-            root=root_path,
-            request=request,
-            comparison_settings=self._comparison_settings(args=args, request=request),
-            report_document=report_document,
-            summary=summary,
-            changed_paths=changed_paths,
-            changed_projection=changed_projection,
-            warnings=warnings,
-            failures=failures,
-            func_clones_count=analysis_result.func_clones_count,
-            block_clones_count=analysis_result.block_clones_count,
-            project_metrics=analysis_result.project_metrics,
-            coverage_join=analysis_result.coverage_join,
-            suggestions=analysis_result.suggestions,
-            new_func=frozenset(new_func),
-            new_block=frozenset(new_block),
-            metrics_diff=metrics_diff,
-        )
-        self._runs.register(record)
-        self._prune_session_state()
-        return self._summary_payload(record.summary, record=record)
-
-    def analyze_changed_paths(self, request: MCPAnalysisRequest) -> dict[str, object]:
-        if not request.changed_paths and request.git_diff_ref is None:
-            raise MCPServiceContractError(
-                "analyze_changed_paths requires changed_paths or git_diff_ref."
-            )
-        analysis_summary = self.analyze_repository(request)
-        record = self._runs.get(str(analysis_summary.get("run_id", "")) or None)
-        return self._changed_analysis_payload(record)
-
-    def get_run_summary(self, run_id: str | None = None) -> dict[str, object]:
-        record = self._runs.get(run_id)
-        return self._summary_payload(record.summary, record=record)
-
-    def compare_runs(
-        self,
-        *,
-        run_id_before: str,
-        run_id_after: str | None = None,
-        focus: ComparisonFocus = "all",
-    ) -> dict[str, object]:
-        validated_focus = cast(
-            "ComparisonFocus",
-            self._validate_choice("focus", focus, _VALID_COMPARISON_FOCUS),
-        )
-        before = self._runs.get(run_id_before)
-        after = self._runs.get(run_id_after)
-        before_findings = self._comparison_index(before, focus=validated_focus)
-        after_findings = self._comparison_index(after, focus=validated_focus)
-        before_ids = set(before_findings)
-        after_ids = set(after_findings)
-        regressions = sorted(after_ids - before_ids)
-        improvements = sorted(before_ids - after_ids)
-        common = before_ids & after_ids
-        health_before = self._summary_health_score(before.summary)
-        health_after = self._summary_health_score(after.summary)
-        comparability = self._comparison_scope(before=before, after=after)
-        comparable = bool(comparability["comparable"])
-        health_delta = (
-            health_after - health_before
-            if comparable and health_before is not None and health_after is not None
-            else None
-        )
-        verdict = (
-            self._comparison_verdict(
-                regressions=len(regressions),
-                improvements=len(improvements),
-                health_delta=health_delta,
-            )
-            if comparable
-            else "incomparable"
-        )
-        regressions_payload = (
-            [
-                self._comparison_finding_card(
-                    after,
-                    after_findings[finding_id],
-                )
-                for finding_id in regressions
-            ]
-            if comparable
-            else []
-        )
-        improvements_payload = (
-            [
-                self._comparison_finding_card(
-                    before,
-                    before_findings[finding_id],
-                )
-                for finding_id in improvements
-            ]
-            if comparable
-            else []
-        )
-        payload: dict[str, object] = {
-            "before": {
-                "run_id": self._short_run_id(before.run_id),
-                "health": health_before,
-            },
-            "after": {
-                "run_id": self._short_run_id(after.run_id),
-                "health": health_after,
-            },
-            "comparable": comparable,
-            "health_delta": health_delta,
-            "verdict": verdict,
-            "regressions": regressions_payload,
-            "improvements": improvements_payload,
-            "unchanged": len(common) if comparable else None,
-            "summary": self._comparison_summary_text(
-                comparable=comparable,
-                comparability_reason=str(comparability["reason"]),
-                regressions=len(regressions),
-                improvements=len(improvements),
-                health_delta=health_delta,
-            ),
-        }
-        if not comparable:
-            payload["reason"] = comparability["reason"]
-        return payload
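# Shape sketch of a compare_runs payload (keys as constructed above; values
# hypothetical): {"before": {"run_id": "aaaa1111", "health": 78},
# "after": {"run_id": "bbbb2222", "health": 81}, "comparable": True,
# "health_delta": 3, "verdict": ..., "regressions": [...],
# "improvements": [...], "unchanged": 41, "summary": "..."}. When roots or
# analysis settings differ, "comparable" is False, "verdict" is
# "incomparable", and a "reason" key is added.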
"fail_on_docstring_regression": request.fail_on_docstring_regression, - "fail_on_api_break": request.fail_on_api_break, - "fail_on_untested_hotspots": request.fail_on_untested_hotspots, - "min_typing_coverage": request.min_typing_coverage, - "min_docstring_coverage": request.min_docstring_coverage, - "coverage_min": request.coverage_min, - }, - } - with self._state_lock: - self._last_gate_results[record.run_id] = dict(result) - return result - - def _evaluate_gate_snapshot( - self, - *, - record: MCPRunRecord, - request: MCPGateRequest, - ) -> GatingResult: - reasons: list[str] = [] - if request.fail_on_untested_hotspots: - if record.coverage_join is None: - raise MCPServiceContractError( - "Coverage gating requires a run created with coverage_xml." - ) - if record.coverage_join.status != "ok": - detail = record.coverage_join.invalid_reason or "invalid coverage input" - raise MCPServiceContractError( - "Coverage gating requires a valid Cobertura XML input. " - f"Reason: {detail}" - ) - if record.project_metrics is not None: - metric_reasons = metric_gate_reasons( - project_metrics=record.project_metrics, - coverage_join=record.coverage_join, - metrics_diff=record.metrics_diff, - config=MetricGateConfig( - fail_complexity=request.fail_complexity, - fail_coupling=request.fail_coupling, - fail_cohesion=request.fail_cohesion, - fail_cycles=request.fail_cycles, - fail_dead_code=request.fail_dead_code, - fail_health=request.fail_health, - fail_on_new_metrics=request.fail_on_new_metrics, - fail_on_typing_regression=request.fail_on_typing_regression, - fail_on_docstring_regression=request.fail_on_docstring_regression, - fail_on_api_break=request.fail_on_api_break, - fail_on_untested_hotspots=request.fail_on_untested_hotspots, - min_typing_coverage=request.min_typing_coverage, - min_docstring_coverage=request.min_docstring_coverage, - coverage_min=request.coverage_min, - ), - ) - reasons.extend(f"metric:{reason}" for reason in metric_reasons) - - if request.fail_on_new and (record.new_func or record.new_block): - reasons.append("clone:new") - - total_clone_groups = record.func_clones_count + record.block_clones_count - if 0 <= request.fail_threshold < total_clone_groups: - reasons.append( - f"clone:threshold:{total_clone_groups}:{request.fail_threshold}" - ) - - if reasons: - return GatingResult( - exit_code=int(ExitCode.GATING_FAILURE), - reasons=tuple(reasons), - ) - return GatingResult(exit_code=int(ExitCode.SUCCESS), reasons=()) - - def get_report_section( - self, - *, - run_id: str | None = None, - section: ReportSection = "all", - family: MetricsDetailFamily | None = None, - path: str | None = None, - offset: int = 0, - limit: int = 50, - ) -> dict[str, object]: - validated_section = cast( - "ReportSection", - self._validate_choice("section", section, _VALID_REPORT_SECTIONS), - ) - record = self._runs.get(run_id) - report_document = record.report_document - if validated_section == "all": - return dict(report_document) - if validated_section == "changed": - if record.changed_projection is None: - raise MCPServiceContractError( - "Report section 'changed' is not available in this run." 
-
-    def get_report_section(
-        self,
-        *,
-        run_id: str | None = None,
-        section: ReportSection = "all",
-        family: MetricsDetailFamily | None = None,
-        path: str | None = None,
-        offset: int = 0,
-        limit: int = 50,
-    ) -> dict[str, object]:
-        validated_section = cast(
-            "ReportSection",
-            self._validate_choice("section", section, _VALID_REPORT_SECTIONS),
-        )
-        record = self._runs.get(run_id)
-        report_document = record.report_document
-        if validated_section == "all":
-            return dict(report_document)
-        if validated_section == "changed":
-            if record.changed_projection is None:
-                raise MCPServiceContractError(
-                    "Report section 'changed' is not available in this run."
-                )
-            return dict(record.changed_projection)
-        if validated_section == "metrics":
-            metrics = self._as_mapping(report_document.get("metrics"))
-            return {"summary": dict(self._as_mapping(metrics.get("summary")))}
-        if validated_section == "metrics_detail":
-            metrics = self._as_mapping(report_document.get("metrics"))
-            if not metrics:
-                raise MCPServiceContractError(
-                    "Report section 'metrics_detail' is not available in this run."
-                )
-            validated_family_input = self._validate_optional_choice(
-                "family",
-                family,
-                _VALID_METRICS_DETAIL_FAMILIES,
-            )
-            normalized_family = (
-                _METRICS_DETAIL_FAMILY_ALIASES.get(
-                    str(validated_family_input),
-                    str(validated_family_input),
-                )
-                if validated_family_input is not None
-                else None
-            )
-            validated_family = cast("MetricsDetailFamily | None", normalized_family)
-            return self._metrics_detail_payload(
-                metrics=metrics,
-                family=validated_family,
-                path=path,
-                offset=offset,
-                limit=limit,
-            )
-        if validated_section == "derived":
-            return self._derived_section_payload(record)
-        payload = report_document.get(validated_section)
-        if not isinstance(payload, Mapping):
-            raise MCPServiceContractError(
-                f"Report section '{validated_section}' is not available in this run."
-            )
-        return dict(payload)
-
-    def list_findings(
-        self,
-        *,
-        run_id: str | None = None,
-        family: FindingFamilyFilter = "all",
-        category: str | None = None,
-        severity: str | None = None,
-        source_kind: str | None = None,
-        novelty: FindingNoveltyFilter = "all",
-        sort_by: FindingSort = "default",
-        detail_level: DetailLevel = "summary",
-        changed_paths: Sequence[str] = (),
-        git_diff_ref: str | None = None,
-        exclude_reviewed: bool = False,
-        offset: int = 0,
-        limit: int = 50,
-        max_results: int | None = None,
-    ) -> dict[str, object]:
-        validated_family = cast(
-            "FindingFamilyFilter",
-            self._validate_choice("family", family, _VALID_FINDING_FAMILIES),
-        )
-        validated_novelty = cast(
-            "FindingNoveltyFilter",
-            self._validate_choice("novelty", novelty, _VALID_FINDING_NOVELTY),
-        )
-        validated_sort = cast(
-            "FindingSort",
-            self._validate_choice("sort_by", sort_by, _VALID_FINDING_SORT),
-        )
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        validated_severity = self._validate_optional_choice(
-            "severity",
-            severity,
-            _VALID_SEVERITIES,
-        )
-        record = self._runs.get(run_id)
-        paths_filter = self._resolve_query_changed_paths(
-            record=record,
-            changed_paths=changed_paths,
-            git_diff_ref=git_diff_ref,
-        )
-        normalized_limit = max(
-            1,
-            min(max_results if max_results is not None else limit, 200),
-        )
-        filtered = self._query_findings(
-            record=record,
-            family=validated_family,
-            category=category,
-            severity=validated_severity,
-            source_kind=source_kind,
-            novelty=validated_novelty,
-            sort_by=validated_sort,
-            detail_level=validated_detail,
-            changed_paths=paths_filter,
-            exclude_reviewed=exclude_reviewed,
-        )
-        total = len(filtered)
-        normalized_offset = max(0, offset)
-        items = filtered[normalized_offset : normalized_offset + normalized_limit]
-        next_offset = normalized_offset + len(items)
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "detail_level": validated_detail,
-            "sort_by": validated_sort,
-            "changed_paths": list(paths_filter),
-            "offset": normalized_offset,
-            "limit": normalized_limit,
-            "returned": len(items),
-            "total": total,
-            "next_offset": next_offset if next_offset < total else None,
-            "items": items,
-        }
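# Pagination sketch (follows directly from the return payload above): with
# total=120 findings, offset=100 and limit=50 return 20 items and
# next_offset=None; offset=0 returns 50 items and next_offset=50. The
# effective limit is clamped to 200 whether it arrives via limit or max_results.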
-
-    def get_finding(
-        self,
-        *,
-        finding_id: str,
-        run_id: str | None = None,
-        detail_level: DetailLevel = "normal",
-    ) -> dict[str, object]:
-        record = self._runs.get(run_id)
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        canonical_id = self._resolve_canonical_finding_id(record, finding_id)
-        for finding in self._base_findings(record):
-            if str(finding.get("id")) == canonical_id:
-                return self._decorate_finding(
-                    record,
-                    finding,
-                    detail_level=validated_detail,
-                )
-        raise MCPFindingNotFoundError(
-            f"Finding id '{finding_id}' was not found in run "
-            f"'{self._short_run_id(record.run_id)}'."
-        )
-
-    def get_remediation(
-        self,
-        *,
-        finding_id: str,
-        run_id: str | None = None,
-        detail_level: DetailLevel = "normal",
-    ) -> dict[str, object]:
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        record = self._runs.get(run_id)
-        canonical_id = self._resolve_canonical_finding_id(record, finding_id)
-        finding = self.get_finding(
-            finding_id=canonical_id,
-            run_id=record.run_id,
-            detail_level="full",
-        )
-        remediation = self._as_mapping(finding.get("remediation"))
-        if not remediation:
-            raise MCPFindingNotFoundError(
-                f"Finding id '{finding_id}' does not expose remediation guidance."
-            )
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "finding_id": self._short_finding_id(record, canonical_id),
-            "detail_level": validated_detail,
-            "remediation": self._project_remediation(
-                remediation,
-                detail_level=validated_detail,
-            ),
-        }
-
-    def list_hotspots(
-        self,
-        *,
-        kind: HotlistKind,
-        run_id: str | None = None,
-        detail_level: DetailLevel = "summary",
-        changed_paths: Sequence[str] = (),
-        git_diff_ref: str | None = None,
-        exclude_reviewed: bool = False,
-        limit: int = 10,
-        max_results: int | None = None,
-    ) -> dict[str, object]:
-        validated_kind = cast(
-            "HotlistKind",
-            self._validate_choice("kind", kind, _VALID_HOTLIST_KINDS),
-        )
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        record = self._runs.get(run_id)
-        paths_filter = self._resolve_query_changed_paths(
-            record=record,
-            changed_paths=changed_paths,
-            git_diff_ref=git_diff_ref,
-        )
-        rows = self._hotspot_rows(
-            record=record,
-            kind=validated_kind,
-            detail_level=validated_detail,
-            changed_paths=paths_filter,
-            exclude_reviewed=exclude_reviewed,
-        )
-        normalized_limit = max(
-            1,
-            min(max_results if max_results is not None else limit, 50),
-        )
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "kind": validated_kind,
-            "detail_level": validated_detail,
-            "changed_paths": list(paths_filter),
-            "returned": min(len(rows), normalized_limit),
-            "total": len(rows),
-            "items": [dict(self._as_mapping(item)) for item in rows[:normalized_limit]],
-        }
-
-    def get_production_triage(
-        self,
-        *,
-        run_id: str | None = None,
-        max_hotspots: int = 3,
-        max_suggestions: int = 3,
-    ) -> dict[str, object]:
-        record = self._runs.get(run_id)
-        summary = self._summary_payload(record.summary, record=record)
-        findings = self._base_findings(record)
-        findings_breakdown = self._source_kind_breakdown(
-            self._finding_source_kind(finding) for finding in findings
-        )
-        suggestion_rows = self._triage_suggestion_rows(record)
-        suggestion_breakdown = self._source_kind_breakdown(
-            row.get("source_kind") for row in suggestion_rows
-        )
-        hotspot_limit = max(1, min(max_hotspots, 10))
-        suggestion_limit = max(1, min(max_suggestions, 10))
-        production_hotspots = self._hotspot_rows(
-            record=record,
-            kind="production_hotspots",
-            detail_level="summary",
-            changed_paths=(),
-            exclude_reviewed=False,
-        )
-        production_suggestions = [
-            dict(row)
-            for row in suggestion_rows
-            if str(row.get("source_kind", "")) == SOURCE_KIND_PRODUCTION
-        ]
-        payload: dict[str, object] = {
-            "run_id": self._short_run_id(record.run_id),
-            "focus": _FOCUS_PRODUCTION,
-            "health_scope": _HEALTH_SCOPE_REPOSITORY,
-            "baseline": dict(self._as_mapping(summary.get("baseline"))),
-            "health": dict(self._summary_health_payload(summary)),
-            "cache": dict(self._as_mapping(summary.get("cache"))),
-            "findings": {
-                "total": len(findings),
-                "by_source_kind": findings_breakdown,
-                "new_by_source_kind": dict(
-                    self._as_mapping(
-                        self._as_mapping(summary.get("findings")).get(
-                            "new_by_source_kind"
-                        )
-                    )
-                ),
-                "outside_focus": len(findings)
-                - findings_breakdown[SOURCE_KIND_PRODUCTION],
-            },
-            "top_hotspots": {
-                "kind": "production_hotspots",
-                "available": len(production_hotspots),
-                "returned": min(len(production_hotspots), hotspot_limit),
-                "items": [
-                    dict(self._as_mapping(item))
-                    for item in production_hotspots[:hotspot_limit]
-                ],
-            },
-            "suggestions": {
-                "total": len(suggestion_rows),
-                "by_source_kind": suggestion_breakdown,
-                "outside_focus": len(suggestion_rows)
-                - suggestion_breakdown[SOURCE_KIND_PRODUCTION],
-            },
-            "top_suggestions": {
-                "available": len(production_suggestions),
-                "returned": min(len(production_suggestions), suggestion_limit),
-                "items": production_suggestions[:suggestion_limit],
-            },
-        }
-        analysis_profile = self._summary_analysis_profile_payload(summary)
-        if analysis_profile:
-            payload["analysis_profile"] = analysis_profile
-        coverage_join = self._summary_coverage_join_payload(record)
-        if coverage_join:
-            payload["coverage_join"] = coverage_join
-        return payload
-
-    def get_help(
-        self,
-        *,
-        topic: HelpTopic,
-        detail: HelpDetail = "compact",
-    ) -> dict[str, object]:
-        validated_topic = cast(
-            "HelpTopic",
-            self._validate_choice("topic", topic, _VALID_HELP_TOPICS),
-        )
-        validated_detail = cast(
-            "HelpDetail",
-            self._validate_choice("detail", detail, _VALID_HELP_DETAILS),
-        )
-        spec = _HELP_TOPIC_SPECS[validated_topic]
-        payload: dict[str, object] = {
-            "topic": validated_topic,
-            "detail": validated_detail,
-            "summary": spec.summary,
-            "key_points": list(spec.key_points),
-            "recommended_tools": list(spec.recommended_tools),
-            "doc_links": [
-                {"title": title, "url": url} for title, url in spec.doc_links
-            ],
-        }
-        if validated_detail == "normal":
-            if spec.warnings:
-                payload["warnings"] = list(spec.warnings)
-            if spec.anti_patterns:
-                payload["anti_patterns"] = list(spec.anti_patterns)
-        return payload
-
-    def generate_pr_summary(
-        self,
-        *,
-        run_id: str | None = None,
-        changed_paths: Sequence[str] = (),
-        git_diff_ref: str | None = None,
-        format: PRSummaryFormat = "markdown",
-    ) -> dict[str, object]:
-        output_format = cast(
-            "PRSummaryFormat",
-            self._validate_choice("format", format, _VALID_PR_SUMMARY_FORMATS),
-        )
-        record = self._runs.get(run_id)
-        paths_filter = self._resolve_query_changed_paths(
-            record=record,
-            changed_paths=changed_paths,
-            git_diff_ref=git_diff_ref,
-            prefer_record_paths=True,
-        )
-        changed_items = self._query_findings(
-            record=record,
-            detail_level="summary",
-            changed_paths=paths_filter,
-        )
-        previous = self._previous_run_for_root(record)
-        resolved: list[dict[str, object]] = []
-        if previous is not None:
-            compare_payload = self.compare_runs(
-                run_id_before=previous.run_id,
-                run_id_after=record.run_id,
-                focus="all",
-            )
-            resolved = cast("list[dict[str, object]]", compare_payload["improvements"])
-        with self._state_lock:
-            gate_result = dict(
-                self._last_gate_results.get(
-                    record.run_id,
-                    {"would_fail": False, "reasons": []},
-                )
-            )
-        verdict = self._changed_verdict(
-            changed_projection={
-                "total": len(changed_items),
-                "new": sum(
-                    1 for item in changed_items if str(item.get("novelty", "")) == "new"
-                ),
-            },
-            health_delta=self._summary_health_delta(record.summary),
-        )
-        payload: dict[str, object] = {
-            "run_id": self._short_run_id(record.run_id),
-            "changed_files": len(paths_filter),
-            "health": self._summary_health_payload(record.summary),
-            "health_delta": self._summary_health_delta(record.summary),
-            "verdict": verdict,
-            "new_findings_in_changed_files": changed_items,
-            "resolved": resolved,
-            "blocking_gates": list(cast(Sequence[str], gate_result.get("reasons", []))),
-        }
-        if output_format == "json":
-            return payload
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "format": output_format,
-            "content": self._render_pr_summary_markdown(payload),
-        }
-
-    def mark_finding_reviewed(
-        self,
-        *,
-        finding_id: str,
-        run_id: str | None = None,
-        note: str | None = None,
-    ) -> dict[str, object]:
-        record = self._runs.get(run_id)
-        canonical_id = self._resolve_canonical_finding_id(record, finding_id)
-        self.get_finding(
-            finding_id=canonical_id,
-            run_id=record.run_id,
-            detail_level="normal",
-        )
-        with self._state_lock:
-            review_map = self._review_state.setdefault(record.run_id, OrderedDict())
-            review_map[canonical_id] = (
-                note.strip() if isinstance(note, str) and note.strip() else None
-            )
-            review_map.move_to_end(canonical_id)
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "finding_id": self._short_finding_id(record, canonical_id),
-            "reviewed": True,
-            "note": review_map[canonical_id],
-            "reviewed_count": len(review_map),
-        }
-
-    def list_reviewed_findings(
-        self,
-        *,
-        run_id: str | None = None,
-    ) -> dict[str, object]:
-        record = self._runs.get(run_id)
-        with self._state_lock:
-            review_items = tuple(
-                self._review_state.get(record.run_id, OrderedDict()).items()
-            )
-        items = []
-        for finding_id, note in review_items:
-            try:
-                finding = self.get_finding(finding_id=finding_id, run_id=record.run_id)
-            except MCPFindingNotFoundError:
-                continue
-            items.append(
-                {
-                    "finding_id": self._short_finding_id(record, finding_id),
-                    "note": note,
-                    "finding": self._project_finding_detail(
-                        record,
-                        finding,
-                        detail_level="summary",
-                    ),
-                }
-            )
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "reviewed_count": len(items),
-            "items": items,
-        }
-
-    def clear_session_runs(self) -> dict[str, object]:
-        removed_run_ids = self._runs.clear()
-        with self._state_lock:
-            cleared_review_entries = sum(
-                len(entries) for entries in self._review_state.values()
-            )
-            cleared_gate_results = len(self._last_gate_results)
-            cleared_spread_cache_entries = len(self._spread_max_cache)
-            self._review_state.clear()
-            self._last_gate_results.clear()
-            self._spread_max_cache.clear()
-        return {
-            "cleared_runs": len(removed_run_ids),
-            "cleared_run_ids": [
-                self._short_run_id(run_id) for run_id in removed_run_ids
-            ],
-            "cleared_review_entries": cleared_review_entries,
-            "cleared_gate_results": cleared_gate_results,
-            "cleared_spread_cache_entries": cleared_spread_cache_entries,
-        }
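# Session-state sketch (hypothetical short id): review markers round-trip
# through the in-memory maps only, e.g.
# service.mark_finding_reviewed(finding_id="dead:helper", note="accepted")
# then service.list_reviewed_findings()["reviewed_count"] == 1, and
# clear_session_runs() drops runs, review notes, gate results, and the
# spread cache together.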
-
-    def check_complexity(
-        self,
-        *,
-        run_id: str | None = None,
-        root: str | None = None,
-        path: str | None = None,
-        min_complexity: int | None = None,
-        max_results: int = 10,
-        detail_level: DetailLevel = "summary",
-    ) -> dict[str, object]:
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        record = self._resolve_granular_record(
-            run_id=run_id,
-            root=root,
-            analysis_mode="full",
-        )
-        findings = self._query_findings(
-            record=record,
-            family="design",
-            category=CATEGORY_COMPLEXITY,
-            detail_level=validated_detail,
-            changed_paths=self._path_filter_tuple(path),
-            sort_by="priority",
-        )
-        if min_complexity is not None:
-            findings = [
-                finding
-                for finding in findings
-                if _as_int(
-                    self._as_mapping(finding.get("facts")).get(
-                        "cyclomatic_complexity",
-                        0,
-                    )
-                )
-                >= min_complexity
-            ]
-        return self._granular_payload(
-            record=record,
-            check="complexity",
-            items=findings,
-            detail_level=validated_detail,
-            max_results=max_results,
-            path=path,
-            threshold_context=self._design_threshold_context(
-                record=record,
-                check="complexity",
-                path=path,
-                items=findings,
-                requested_min=min_complexity,
-            ),
-        )
-
-    def check_clones(
-        self,
-        *,
-        run_id: str | None = None,
-        root: str | None = None,
-        path: str | None = None,
-        clone_type: str | None = None,
-        source_kind: str | None = None,
-        max_results: int = 10,
-        detail_level: DetailLevel = "summary",
-    ) -> dict[str, object]:
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        record = self._resolve_granular_record(
-            run_id=run_id,
-            root=root,
-            analysis_mode="clones_only",
-        )
-        findings = self._query_findings(
-            record=record,
-            family="clone",
-            source_kind=source_kind,
-            detail_level=validated_detail,
-            changed_paths=self._path_filter_tuple(path),
-            sort_by="priority",
-        )
-        if clone_type is not None:
-            findings = [
-                finding
-                for finding in findings
-                if str(finding.get("clone_type", "")).strip() == clone_type
-            ]
-        return self._granular_payload(
-            record=record,
-            check="clones",
-            items=findings,
-            detail_level=validated_detail,
-            max_results=max_results,
-            path=path,
-        )
-
-    def check_coupling(
-        self,
-        *,
-        run_id: str | None = None,
-        root: str | None = None,
-        path: str | None = None,
-        max_results: int = 10,
-        detail_level: DetailLevel = "summary",
-    ) -> dict[str, object]:
-        return self._check_design_metric(
-            run_id=run_id,
-            root=root,
-            path=path,
-            max_results=max_results,
-            detail_level=detail_level,
-            category=CATEGORY_COUPLING,
-            check="coupling",
-        )
-
-    def check_cohesion(
-        self,
-        *,
-        run_id: str | None = None,
-        root: str | None = None,
-        path: str | None = None,
-        max_results: int = 10,
-        detail_level: DetailLevel = "summary",
-    ) -> dict[str, object]:
-        return self._check_design_metric(
-            run_id=run_id,
-            root=root,
-            path=path,
-            max_results=max_results,
-            detail_level=detail_level,
-            category=CATEGORY_COHESION,
-            check="cohesion",
-        )
-
-    def _check_design_metric(
-        self,
-        *,
-        run_id: str | None,
-        root: str | None,
-        path: str | None,
-        max_results: int,
-        detail_level: DetailLevel,
-        category: str,
-        check: str,
-    ) -> dict[str, object]:
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        record = self._resolve_granular_record(
-            run_id=run_id,
-            root=root,
-            analysis_mode="full",
-        )
-        findings = self._query_findings(
-            record=record,
-            family="design",
-            category=category,
-            detail_level=validated_detail,
-            changed_paths=self._path_filter_tuple(path),
-            sort_by="priority",
-        )
-        return self._granular_payload(
-            record=record,
-            check=check,
-            items=findings,
-            detail_level=validated_detail,
-            max_results=max_results,
-            path=path,
-            threshold_context=self._design_threshold_context(
-                record=record,
-                check=check,
-                path=path,
-                items=findings,
-            ),
-        )
-
-    def check_dead_code(
-        self,
-        *,
-        run_id: str | None = None,
-        root: str | None = None,
-        path: str | None = None,
-        min_severity: str | None = None,
-        max_results: int = 10,
-        detail_level: DetailLevel = "summary",
-    ) -> dict[str, object]:
-        validated_detail = cast(
-            "DetailLevel",
-            self._validate_choice("detail_level", detail_level, _VALID_DETAIL_LEVELS),
-        )
-        validated_min_severity = self._validate_optional_choice(
-            "min_severity",
-            min_severity,
-            _VALID_SEVERITIES,
-        )
-        record = self._resolve_granular_record(
-            run_id=run_id,
-            root=root,
-            analysis_mode="full",
-        )
-        findings = self._query_findings(
-            record=record,
-            family="dead_code",
-            detail_level=validated_detail,
-            changed_paths=self._path_filter_tuple(path),
-            sort_by="priority",
-        )
-        if validated_min_severity is not None:
-            findings = [
-                finding
-                for finding in findings
-                if self._severity_rank(str(finding.get("severity", "")))
-                >= self._severity_rank(validated_min_severity)
-            ]
-        return self._granular_payload(
-            record=record,
-            check="dead_code",
-            items=findings,
-            detail_level=validated_detail,
-            max_results=max_results,
-            path=path,
-        )
-
-    def read_resource(self, uri: str) -> str:
-        if uri == "codeclone://schema":
-            return _json_text_payload(self._schema_resource_payload())
-        if uri == "codeclone://latest/triage":
-            latest = self._runs.get()
-            return _json_text_payload(self.get_production_triage(run_id=latest.run_id))
-        latest_prefix = "codeclone://latest/"
-        run_prefix = "codeclone://runs/"
-        if uri.startswith(latest_prefix):
-            latest = self._runs.get()
-            suffix = uri[len(latest_prefix) :]
-            return self._render_resource(latest, suffix)
-        if not uri.startswith(run_prefix):
-            raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}")
-        remainder = uri[len(run_prefix) :]
-        run_id, sep, suffix = remainder.partition("/")
-        if not sep:
-            raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}")
-        record = self._runs.get(run_id)
-        return self._render_resource(record, suffix)
-
-    def _render_resource(self, record: MCPRunRecord, suffix: str) -> str:
-        if suffix == "summary":
-            return _json_text_payload(
-                self._summary_payload(record.summary, record=record)
-            )
-        if suffix == "triage":
-            raise MCPServiceContractError(
-                "Production triage is exposed only as codeclone://latest/triage."
-            )
-        if suffix == "health":
-            return _json_text_payload(self._summary_health_payload(record.summary))
-        if suffix == "gates":
-            with self._state_lock:
-                gate_result = self._last_gate_results.get(record.run_id)
-            if gate_result is None:
-                raise MCPServiceContractError(
-                    "No gate evaluation result is available in this MCP session."
-                )
-            return _json_text_payload(gate_result)
-        if suffix == "changed":
-            if record.changed_projection is None:
-                raise MCPServiceContractError(
-                    "Changed-findings projection is not available in this run."
-                )
-            return _json_text_payload(record.changed_projection)
-        if suffix == "schema":
-            return _json_text_payload(self._schema_resource_payload())
-        if suffix == "report.json":
-            return _json_text_payload(record.report_document, sort_keys=False)
-        if suffix == "overview":
-            return _json_text_payload(
-                self.list_hotspots(kind="highest_spread", run_id=record.run_id)
-            )
-        finding_prefix = "findings/"
-        if suffix.startswith(finding_prefix):
-            finding_id = suffix[len(finding_prefix) :]
-            return _json_text_payload(
-                self.get_finding(run_id=record.run_id, finding_id=finding_id)
-            )
-        raise MCPServiceContractError(
-            f"Unsupported CodeClone resource suffix '{suffix}'."
-        )
-
-    def _resolve_request_changed_paths(
-        self,
-        *,
-        root_path: Path,
-        changed_paths: Sequence[str],
-        git_diff_ref: str | None,
-    ) -> tuple[str, ...]:
-        if changed_paths and git_diff_ref is not None:
-            raise MCPServiceContractError(
-                "Provide changed_paths or git_diff_ref, not both."
-            )
-        if git_diff_ref is not None:
-            return self._git_diff_paths(root_path=root_path, git_diff_ref=git_diff_ref)
-        if not changed_paths:
-            return ()
-        return self._normalize_changed_paths(root_path=root_path, paths=changed_paths)
-
-    def _resolve_query_changed_paths(
-        self,
-        *,
-        record: MCPRunRecord,
-        changed_paths: Sequence[str],
-        git_diff_ref: str | None,
-        prefer_record_paths: bool = False,
-    ) -> tuple[str, ...]:
-        if changed_paths or git_diff_ref is not None:
-            return self._resolve_request_changed_paths(
-                root_path=record.root,
-                changed_paths=changed_paths,
-                git_diff_ref=git_diff_ref,
-            )
-        if prefer_record_paths:
-            return record.changed_paths
-        return ()
-
-    def _normalize_changed_paths(
-        self,
-        *,
-        root_path: Path,
-        paths: Sequence[str],
-    ) -> tuple[str, ...]:
-        normalized: set[str] = set()
-        for raw_path in paths:
-            candidate = Path(str(raw_path)).expanduser()
-            if candidate.is_absolute():
-                try:
-                    relative = candidate.resolve().relative_to(root_path)
-                except (OSError, ValueError) as exc:
-                    raise MCPServiceContractError(
-                        f"Changed path '{raw_path}' is outside root '{root_path}'."
-                    ) from exc
-                normalized.add(relative.as_posix())
-                continue
-            cleaned = self._normalize_relative_path(candidate.as_posix())
-            if cleaned:
-                normalized.add(cleaned)
-        return tuple(sorted(normalized))
-
-    def _git_diff_paths(
-        self,
-        *,
-        root_path: Path,
-        git_diff_ref: str,
-    ) -> tuple[str, ...]:
-        lines = _git_diff_lines_payload(
-            root_path=root_path,
-            git_diff_ref=git_diff_ref,
-        )
-        return self._normalize_changed_paths(root_path=root_path, paths=lines)
-
-    def _prune_session_state(self) -> None:
-        active_run_ids = {record.run_id for record in self._runs.records()}
-        with self._state_lock:
-            for state_map in (
-                self._review_state,
-                self._last_gate_results,
-                self._spread_max_cache,
-            ):
-                stale_run_ids = [
-                    run_id for run_id in state_map if run_id not in active_run_ids
-                ]
-                for run_id in stale_run_ids:
-                    state_map.pop(run_id, None)
-
-    def _summary_health_score(self, summary: Mapping[str, object]) -> int | None:
-        health = self._summary_health_payload(summary)
-        if health.get("available") is False:
-            return None
-        score = health.get("score", 0)
-        return _as_int(score, 0)
-
-    def _summary_health_delta(self, summary: Mapping[str, object]) -> int | None:
-        if self._summary_health_payload(summary).get("available") is False:
-            return None
-        metrics_diff = self._as_mapping(summary.get("metrics_diff"))
-        value = metrics_diff.get("health_delta", 0)
-        return _as_int(value, 0)
-
-    def _summary_health_payload(
-        self,
-        summary: Mapping[str, object],
-    ) -> dict[str, object]:
-        if str(summary.get("analysis_mode", "")) == "clones_only":
-            return {"available": False, "reason": "metrics_skipped"}
-        health = dict(self._as_mapping(summary.get("health")))
-        if health:
-            return health
-        return {"available": False, "reason": "unavailable"}
-
-    @staticmethod
-    def _short_run_id(run_id: str) -> str:
-        return run_id[:_SHORT_RUN_ID_LENGTH]
-
-    def _finding_id_maps(
-        self,
-        record: MCPRunRecord,
-    ) -> tuple[dict[str, str], dict[str, str]]:
-        canonical_ids = sorted(
-            str(finding.get("id", ""))
-            for finding in self._base_findings(record)
-            if str(finding.get("id", ""))
-        )
-        base_ids = {
-            canonical_id: self._base_short_finding_id(canonical_id)
-            for canonical_id in canonical_ids
-        }
-        grouped: dict[str, list[str]] = {}
-        for canonical_id, short_id in base_ids.items():
-            grouped.setdefault(short_id, []).append(canonical_id)
-        canonical_to_short: dict[str, str] = {}
-        short_to_canonical: dict[str, str] = {}
-        for short_id, group in grouped.items():
-            if len(group) == 1:
-                canonical_id = group[0]
-                canonical_to_short[canonical_id] = short_id
-                short_to_canonical[short_id] = canonical_id
-                continue
-            disambiguated_ids = self._disambiguated_short_finding_ids(group)
-            for canonical_id, disambiguated in disambiguated_ids.items():
-                canonical_to_short[canonical_id] = disambiguated
-                short_to_canonical[disambiguated] = canonical_id
-        return canonical_to_short, short_to_canonical
-
-    @staticmethod
-    def _base_short_finding_id(canonical_id: str) -> str:
-        return _base_short_finding_id_payload(canonical_id)
-
-    @staticmethod
-    def _disambiguated_short_finding_id(canonical_id: str) -> str:
-        return _disambiguated_short_finding_id_payload(canonical_id)
len(set(clone_short_ids.values())) == len(clone_short_ids): - return clone_short_ids - return { - canonical_id: self._disambiguated_short_finding_id(canonical_id) - for canonical_id in canonical_ids - } - - def _short_finding_id( - self, - record: MCPRunRecord, - canonical_id: str, - ) -> str: - canonical_to_short, _short_to_canonical = self._finding_id_maps(record) - return canonical_to_short.get(canonical_id, canonical_id) - - def _resolve_canonical_finding_id( - self, - record: MCPRunRecord, - finding_id: str, - ) -> str: - canonical_to_short, short_to_canonical = self._finding_id_maps(record) - if finding_id in canonical_to_short: - return finding_id - canonical = short_to_canonical.get(finding_id) - if canonical is not None: - return canonical - raise MCPFindingNotFoundError( - f"Finding id '{finding_id}' was not found in run " - f"'{self._short_run_id(record.run_id)}'." - ) - - def _leaf_symbol_name(self, value: object) -> str: - return _leaf_symbol_name_payload(value) - - @staticmethod - def _comparison_settings( - *, - args: Namespace, - request: MCPAnalysisRequest, - ) -> tuple[object, ...]: - return ( - request.analysis_mode, - _as_int(args.min_loc, DEFAULT_MIN_LOC), - _as_int(args.min_stmt, DEFAULT_MIN_STMT), - _as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC), - _as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT), - _as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC), - _as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT), - _as_int( - args.design_complexity_threshold, - DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - ), - _as_int( - args.design_coupling_threshold, - DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - ), - _as_int( - args.design_cohesion_threshold, - DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - ), - ) - - @staticmethod - def _comparison_scope( - *, - before: MCPRunRecord, - after: MCPRunRecord, - ) -> dict[str, object]: - same_root = before.root == after.root - same_analysis_settings = before.comparison_settings == after.comparison_settings - if same_root and same_analysis_settings: - reason = "comparable" - elif not same_root and not same_analysis_settings: - reason = "different_root_and_analysis_settings" - elif not same_root: - reason = "different_root" - else: - reason = "different_analysis_settings" - return { - "comparable": same_root and same_analysis_settings, - "same_root": same_root, - "same_analysis_settings": same_analysis_settings, - "reason": reason, - } - - @staticmethod - def _severity_rank(severity: str) -> int: - return { - SEVERITY_CRITICAL: 3, - SEVERITY_WARNING: 2, - SEVERITY_INFO: 1, - }.get(severity, 0) - - def _path_filter_tuple(self, path: str | None) -> tuple[str, ...]: - if not path: - return () - cleaned = self._normalize_relative_path(Path(path).as_posix()) - return (cleaned,) if cleaned else () - - def _normalize_relative_path(self, path: str) -> str: - cleaned = path.strip() - if cleaned == ".": - return "" - if cleaned.startswith("./"): - cleaned = cleaned[2:] - cleaned = cleaned.rstrip("/") - if ".." 
in Path(cleaned).parts: - raise MCPServiceContractError(f"path traversal not allowed: {path}") - return cleaned - - def _previous_run_for_root(self, record: MCPRunRecord) -> MCPRunRecord | None: - previous: MCPRunRecord | None = None - for item in self._runs.records(): - if item.run_id == record.run_id: - return previous - if item.root == record.root: - previous = item - return None - - @staticmethod - def _record_supports_analysis_mode( - record: MCPRunRecord, - *, - analysis_mode: AnalysisMode, - ) -> bool: - record_mode = record.request.analysis_mode - if analysis_mode == "clones_only": - return record_mode in {"clones_only", "full"} - return record_mode == "full" - - def _latest_compatible_record( - self, - *, - analysis_mode: AnalysisMode, - root_path: Path | None = None, - ) -> MCPRunRecord | None: - for item in reversed(self._runs.records()): - if root_path is not None and item.root != root_path: - continue - if self._record_supports_analysis_mode( - item, - analysis_mode=analysis_mode, - ): - return item - return None - - def _resolve_granular_record( - self, - *, - run_id: str | None, - root: str | None, - analysis_mode: AnalysisMode, - ) -> MCPRunRecord: - if run_id is not None: - record = self._runs.get(run_id) - if self._record_supports_analysis_mode(record, analysis_mode=analysis_mode): - return record - raise MCPServiceContractError( - "Selected MCP run is not compatible with this check. " - f"Call analyze_repository(root='{record.root}', " - "analysis_mode='full') first." - ) - root_path = self._resolve_optional_root(root) - latest_record = self._latest_compatible_record( - analysis_mode=analysis_mode, - root_path=root_path, - ) - if latest_record is not None: - return latest_record - if root_path is not None: - raise MCPRunNotFoundError( - f"No compatible MCP analysis run is available for root: {root_path}. " - f"Call analyze_repository(root='{root_path}') or " - f"analyze_changed_paths(root='{root_path}', changed_paths=[...]) first." - ) - raise MCPRunNotFoundError( - "No compatible MCP analysis run is available. " - "Call analyze_repository(root='/path/to/repo') or " - "analyze_changed_paths(root='/path/to/repo', changed_paths=[...]) first." 
-        )
-
-    def _base_findings(self, record: MCPRunRecord) -> list[dict[str, object]]:
-        report_document = record.report_document
-        findings = self._as_mapping(report_document.get("findings"))
-        groups = self._as_mapping(findings.get("groups"))
-        clone_groups = self._as_mapping(groups.get(FAMILY_CLONES))
-        return [
-            *self._dict_list(clone_groups.get("functions")),
-            *self._dict_list(clone_groups.get("blocks")),
-            *self._dict_list(clone_groups.get("segments")),
-            *self._dict_list(
-                self._as_mapping(groups.get(FAMILY_STRUCTURAL)).get("groups")
-            ),
-            *self._dict_list(
-                self._as_mapping(groups.get(FAMILY_DEAD_CODE)).get("groups")
-            ),
-            *self._dict_list(self._as_mapping(groups.get(FAMILY_DESIGN)).get("groups")),
-        ]
-
-    def _query_findings(
-        self,
-        *,
-        record: MCPRunRecord,
-        family: FindingFamilyFilter = "all",
-        category: str | None = None,
-        severity: str | None = None,
-        source_kind: str | None = None,
-        novelty: FindingNoveltyFilter = "all",
-        sort_by: FindingSort = "default",
-        detail_level: DetailLevel = "normal",
-        changed_paths: Sequence[str] = (),
-        exclude_reviewed: bool = False,
-    ) -> list[dict[str, object]]:
-        findings = self._base_findings(record)
-        max_spread_value = max(
-            (self._spread_value(finding) for finding in findings),
-            default=0,
-        )
-        with self._state_lock:
-            self._spread_max_cache[record.run_id] = max_spread_value
-        filtered = [
-            finding
-            for finding in findings
-            if self._matches_finding_filters(
-                finding=finding,
-                family=family,
-                category=category,
-                severity=severity,
-                source_kind=source_kind,
-                novelty=novelty,
-            )
-            and (
-                not changed_paths
-                or self._finding_touches_paths(
-                    finding=finding,
-                    changed_paths=changed_paths,
-                )
-            )
-            and (not exclude_reviewed or not self._finding_is_reviewed(record, finding))
-        ]
-        remediation_map = {
-            str(finding.get("id", "")): self._remediation_for_finding(record, finding)
-            for finding in filtered
-        }
-        priority_map = {
-            str(finding.get("id", "")): self._priority_score(
-                record,
-                finding,
-                remediation=remediation_map[str(finding.get("id", ""))],
-                max_spread_value=max_spread_value,
-            )
-            for finding in filtered
-        }
-        ordered = self._sort_findings(
-            record=record,
-            findings=filtered,
-            sort_by=sort_by,
-            priority_map=priority_map,
-        )
-        return [
-            self._decorate_finding(
-                record,
-                finding,
-                detail_level=detail_level,
-                remediation=remediation_map[str(finding.get("id", ""))],
-                priority_payload=priority_map[str(finding.get("id", ""))],
-                max_spread_value=max_spread_value,
-            )
-            for finding in ordered
-        ]
-
-    def _sort_findings(
-        self,
-        *,
-        record: MCPRunRecord,
-        findings: Sequence[Mapping[str, object]],
-        sort_by: FindingSort,
-        priority_map: Mapping[str, Mapping[str, object]] | None = None,
-    ) -> list[dict[str, object]]:
-        finding_rows = [dict(finding) for finding in findings]
-        if sort_by == "default":
-            return finding_rows
-        if sort_by == "severity":
-            finding_rows.sort(
-                key=lambda finding: (
-                    -self._severity_rank(str(finding.get("severity", ""))),
-                    str(finding.get("id", "")),
-                )
-            )
-        elif sort_by == "spread":
-            finding_rows.sort(
-                key=lambda finding: (
-                    -self._spread_value(finding),
-                    -_as_float(finding.get("priority", 0.0), 0.0),
-                    str(finding.get("id", "")),
-                )
-            )
-        else:
-            finding_rows.sort(
-                key=lambda finding: (
-                    -_as_float(
-                        self._as_mapping(
-                            (priority_map or {}).get(str(finding.get("id", "")))
-                        ).get("score", 0.0),
-                        0.0,
-                    )
-                    if priority_map is not None
-                    else -_as_float(
-                        self._priority_score(record, finding)["score"],
-                        0.0,
-                    ),
-                    -self._severity_rank(str(finding.get("severity", ""))),
-                    str(finding.get("id", "")),
-                )
-            )
-        return finding_rows
-
-    def _decorate_finding(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-        *,
-        detail_level: DetailLevel,
-        remediation: Mapping[str, object] | None = None,
-        priority_payload: Mapping[str, object] | None = None,
-        max_spread_value: int | None = None,
-    ) -> dict[str, object]:
-        resolved_remediation = (
-            remediation
-            if remediation is not None
-            else self._remediation_for_finding(record, finding)
-        )
-        resolved_priority_payload = (
-            dict(priority_payload)
-            if priority_payload is not None
-            else self._priority_score(
-                record,
-                finding,
-                remediation=resolved_remediation,
-                max_spread_value=max_spread_value,
-            )
-        )
-        payload = dict(finding)
-        payload["priority_score"] = resolved_priority_payload["score"]
-        payload["priority_factors"] = resolved_priority_payload["factors"]
-        payload["locations"] = self._locations_for_finding(
-            record,
-            finding,
-            include_uri=detail_level == "full",
-        )
-        payload["html_anchor"] = f"finding-{finding.get('id', '')}"
-        if resolved_remediation is not None:
-            payload["remediation"] = resolved_remediation
-        return self._project_finding_detail(
-            record,
-            payload,
-            detail_level=detail_level,
-        )
-
-    def _project_finding_detail(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-        *,
-        detail_level: DetailLevel,
-    ) -> dict[str, object]:
-        if detail_level == "full":
-            full_payload = dict(finding)
-            full_payload["id"] = self._short_finding_id(
-                record,
-                str(finding.get("id", "")),
-            )
-            return full_payload
-        payload: dict[str, object] = {
-            "id": self._short_finding_id(record, str(finding.get("id", ""))),
-            "kind": self._finding_kind_label(finding),
-            "severity": str(finding.get("severity", "")),
-            "novelty": str(finding.get("novelty", "")),
-            "scope": self._finding_source_kind(finding),
-            "count": _as_int(finding.get("count", 0), 0),
-            "spread": dict(self._as_mapping(finding.get("spread"))),
-            "priority": round(_as_float(finding.get("priority_score", 0.0), 0.0), 2),
-        }
-        clone_type = str(finding.get("clone_type", "")).strip()
-        if clone_type:
-            payload["type"] = clone_type
-        locations = [
-            self._as_mapping(item)
-            for item in self._as_sequence(finding.get("locations"))
-        ]
-        if detail_level == "summary":
-            remediation = self._as_mapping(finding.get("remediation"))
-            if remediation:
-                payload["effort"] = str(remediation.get("effort", ""))
-            payload["locations"] = [
-                summary_location
-                for summary_location in (
-                    self._summary_location_string(location) for location in locations
-                )
-                if summary_location
-            ]
-            return payload
-        remediation = self._as_mapping(finding.get("remediation"))
-        if remediation:
-            payload["remediation"] = self._project_remediation(
-                remediation,
-                detail_level="normal",
-            )
-        payload["locations"] = [
-            projected
-            for projected in (
-                self._normal_location_payload(location) for location in locations
-            )
-            if projected
-        ]
-        return payload
-
-    def _finding_summary_card(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-    ) -> dict[str, object]:
-        return self._finding_summary_card_payload(
-            record,
-            self._decorate_finding(record, finding, detail_level="full"),
-        )
-
-    def _finding_summary_card_payload(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-    ) -> dict[str, object]:
-        return self._project_finding_detail(record, finding, detail_level="summary")
-
-    def _comparison_finding_card(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-    ) -> dict[str, object]:
-        summary_card = self._finding_summary_card(record, finding)
-        return {
-            "id": summary_card.get("id"),
-            "kind": summary_card.get("kind"),
-            "severity": summary_card.get("severity"),
-        }
-
-    @staticmethod
-    def _finding_kind_label(finding: Mapping[str, object]) -> str:
-        family = str(finding.get("family", "")).strip()
-        kind = str(finding.get("kind", finding.get("category", ""))).strip()
-        if family == FAMILY_CLONE:
-            clone_kind = str(
-                finding.get("clone_kind", finding.get("category", kind))
-            ).strip()
-            return f"{clone_kind}_clone" if clone_kind else "clone"
-        if family == FAMILY_DEAD_CODE:
-            return "dead_code"
-        return kind or family
-
-    @staticmethod
-    def _summary_location_string(location: Mapping[str, object]) -> str:
-        path = str(location.get("file", "")).strip()
-        line = _as_int(location.get("line", 0), 0)
-        if not path:
-            return ""
-        return f"{path}:{line}" if line > 0 else path
-
-    def _normal_location_payload(
-        self,
-        location: Mapping[str, object],
-    ) -> dict[str, object]:
-        path = str(location.get("file", "")).strip()
-        if not path:
-            return {}
-        payload: dict[str, object] = {
-            "path": path,
-            "line": _as_int(location.get("line", 0), 0),
-            "end_line": _as_int(location.get("end_line", 0), 0),
-        }
-        symbol = self._leaf_symbol_name(location.get("symbol"))
-        if symbol:
-            payload["symbol"] = symbol
-        return payload
-
-    def _matches_finding_filters(
-        self,
-        *,
-        finding: Mapping[str, object],
-        family: FindingFamilyFilter,
-        category: str | None = None,
-        severity: str | None,
-        source_kind: str | None,
-        novelty: FindingNoveltyFilter,
-    ) -> bool:
-        finding_family = str(finding.get("family", "")).strip()
-        if family != "all" and finding_family != family:
-            return False
-        if (
-            category is not None
-            and str(finding.get("category", "")).strip() != category
-        ):
-            return False
-        if (
-            severity is not None
-            and str(finding.get("severity", "")).strip() != severity
-        ):
-            return False
-        dominant_kind = str(
-            self._as_mapping(finding.get("source_scope")).get("dominant_kind", "")
-        ).strip()
-        if source_kind is not None and dominant_kind != source_kind:
-            return False
-        return novelty == "all" or str(finding.get("novelty", "")).strip() == novelty
-
-    def _finding_touches_paths(
-        self,
-        *,
-        finding: Mapping[str, object],
-        changed_paths: Sequence[str],
-    ) -> bool:
-        normalized_paths = tuple(changed_paths)
-        for item in self._as_sequence(finding.get("items")):
-            relative_path = str(self._as_mapping(item).get("relative_path", "")).strip()
-            if relative_path and self._path_matches(relative_path, normalized_paths):
-                return True
-        return False
-
-    @staticmethod
-    def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool:
-        for candidate in changed_paths:
-            if relative_path == candidate or relative_path.startswith(candidate + "/"):
-                return True
-        return False
-
-    def _finding_is_reviewed(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-    ) -> bool:
-        with self._state_lock:
-            review_map = self._review_state.get(record.run_id, OrderedDict())
-        return str(finding.get("id", "")) in review_map
-
-    def _include_hotspot_finding(
-        self,
-        *,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-        changed_paths: Sequence[str],
-        exclude_reviewed: bool,
-    ) -> bool:
-        if changed_paths and not self._finding_touches_paths(
-            finding=finding,
-            changed_paths=changed_paths,
-        ):
-            return False
-        return not exclude_reviewed or not self._finding_is_reviewed(record, finding)
-
-    def _priority_score(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-        *,
-        remediation: Mapping[str, object] | None = None,
-        max_spread_value: int | None = None,
-    ) -> dict[str, object]:
-        spread_weight = self._spread_weight(
-            record,
-            finding,
-            max_spread_value=max_spread_value,
-        )
-        factors = {
-            "severity_weight": _SEVERITY_WEIGHT.get(
-                str(finding.get("severity", "")),
-                0.2,
-            ),
-            "effort_weight": _EFFORT_WEIGHT.get(
-                (
-                    str(remediation.get("effort", EFFORT_MODERATE))
-                    if remediation is not None
-                    else EFFORT_MODERATE
-                ),
-                0.6,
-            ),
-            "novelty_weight": _NOVELTY_WEIGHT.get(
-                str(finding.get("novelty", "")),
-                0.7,
-            ),
-            "runtime_weight": _RUNTIME_WEIGHT.get(
-                str(
-                    self._as_mapping(finding.get("source_scope")).get(
-                        "dominant_kind",
-                        "other",
-                    )
-                ),
-                0.5,
-            ),
-            "spread_weight": spread_weight,
-            "confidence_weight": _CONFIDENCE_WEIGHT.get(
-                str(finding.get("confidence", CONFIDENCE_MEDIUM)),
-                0.7,
-            ),
-        }
-        product = 1.0
-        for value in factors.values():
-            product *= max(_as_float(value, 0.01), 0.01)
-        score = product ** (1.0 / max(len(factors), 1))
-        return {
-            "score": round(score, 4),
-            "factors": {
-                key: round(_as_float(value, 0.0), 4) for key, value in factors.items()
-            },
-        }
-
-    def _spread_weight(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-        *,
-        max_spread_value: int | None = None,
-    ) -> float:
-        spread_value = self._spread_value(finding)
-        if max_spread_value is None:
-            with self._state_lock:
-                max_spread_value = self._spread_max_cache.get(record.run_id)
-        if max_spread_value is None:
-            max_spread_value = max(
-                (self._spread_value(item) for item in self._base_findings(record)),
-                default=0,
-            )
-            with self._state_lock:
-                self._spread_max_cache[record.run_id] = max_spread_value
-        max_value = max_spread_value
-        if max_value <= 0:
-            return 0.3
-        return max(0.2, min(1.0, spread_value / max_value))
-
-    def _spread_value(self, finding: Mapping[str, object]) -> int:
-        spread = self._as_mapping(finding.get("spread"))
-        files = _as_int(spread.get("files", 0), 0)
-        functions = _as_int(spread.get("functions", 0), 0)
-        count = _as_int(finding.get("count", 0), 0)
-        return max(files, functions, count, 1)
-
-    def _locations_for_finding(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-        *,
-        include_uri: bool = True,
-    ) -> list[dict[str, object]]:
-        locations: list[dict[str, object]] = []
-        for item in self._as_sequence(finding.get("items")):
-            item_map = self._as_mapping(item)
-            relative_path = str(item_map.get("relative_path", "")).strip()
-            if not relative_path:
-                continue
-            line = _as_int(item_map.get("start_line", 0) or 0, 0)
-            end_line = _as_int(item_map.get("end_line", 0) or 0, 0)
-            symbol = str(item_map.get("qualname", item_map.get("module", ""))).strip()
-            location: dict[str, object] = {
-                "file": relative_path,
-                "line": line,
-                "end_line": end_line,
-                "symbol": symbol,
-            }
-            if include_uri:
-                absolute_path = (record.root / relative_path).resolve()
-                uri = absolute_path.as_uri()
-                if line > 0:
-                    uri = f"{uri}#L{line}"
-                location["uri"] = uri
-            locations.append(location)
-        deduped: list[dict[str, object]] = []
-        seen: set[tuple[str, int, str]] = set()
-        for location in locations:
-            key = (
-                str(location.get("file", "")),
-                _as_int(location.get("line", 0), 0),
-                str(location.get("symbol", "")),
-            )
-            if key not in seen:
-                seen.add(key)
-                deduped.append(location)
-        return deduped
-
-    @staticmethod
-    def _suggestion_finding_id(suggestion: object) -> str:
-        return _suggestion_finding_id_payload(suggestion)
-
-    def _remediation_for_finding(
-        self,
-        record: MCPRunRecord,
-        finding: Mapping[str, object],
-    ) -> dict[str, object] | None:
-        suggestion = self._suggestion_for_finding(record, str(finding.get("id", "")))
-        if suggestion is None:
-            return None
-        source_kind = str(getattr(suggestion, "source_kind", "other"))
-        spread_files = _as_int(getattr(suggestion, "spread_files", 0), 0)
-        spread_functions = _as_int(getattr(suggestion, "spread_functions", 0), 0)
-        title = str(getattr(suggestion, "title", "")).strip()
-        severity = str(finding.get("severity", "")).strip()
-        novelty = str(finding.get("novelty", "known")).strip()
-        count = _as_int(
-            getattr(suggestion, "fact_count", 0) or finding.get("count", 0) or 0,
-            0,
-        )
-        safe_refactor_shape = self._safe_refactor_shape(suggestion)
-        effort = str(getattr(suggestion, "effort", EFFORT_MODERATE))
-        confidence = str(getattr(suggestion, "confidence", CONFIDENCE_MEDIUM))
-        risk_level = self._risk_level_for_effort(effort)
-        return {
-            "effort": effort,
-            "priority": _as_float(getattr(suggestion, "priority", 0.0), 0.0),
-            "confidence": confidence,
-            "safe_refactor_shape": safe_refactor_shape,
-            "steps": list(getattr(suggestion, "steps", ())),
-            "risk_level": risk_level,
-            "why_now": self._why_now_text(
-                title=title,
-                severity=severity,
-                novelty=novelty,
-                count=count,
-                source_kind=source_kind,
-                spread_files=spread_files,
-                spread_functions=spread_functions,
-                effort=effort,
-            ),
-            "blast_radius": {
-                "files": spread_files,
-                "functions": spread_functions,
-                "is_production": source_kind == "production",
-            },
-        }
-
-    def _suggestion_for_finding(
-        self,
-        record: MCPRunRecord,
-        finding_id: str,
-    ) -> object | None:
-        for suggestion in record.suggestions:
-            if self._suggestion_finding_id(suggestion) == finding_id:
-                return suggestion
-        return None
-
-    @staticmethod
-    def _safe_refactor_shape(suggestion: object) -> str:
-        category = str(getattr(suggestion, "category", "")).strip()
-        clone_type = str(getattr(suggestion, "clone_type", "")).strip()
-        title = str(getattr(suggestion, "title", "")).strip()
-        if category == CATEGORY_CLONE and clone_type == "Type-1":
-            return "Keep one canonical implementation and route callers through it."
-        if category == CATEGORY_CLONE and clone_type == "Type-2":
-            return "Extract shared implementation with explicit parameters."
-        if category == CATEGORY_CLONE and "Block" in title:
-            return "Extract the repeated statement sequence into a helper."
-        if category == CATEGORY_STRUCTURAL:
-            return "Extract the repeated branch family into a named helper."
-        if category == CATEGORY_COMPLEXITY:
-            return "Split the function into smaller named steps."
-        if category == CATEGORY_COUPLING:
-            return "Isolate responsibilities and invert unnecessary dependencies."
-        if category == CATEGORY_COHESION:
-            return "Split the class by responsibility boundary."
-        if category == CATEGORY_DEAD_CODE:
-            return "Delete the unused symbol or document intentional reachability."
-        if category == CATEGORY_DEPENDENCY:
-            return "Break the cycle by moving shared abstractions to a lower layer."
-        return "Extract the repeated logic into a shared, named abstraction."
-
-    @staticmethod
-    def _risk_level_for_effort(effort: str) -> str:
-        return {
-            EFFORT_EASY: "low",
-            EFFORT_MODERATE: "medium",
-            EFFORT_HARD: "high",
-        }.get(effort, "medium")
-
-    @staticmethod
-    def _why_now_text(
-        *,
-        title: str,
-        severity: str,
-        novelty: str,
-        count: int,
-        source_kind: str,
-        spread_files: int,
-        spread_functions: int,
-        effort: str,
-    ) -> str:
-        novelty_text = "new regression" if novelty == "new" else "known debt"
-        context = (
-            "production code"
-            if source_kind == "production"
-            else source_kind or "mixed scope"
-        )
-        spread_text = f"{spread_files} files / {spread_functions} functions"
-        count_text = f"{count} instances" if count > 0 else "localized issue"
-        return (
-            f"{severity.upper()} {title} in {context} — {count_text}, "
-            f"{spread_text}, {effort} fix, {novelty_text}."
-        )
-
-    def _project_remediation(
-        self,
-        remediation: Mapping[str, object],
-        *,
-        detail_level: DetailLevel,
-    ) -> dict[str, object]:
-        if detail_level == "full":
-            return dict(remediation)
-        projected = {
-            "effort": remediation.get("effort"),
-            "risk": remediation.get("risk_level"),
-            "shape": remediation.get("safe_refactor_shape"),
-            "why_now": remediation.get("why_now"),
-        }
-        if detail_level == "summary":
-            return projected
-        projected["steps"] = list(self._as_sequence(remediation.get("steps")))
-        return projected
-
-    def _hotspot_rows(
-        self,
-        *,
-        record: MCPRunRecord,
-        kind: HotlistKind,
-        detail_level: DetailLevel,
-        changed_paths: Sequence[str],
-        exclude_reviewed: bool,
-    ) -> list[dict[str, object]]:
-        findings = self._base_findings(record)
-        finding_index = {str(finding.get("id", "")): finding for finding in findings}
-        max_spread_value = max(
-            (self._spread_value(finding) for finding in findings),
-            default=0,
-        )
-        with self._state_lock:
-            self._spread_max_cache[record.run_id] = max_spread_value
-        remediation_map = {
-            str(finding.get("id", "")): self._remediation_for_finding(record, finding)
-            for finding in findings
-        }
-        priority_map = {
-            str(finding.get("id", "")): self._priority_score(
-                record,
-                finding,
-                remediation=remediation_map[str(finding.get("id", ""))],
-                max_spread_value=max_spread_value,
-            )
-            for finding in findings
-        }
-        derived = self._as_mapping(record.report_document.get("derived"))
-        hotlists = self._as_mapping(derived.get("hotlists"))
-        if kind == "highest_priority":
-            ordered_ids = [
-                str(finding.get("id", ""))
-                for finding in self._sort_findings(
-                    record=record,
-                    findings=findings,
-                    sort_by="priority",
-                    priority_map=priority_map,
-                )
-            ]
-        else:
-            hotlist_key = _HOTLIST_REPORT_KEYS.get(kind)
-            if hotlist_key is None:
-                return []
-            ordered_ids = [
-                str(item)
-                for item in self._as_sequence(hotlists.get(hotlist_key))
-                if str(item)
-            ]
-        rows: list[dict[str, object]] = []
-        for finding_id in ordered_ids:
-            finding = finding_index.get(finding_id)
-            if finding is None or not self._include_hotspot_finding(
-                record=record,
-                finding=finding,
-                changed_paths=changed_paths,
-                exclude_reviewed=exclude_reviewed,
-            ):
-                continue
-            finding_id_key = str(finding.get("id", ""))
-            rows.append(
-                self._decorate_finding(
-                    record,
-                    finding,
-                    detail_level=detail_level,
-                    remediation=remediation_map[finding_id_key],
-                    priority_payload=priority_map[finding_id_key],
-                    max_spread_value=max_spread_value,
-                )
-            )
-        return rows
-
-    def _build_changed_projection(
-        self,
-        record: MCPRunRecord,
-    ) -> dict[str, object] | None:
-        if not record.changed_paths:
-            return None
-        items = self._query_findings(
-            record=record,
-            detail_level="summary",
-            changed_paths=record.changed_paths,
-        )
-        new_count = sum(1 for item in items if str(item.get("novelty", "")) == "new")
-        known_count = sum(
-            1 for item in items if str(item.get("novelty", "")) == "known"
-        )
-        new_by_source_kind = self._source_kind_breakdown(
-            item.get("source_kind")
-            for item in items
-            if str(item.get("novelty", "")) == "new"
-        )
-        health_delta = self._summary_health_delta(record.summary)
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "changed_paths": list(record.changed_paths),
-            "total": len(items),
-            "new": new_count,
-            "known": known_count,
-            "new_by_source_kind": new_by_source_kind,
-            "items": items,
-            "health": dict(self._summary_health_payload(record.summary)),
-            "health_delta": health_delta,
-            "verdict": self._changed_verdict(
-                changed_projection={"new": new_count, "total": len(items)},
-                health_delta=health_delta,
-            ),
-        }
-
-    def _changed_analysis_payload(
-        self,
-        record: MCPRunRecord,
-    ) -> dict[str, object]:
-        changed_projection = self._as_mapping(record.changed_projection)
-        health = self._summary_health_payload(record.summary)
-        health_payload = (
-            {
-                "score": health.get("score"),
-                "grade": health.get("grade"),
-            }
-            if health.get("available") is not False
-            else dict(health)
-        )
-        return {
-            "run_id": self._short_run_id(record.run_id),
-            "focus": _FOCUS_CHANGED_PATHS,
-            "health_scope": _HEALTH_SCOPE_REPOSITORY,
-            "baseline": dict(self._summary_baseline_payload(record.summary)),
-            "changed_files": len(record.changed_paths),
-            "health": health_payload,
-            "analysis_profile": self._summary_analysis_profile_payload(record.summary),
-            "health_delta": (
-                _as_int(changed_projection.get("health_delta", 0), 0)
-                if changed_projection.get("health_delta") is not None
-                else None
-            ),
-            "verdict": str(changed_projection.get("verdict", "stable")),
-            "new_findings": _as_int(changed_projection.get("new", 0), 0),
-            "new_by_source_kind": dict(
-                self._as_mapping(changed_projection.get("new_by_source_kind"))
-            ),
-            "resolved_findings": 0,
-            "changed_findings": [],
-            "coverage_join": self._summary_coverage_join_payload(record),
-        }
-
-    def _augment_summary_with_changed(
-        self,
-        *,
-        summary: Mapping[str, object],
-        changed_paths: Sequence[str],
-        changed_projection: Mapping[str, object] | None,
-    ) -> dict[str, object]:
-        payload = dict(summary)
-        if changed_paths:
-            payload["changed_paths"] = list(changed_paths)
-        if changed_projection is not None:
-            payload["changed_findings"] = {
-                "total": _as_int(changed_projection.get("total", 0), 0),
-                "new": _as_int(changed_projection.get("new", 0), 0),
-                "known": _as_int(changed_projection.get("known", 0), 0),
-                "items": [
-                    dict(self._as_mapping(item))
-                    for item in self._as_sequence(changed_projection.get("items"))[:10]
-                ],
-            }
-            payload["health_delta"] = (
-                _as_int(changed_projection.get("health_delta", 0), 0)
-                if changed_projection.get("health_delta") is not None
-                else None
-            )
-            payload["verdict"] = str(changed_projection.get("verdict", "stable"))
-        return payload
-
-    @staticmethod
-    def _changed_verdict(
-        *,
-        changed_projection: Mapping[str, object],
-        health_delta: int | None,
-    ) -> str:
-        if _as_int(changed_projection.get("new", 0), 0) > 0 or (
-            health_delta is not None and health_delta < 0
-        ):
-            return "regressed"
-        if (
-            _as_int(changed_projection.get("total", 0), 0) == 0
-            and health_delta is not None
-            and health_delta > 0
-        ):
-            return "improved"
-        return "stable"
-
-    def _comparison_index(
-        self,
-        record: MCPRunRecord,
-        *,
-        focus: ComparisonFocus,
-    ) -> dict[str, dict[str, object]]:
-        findings = self._base_findings(record)
-        if focus == "clones":
-            findings = [f for f in findings if str(f.get("family", "")) == FAMILY_CLONE]
-        elif focus == "structural":
-            findings = [
-                f for f in findings if str(f.get("family", "")) == FAMILY_STRUCTURAL
-            ]
-        elif focus == "metrics":
-            findings = [
-                f
-                for f in findings
-                if str(f.get("family", "")) in {FAMILY_DESIGN, FAMILY_DEAD_CODE}
-            ]
-        return {str(finding.get("id", "")): dict(finding) for finding in findings}
-
-    @staticmethod
-    def _comparison_verdict(
-        *,
-        regressions: int,
-        improvements: int,
-        health_delta: int | None,
-    ) -> str:
-        has_negative_signal = regressions > 0 or (
-            health_delta is not None and health_delta < 0
-        )
-        has_positive_signal = improvements > 0 or (
-            health_delta is not None and health_delta > 0
-        )
-        if has_negative_signal and has_positive_signal:
-            return "mixed"
-        if has_negative_signal:
-            return "regressed"
-        if has_positive_signal:
-            return "improved"
-        return "stable"
-
-    @staticmethod
-    def _comparison_summary_text(
-        *,
-        comparable: bool,
-        comparability_reason: str,
-        regressions: int,
-        improvements: int,
-        health_delta: int | None,
-    ) -> str:
-        if not comparable:
-            reason_text = {
-                "different_root": "different roots",
-                "different_analysis_settings": "different analysis settings",
-                "different_root_and_analysis_settings": (
-                    "different roots and analysis settings"
-                ),
-            }.get(comparability_reason, "incomparable runs")
-            return f"Finding and run health deltas omitted ({reason_text})"
-        if health_delta is None:
-            return (
-                f"{improvements} findings resolved, {regressions} new regressions; "
-                "run health delta omitted (metrics unavailable)"
-            )
-        return (
-            f"{improvements} findings resolved, {regressions} new regressions, "
-            f"run health delta {health_delta:+d}"
-        )
-
-    def _render_pr_summary_markdown(self, payload: Mapping[str, object]) -> str:
-        health = self._as_mapping(payload.get("health"))
-        score = health.get("score", "n/a")
-        grade = health.get("grade", "n/a")
-        delta = _as_int(payload.get("health_delta", 0), 0)
-        changed_items = [
-            self._as_mapping(item)
-            for item in self._as_sequence(payload.get("new_findings_in_changed_files"))
-        ]
-        resolved = [
-            self._as_mapping(item)
-            for item in self._as_sequence(payload.get("resolved"))
-        ]
-        blocking_gates = [
-            str(item)
-            for item in self._as_sequence(payload.get("blocking_gates"))
-            if str(item)
-        ]
-        health_line = (
-            f"Health: {score}/100 ({grade}) | Delta: {delta:+d} | "
-            f"Verdict: {payload.get('verdict', 'stable')}"
-            if payload.get("health_delta") is not None
-            else (
-                f"Health: {score}/100 ({grade}) | Delta: n/a | "
-                f"Verdict: {payload.get('verdict', 'stable')}"
-            )
-        )
-        lines = [
-            "## CodeClone Summary",
-            "",
-            health_line,
-            "",
-            f"### New findings in changed files ({len(changed_items)})",
-        ]
-        if not changed_items:
-            lines.append("- None")
-        else:
-            lines.extend(
-                [
-                    (
-                        f"- **{str(item.get('severity', 'info')).upper()}** "
-                        f"{item.get('kind', 'finding')} in "
-                        f"`{self._finding_display_location(item)}`"
-                    )
-                    for item in changed_items[:10]
-                ]
-            )
-        lines.extend(["", f"### Resolved ({len(resolved)})"])
-        if not resolved:
-            lines.append("- None")
-        else:
-            lines.extend(
-                [
-                    (
-                        f"- {item.get('kind', 'finding')} in "
-                        f"`{self._finding_display_location(item)}`"
-                    )
-                    for item in resolved[:10]
-                ]
-            )
-        lines.extend(["", "### Blocking gates"])
-        if not blocking_gates:
-            lines.append("- none")
-        else:
-            lines.extend([f"- `{reason}`" for reason in blocking_gates])
-        return "\n".join(lines)
-
-    def _finding_display_location(self, finding: Mapping[str, object]) -> str:
-        locations = self._as_sequence(finding.get("locations"))
-        if not locations:
-            return "(unknown)"
-        first = locations[0]
-        if isinstance(first, str):
-            return first
-        location = self._as_mapping(first)
-        path = str(location.get("path", location.get("file", ""))).strip()
-        line = _as_int(location.get("line", 0), 0)
-        if not path:
-            return "(unknown)"
-        return f"{path}:{line}" if line > 0 else path
-
-    def _granular_payload(
-        self,
-        *,
-        record: MCPRunRecord,
-        check: str,
-        items: Sequence[Mapping[str, object]],
-        detail_level: DetailLevel,
-        max_results: int,
-        path: str | None,
-        threshold_context: Mapping[str, object] | None = None,
-    ) -> dict[str, object]:
-        bounded_items = [dict(item) for item in items[: max(1, max_results)]]
-        full_health = dict(self._as_mapping(record.summary.get("health")))
-        dimensions = self._as_mapping(full_health.get("dimensions"))
-        relevant_dimension = _CHECK_TO_DIMENSION.get(check)
-        slim_dimensions = (
-            {relevant_dimension: dimensions.get(relevant_dimension)}
-            if relevant_dimension and relevant_dimension in dimensions
-            else dict(dimensions)
-        )
-        payload: dict[str, object] = {
-            "run_id": self._short_run_id(record.run_id),
-            "check": check,
-            "detail_level": detail_level,
-            "path": path,
-            "returned": len(bounded_items),
-            "total": len(items),
-            "health": {
-                "score": full_health.get("score"),
-                "grade": full_health.get("grade"),
-                "dimensions": slim_dimensions,
-            },
-            "items": bounded_items,
-        }
-        if threshold_context:
-            payload["threshold_context"] = dict(threshold_context)
-        return payload
-
-    def _design_threshold_context(
-        self,
-        *,
-        record: MCPRunRecord,
-        check: str,
-        path: str | None,
-        items: Sequence[Mapping[str, object]],
-        requested_min: int | None = None,
-    ) -> dict[str, object] | None:
-        if items:
-            return None
-        spec = _DESIGN_CHECK_CONTEXT.get(check)
-        if spec is None:
-            return None
-        category = str(spec["category"])
-        metric = str(spec["metric"])
-        operator = str(spec["operator"])
-        normalized_path = self._normalize_relative_path(path or "")
-        metrics = self._as_mapping(record.report_document.get("metrics"))
-        families = self._as_mapping(metrics.get("families"))
-        family = self._as_mapping(families.get(category))
-        metric_items = [
-            self._as_mapping(item)
-            for item in self._as_sequence(family.get("items"))
-            if not normalized_path
-            or self._metric_item_matches_path(
-                self._as_mapping(item),
-                normalized_path,
-            )
-        ]
-        if not metric_items:
-            return None
-        values = [_as_int(item.get(metric), 0) for item in metric_items]
-        finding_threshold = self._design_finding_threshold(
-            record=record,
-            check=check,
-        )
-        threshold = finding_threshold
-        threshold_kind = "finding_threshold"
-        if requested_min is not None and requested_min > finding_threshold:
-            threshold = requested_min
-            threshold_kind = "requested_min"
-        highest_below = self._highest_below_threshold(
-            values=values,
-            operator=operator,
-            threshold=threshold,
-        )
-        payload: dict[str, object] = {
-            "metric": metric,
-            "threshold": threshold,
-            "threshold_kind": threshold_kind,
-            "measured_units": len(metric_items),
-        }
-        if threshold_kind != "finding_threshold":
-            payload["finding_threshold"] = finding_threshold
-        if highest_below is not None:
-            payload["highest_below_threshold"] = highest_below
-        return payload
-
-    def _design_finding_threshold(
-        self,
-        *,
-        record: MCPRunRecord,
-        check: str,
-    ) -> int:
-        spec = _DESIGN_CHECK_CONTEXT[check]
-        category = str(spec["category"])
-        default_threshold = _as_int(spec["default_threshold"])
-        findings = self._as_mapping(record.report_document.get("findings"))
-        thresholds = self._as_mapping(
-            self._as_mapping(findings.get("thresholds")).get("design_findings")
-        )
-        threshold_payload = self._as_mapping(thresholds.get(category))
-        if threshold_payload:
-            return _as_int(threshold_payload.get("value"), default_threshold)
-        request_value = {
-            "complexity": record.request.complexity_threshold,
-            "coupling": record.request.coupling_threshold,
-            "cohesion": record.request.cohesion_threshold,
-        }.get(check)
-        return _as_int(request_value, default_threshold)
-
-    @staticmethod
-    def _highest_below_threshold(
-        *,
-        values: Sequence[int],
-        operator: str,
-        threshold: int,
-    ) -> int | None:
-        if operator == ">":
-            below = [value for value in values if value <= threshold]
-        elif operator == ">=":
-            below = [value for value in values if value < threshold]
-        else:
-            return None
-        if not below:
-            return None
-        return max(below)
-
-    @staticmethod
-    def _normalized_source_kind(value: object) -> str:
-        normalized = str(value).strip().lower()
-        if normalized in SOURCE_KIND_ORDER:
-            return normalized
-        return SOURCE_KIND_OTHER
-
-    def _finding_source_kind(self, finding: Mapping[str, object]) -> str:
-        source_scope = self._as_mapping(finding.get("source_scope"))
-        return self._normalized_source_kind(source_scope.get("dominant_kind"))
-
-    def _source_kind_breakdown(
-        self,
-        source_kinds: Iterable[object],
-    ) -> dict[str, int]:
-        breakdown = dict.fromkeys(_SOURCE_KIND_BREAKDOWN_ORDER, 0)
-        for value in source_kinds:
-            breakdown[self._normalized_source_kind(value)] += 1
-        return breakdown
-
-    def _triage_suggestion_rows(self, record: MCPRunRecord) -> list[dict[str, object]]:
-        derived = self._as_mapping(record.report_document.get("derived"))
-        canonical_rows = self._dict_list(derived.get("suggestions"))
-        suggestion_source_kinds = {
-            self._suggestion_finding_id(suggestion): self._normalized_source_kind(
-                getattr(suggestion, "source_kind", SOURCE_KIND_OTHER)
-            )
-            for suggestion in record.suggestions
-        }
-        rows: list[dict[str, object]] = []
-        for row in canonical_rows:
-            canonical_finding_id = str(row.get("finding_id", ""))
-            action = self._as_mapping(row.get("action"))
-            try:
-                finding_id = self._short_finding_id(
-                    record,
-                    self._resolve_canonical_finding_id(record, canonical_finding_id),
-                )
-            except MCPFindingNotFoundError:
-                finding_id = self._base_short_finding_id(canonical_finding_id)
-            rows.append(
-                {
-                    "id": f"suggestion:{finding_id}",
-                    "finding_id": finding_id,
-                    "title": str(row.get("title", "")),
-                    "summary": str(row.get("summary", "")),
-                    "effort": str(action.get("effort", "")),
-                    "steps": list(self._as_sequence(action.get("steps"))),
-                    "source_kind": suggestion_source_kinds.get(
-                        canonical_finding_id,
-                        SOURCE_KIND_OTHER,
-                    ),
-                }
-            )
-        return rows
-
-    def _derived_section_payload(self, record: MCPRunRecord) -> dict[str, object]:
-        derived = self._as_mapping(record.report_document.get("derived"))
-        if not derived:
-            raise MCPServiceContractError(
-                "Report section 'derived' is not available in this run."
-            )
-        suggestions = self._triage_suggestion_rows(record)
-        canonical_to_short, _ = self._finding_id_maps(record)
-        hotlists = self._as_mapping(derived.get("hotlists"))
-        projected_hotlists: dict[str, list[str]] = {}
-        for hotlist_key, hotlist_ids in hotlists.items():
-            projected_hotlists[hotlist_key] = [
-                canonical_to_short.get(
-                    str(finding_id),
-                    self._base_short_finding_id(str(finding_id)),
-                )
-                for finding_id in self._as_sequence(hotlist_ids)
-                if str(finding_id)
-            ]
-        return {
-            "suggestions": suggestions,
-            "hotlists": projected_hotlists,
-        }
-
-    @staticmethod
-    def _schema_resource_payload() -> dict[str, object]:
-        return {
-            "$schema": "https://json-schema.org/draft/2020-12/schema",
-            "title": "CodeCloneCanonicalReport",
-            "type": "object",
-            "required": [
-                "report_schema_version",
-                "meta",
-                "inventory",
-                "findings",
-                "derived",
-                "integrity",
-            ],
-            "properties": {
-                "report_schema_version": {
-                    "type": "string",
-                    "const": REPORT_SCHEMA_VERSION,
-                },
-                "meta": {"type": "object"},
-                "inventory": {"type": "object"},
-                "findings": {"type": "object"},
-                "metrics": {"type": "object"},
-                "derived": {"type": "object"},
-                "integrity": {"type": "object"},
-            },
-        }
-
-    def _validate_analysis_request(self, request: MCPAnalysisRequest) -> None:
-        self._validate_choice(
-            "analysis_mode",
-            request.analysis_mode,
-            _VALID_ANALYSIS_MODES,
-        )
-        self._validate_choice(
-            "cache_policy",
-            request.cache_policy,
-            _VALID_CACHE_POLICIES,
-        )
-        if request.cache_policy == "refresh":
-            raise MCPServiceContractError(
-                "cache_policy='refresh' is not supported by the read-only "
-                "CodeClone MCP server. Use 'reuse' or 'off'."
-            )
-        if request.analysis_mode == "clones_only" and request.coverage_xml is not None:
-            raise MCPServiceContractError(
-                "coverage_xml requires analysis_mode='full' because coverage join "
-                "depends on metrics-enabled analysis."
-            )
-
-    @staticmethod
-    def _validate_choice(
-        name: str,
-        value: str,
-        allowed: Sequence[str] | frozenset[str],
-    ) -> str:
-        if value not in allowed:
-            allowed_list = ", ".join(sorted(allowed))
-            raise MCPServiceContractError(
-                f"Invalid value for {name}: {value!r}. Expected one of: {allowed_list}."
-            )
-        return value
-
-    def _validate_optional_choice(
-        self,
-        name: str,
-        value: str | None,
-        allowed: Sequence[str] | frozenset[str],
-    ) -> str | None:
-        if value is None:
-            return None
-        return self._validate_choice(name, value, allowed)
-
-    @staticmethod
-    def _resolve_root(root: str | None) -> Path:
-        cleaned_root = "" if root is None else str(root).strip()
-        if not cleaned_root:
-            raise MCPServiceContractError(
-                "MCP analysis requires an absolute repository root. "
-                "Omitted or relative roots are unsafe because the MCP server "
-                "working directory may not match the client workspace."
-            )
-        candidate = Path(cleaned_root).expanduser()
-        if not candidate.is_absolute():
-            raise MCPServiceContractError(
-                f"MCP requires an absolute repository root; got relative root "
-                f"{cleaned_root!r}. Relative roots like '.' are unsafe because "
-                "the MCP server working directory may not match the client "
-                "workspace."
-            )
-        try:
-            root_path = candidate.resolve()
-        except OSError as exc:
-            raise MCPServiceContractError(
-                f"Invalid root path '{cleaned_root}': {exc}"
-            ) from exc
-        if not root_path.exists():
-            raise MCPServiceContractError(f"Root path does not exist: {root_path}")
-        if not root_path.is_dir():
-            raise MCPServiceContractError(f"Root path is not a directory: {root_path}")
-        return root_path
-
-    def _resolve_optional_root(self, root: str | None) -> Path | None:
-        cleaned_root = "" if root is None else str(root).strip()
-        if not cleaned_root:
-            return None
-        return self._resolve_root(cleaned_root)
-
-    def _build_args(self, *, root_path: Path, request: MCPAnalysisRequest) -> Namespace:
-        args = Namespace(
-            root=str(root_path),
-            min_loc=DEFAULT_MIN_LOC,
-            min_stmt=DEFAULT_MIN_STMT,
-            block_min_loc=DEFAULT_BLOCK_MIN_LOC,
-            block_min_stmt=DEFAULT_BLOCK_MIN_STMT,
-            segment_min_loc=DEFAULT_SEGMENT_MIN_LOC,
-            segment_min_stmt=DEFAULT_SEGMENT_MIN_STMT,
-            processes=None,
-            cache_path=None,
-            max_cache_size_mb=DEFAULT_MAX_CACHE_SIZE_MB,
-            baseline=DEFAULT_BASELINE_PATH,
-            max_baseline_size_mb=DEFAULT_MAX_BASELINE_SIZE_MB,
-            update_baseline=False,
-            fail_on_new=False,
-            fail_threshold=-1,
-            ci=False,
-            fail_complexity=-1,
-            fail_coupling=-1,
-            fail_cohesion=-1,
-            fail_cycles=False,
-            fail_dead_code=False,
-            fail_health=-1,
-            fail_on_new_metrics=False,
-            fail_on_typing_regression=False,
-            fail_on_docstring_regression=False,
-            fail_on_api_break=False,
-            min_typing_coverage=-1,
-            min_docstring_coverage=-1,
-            api_surface=False,
-            coverage_xml=None,
-            fail_on_untested_hotspots=False,
-            coverage_min=50,
-            design_complexity_threshold=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
-            design_coupling_threshold=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD,
-            design_cohesion_threshold=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD,
-            update_metrics_baseline=False,
-            metrics_baseline=DEFAULT_BASELINE_PATH,
-            skip_metrics=False,
-            skip_dead_code=False,
-            skip_dependencies=False,
-            golden_fixture_paths=(),
-            html_out=None,
-            json_out=None,
-            md_out=None,
-            sarif_out=None,
-            text_out=None,
-            no_progress=True,
-            no_color=True,
-            quiet=True,
-            verbose=False,
-            debug=False,
-            open_html_report=False,
-            timestamped_report_paths=False,
-        )
-        if request.respect_pyproject:
-            try:
-                config_values = load_pyproject_config(root_path)
-            except ConfigValidationError as exc:
-                raise MCPServiceContractError(str(exc)) from exc
-            for key in sorted(_MCP_CONFIG_KEYS.intersection(config_values)):
-                setattr(args, key, config_values[key])
-
-        self._apply_request_overrides(args=args, root_path=root_path, request=request)
-
-        if request.analysis_mode == "clones_only":
-            args.skip_metrics = True
-            args.skip_dead_code = True
-            args.skip_dependencies = True
-        else:
-            args.skip_metrics = False
-            args.skip_dead_code = False
-            args.skip_dependencies = False
-
-        if not validate_numeric_args(args):
-            raise MCPServiceContractError(
-                "Numeric analysis settings must be non-negative and thresholds "
-                "must be >= -1. Coverage thresholds must be between 0 and 100."
-            )
-
-        return args
-
-    def _apply_request_overrides(
-        self,
-        *,
-        args: Namespace,
-        root_path: Path,
-        request: MCPAnalysisRequest,
-    ) -> None:
-        override_map: dict[str, object | None] = {
-            "processes": request.processes,
-            "min_loc": request.min_loc,
-            "min_stmt": request.min_stmt,
-            "block_min_loc": request.block_min_loc,
-            "block_min_stmt": request.block_min_stmt,
-            "segment_min_loc": request.segment_min_loc,
-            "segment_min_stmt": request.segment_min_stmt,
-            "api_surface": request.api_surface,
-            "coverage_min": request.coverage_min,
-            "max_baseline_size_mb": request.max_baseline_size_mb,
-            "max_cache_size_mb": request.max_cache_size_mb,
-            "design_complexity_threshold": request.complexity_threshold,
-            "design_coupling_threshold": request.coupling_threshold,
-            "design_cohesion_threshold": request.cohesion_threshold,
-        }
-        for key, value in override_map.items():
-            if value is not None:
-                setattr(args, key, value)
-
-        if request.baseline_path is not None:
-            args.baseline = str(
-                self._resolve_optional_path(request.baseline_path, root_path)
-            )
-        if request.metrics_baseline_path is not None:
-            args.metrics_baseline = str(
-                self._resolve_optional_path(request.metrics_baseline_path, root_path)
-            )
-        if request.cache_path is not None:
-            args.cache_path = str(
-                self._resolve_optional_path(request.cache_path, root_path)
-            )
-        if request.coverage_xml is not None:
-            args.coverage_xml = str(
-                self._resolve_optional_path(request.coverage_xml, root_path)
-            )
-
-    @staticmethod
-    def _resolve_optional_path(value: str, root_path: Path) -> Path:
-        candidate = Path(value).expanduser()
-        resolved = candidate if candidate.is_absolute() else root_path / candidate
-        try:
-            return resolved.resolve()
-        except OSError as exc:
-            raise MCPServiceContractError(
-                f"Invalid path '{value}' relative to '{root_path}': {exc}"
-            ) from exc
-
-    def _resolve_baseline_inputs(
-        self,
-        *,
-        root_path: Path,
-        args: Namespace,
-    ) -> tuple[Path, bool, Path, bool, dict[str, object] | None]:
-        baseline_path = self._resolve_optional_path(str(args.baseline), root_path)
-        baseline_exists = baseline_path.exists()
-
-        metrics_baseline_arg_path = self._resolve_optional_path(
-            str(args.metrics_baseline),
-            root_path,
-        )
-        shared_baseline_payload: dict[str, object] | None = None
-        if metrics_baseline_arg_path == baseline_path:
-            probe = probe_metrics_baseline_section(metrics_baseline_arg_path)
-            metrics_baseline_exists = probe.has_metrics_section
-            shared_baseline_payload = probe.payload
-        else:
-            metrics_baseline_exists = metrics_baseline_arg_path.exists()
-
-        return (
-            baseline_path,
-            baseline_exists,
-            metrics_baseline_arg_path,
-            metrics_baseline_exists,
-            shared_baseline_payload,
-        )
-
-    @staticmethod
-    def _resolve_cache_path(*, root_path: Path, args: Namespace) -> Path:
-        return resolve_cache_path(
-            root_path=root_path,
-            args=args,
-            from_args=bool(args.cache_path),
-            legacy_cache_path=_LEGACY_CACHE_PATH,
-            console=_BufferConsole(),
-        )
-
-    @staticmethod
-    def _build_cache(
-        *,
-        root_path: Path,
-        args: Namespace,
-        cache_path: Path,
-        policy: CachePolicy,
-    ) -> Cache:
-        cache = Cache(
-            cache_path,
-            root=root_path,
-            max_size_bytes=_as_int(args.max_cache_size_mb, 0) * 1024 * 1024,
-            min_loc=_as_int(args.min_loc, DEFAULT_MIN_LOC),
-            min_stmt=_as_int(args.min_stmt, DEFAULT_MIN_STMT),
-            block_min_loc=_as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC),
-            block_min_stmt=_as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT),
-            segment_min_loc=_as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC),
-            segment_min_stmt=_as_int(
-                args.segment_min_stmt,
-                DEFAULT_SEGMENT_MIN_STMT,
-            ),
-            collect_api_surface=bool(getattr(args, "api_surface", False)),
-        )
-        if policy != "off":
-            cache.load()
-        return cache
-
-    @staticmethod
-    def _metrics_computed(analysis_mode: AnalysisMode) -> tuple[str, ...]:
-        return (
-            ()
-            if analysis_mode == "clones_only"
-            else (
-                "complexity",
-                "coupling",
-                "cohesion",
-                "health",
-                "dependencies",
-                "dead_code",
-            )
-        )
-
-    @staticmethod
-    def _load_report_document(report_json: str) -> dict[str, object]:
-        return _load_report_document_payload(report_json)
-
-    def _report_digest(self, report_document: Mapping[str, object]) -> str:
-        integrity = self._as_mapping(report_document.get("integrity"))
-        digest = self._as_mapping(integrity.get("digest"))
-        value = digest.get("value")
-        if not isinstance(value, str) or not value:
-            raise MCPServiceError("Canonical report digest is missing.")
-        return value
-
-    def _build_run_summary_payload(
-        self,
-        *,
-        run_id: str,
-        root_path: Path,
-        request: MCPAnalysisRequest,
-        report_document: Mapping[str, object],
-        baseline_state: CloneBaselineState,
-        metrics_baseline_state: MetricsBaselineState,
-        cache_status: CacheStatus,
-        new_func: Sequence[str] | set[str],
-        new_block: Sequence[str] | set[str],
-        metrics_diff: MetricsDiff | None,
-        warnings: Sequence[str],
-        failures: Sequence[str],
-    ) -> dict[str, object]:
-        meta = self._as_mapping(report_document.get("meta"))
-        meta_baseline = self._as_mapping(meta.get("baseline"))
-        meta_metrics_baseline = self._as_mapping(meta.get("metrics_baseline"))
-        meta_cache = self._as_mapping(meta.get("cache"))
-        inventory = self._as_mapping(report_document.get("inventory"))
-        findings = self._as_mapping(report_document.get("findings"))
-        metrics = self._as_mapping(report_document.get("metrics"))
-        metrics_summary = self._as_mapping(metrics.get("summary"))
-        summary = self._as_mapping(findings.get("summary"))
-        analysis_profile = self._summary_analysis_profile_payload(meta)
-        payload = {
-            "run_id": run_id,
-            "root": str(root_path),
-            "analysis_mode": request.analysis_mode,
-            "codeclone_version": meta.get("codeclone_version", __version__),
-            "python_tag": str(meta.get("python_tag", "")),
-            "report_schema_version": report_document.get(
-                "report_schema_version",
-                REPORT_SCHEMA_VERSION,
-            ),
-            "baseline": {
-                "path": meta_baseline.get(
-                    "path",
-                    str(root_path / DEFAULT_BASELINE_PATH),
-                ),
-                "loaded": bool(meta_baseline.get("loaded", baseline_state.loaded)),
-                "status": str(meta_baseline.get("status", baseline_state.status.value)),
-                "trusted_for_diff": baseline_state.trusted_for_diff,
-                "python_tag": meta_baseline.get("python_tag"),
-            },
-            "metrics_baseline": {
-                "path": meta_metrics_baseline.get(
-                    "path",
-                    str(root_path / DEFAULT_BASELINE_PATH),
-                ),
-                "loaded": bool(
-                    meta_metrics_baseline.get(
-                        "loaded",
-                        metrics_baseline_state.loaded,
-                    )
-                ),
-                "status": str(
-                    meta_metrics_baseline.get(
-                        "status",
-                        metrics_baseline_state.status.value,
-                    )
-                ),
-                "trusted_for_diff": metrics_baseline_state.trusted_for_diff,
-            },
-            "cache": {
-                "path": meta_cache.get("path"),
-                "status": str(meta_cache.get("status", cache_status.value)),
-                "used": bool(meta_cache.get("used", False)),
-                "schema_version": meta_cache.get("schema_version"),
-            },
-            "inventory": dict(inventory),
-            "findings_summary": dict(summary),
-            "health": dict(self._as_mapping(metrics_summary.get("health"))),
-            "baseline_diff": {
-                "new_function_clone_groups": len(new_func),
-                "new_block_clone_groups": len(new_block),
-                "new_clone_groups_total": len(new_func) + len(new_block),
-            },
-            "metrics_diff": self._metrics_diff_payload(metrics_diff),
-            "warnings": list(warnings),
-            "failures": list(failures),
-        }
-        if analysis_profile:
-            payload["analysis_profile"] = analysis_profile
-        payload["cache"] = self._summary_cache_payload(payload)
-        payload["health"] = self._summary_health_payload(payload)
-        return payload
-
-    def _summary_payload(
-        self,
-        summary: Mapping[str, object],
-        *,
-        record: MCPRunRecord | None = None,
-    ) -> dict[str, object]:
-        inventory = self._as_mapping(summary.get("inventory"))
-        if (
-            not summary.get("run_id")
-            and not record
-            and "inventory" in summary
-            and not summary.get("baseline")
-        ):
-            return {
-                "focus": _FOCUS_REPOSITORY,
-                "health_scope": _HEALTH_SCOPE_REPOSITORY,
-                "inventory": self._summary_inventory_payload(inventory),
-                "health": self._summary_health_payload(summary),
-            }
-        resolved_run_id = (
-            record.run_id if record is not None else str(summary.get("run_id", ""))
-        )
-        payload: dict[str, object] = {
-            "run_id": self._short_run_id(resolved_run_id) if resolved_run_id else "",
-            "focus": _FOCUS_REPOSITORY,
-            "health_scope": _HEALTH_SCOPE_REPOSITORY,
-            "version": str(summary.get("codeclone_version", __version__)),
-            "schema": str(summary.get("report_schema_version", REPORT_SCHEMA_VERSION)),
-            "mode": str(summary.get("analysis_mode", "")),
-            "baseline": self._summary_baseline_payload(summary),
-            "metrics_baseline": self._summary_metrics_baseline_payload(summary),
-            "cache": self._summary_cache_payload(summary),
-            "inventory": self._summary_inventory_payload(inventory),
-            "health": self._summary_health_payload(summary),
-            "findings": self._summary_findings_payload(summary, record=record),
-            "diff": self._summary_diff_payload(summary),
-            "warnings": list(self._as_sequence(summary.get("warnings"))),
-            "failures": list(self._as_sequence(summary.get("failures"))),
-        }
-        analysis_profile = self._summary_analysis_profile_payload(summary)
-        if analysis_profile:
-            payload["analysis_profile"] = analysis_profile
-        if record is not None:
-            coverage_join = self._summary_coverage_join_payload(record)
-            if coverage_join:
-                payload["coverage_join"] = coverage_join
-        return payload
-
-    def _summary_analysis_profile_payload(
-        self,
-        summary: Mapping[str, object],
-    ) -> dict[str, int]:
-        analysis_profile = self._as_mapping(summary.get("analysis_profile"))
-        if not analysis_profile:
-            return {}
-        keys = (
-            "min_loc",
-            "min_stmt",
-            "block_min_loc",
-            "block_min_stmt",
-            "segment_min_loc",
-            "segment_min_stmt",
-        )
-        payload = {key: _as_int(analysis_profile.get(key), -1) for key in keys}
-        return {key: value for key, value in payload.items() if value >= 0}
-
-    def _summary_baseline_payload(
-        self,
-        summary: Mapping[str, object],
-    ) -> dict[str, object]:
-        return self._summary_trusted_state_payload(summary, key="baseline")
-
-    def _summary_metrics_baseline_payload(
-        self,
-        summary: Mapping[str, object],
-    ) -> dict[str, object]:
-        return self._summary_trusted_state_payload(summary, key="metrics_baseline")
-
-    def _summary_trusted_state_payload(
-        self,
-        summary: Mapping[str, object],
-        *,
-        key: str,
-    ) -> dict[str, object]:
-        baseline = self._as_mapping(summary.get(key))
-        trusted = bool(baseline.get("trusted_for_diff", False))
-        payload: dict[str, object] = {
-            "loaded": bool(baseline.get("loaded", False)),
-            "status": str(baseline.get("status", "")),
-            "trusted": trusted,
-        }
-        if key == "baseline":
-            payload["compared_without_valid_baseline"] = not trusted
-            baseline_python_tag = baseline.get("python_tag")
-            runtime_python_tag = summary.get("python_tag")
-            if isinstance(baseline_python_tag, str) and baseline_python_tag.strip():
-                payload["baseline_python_tag"] = baseline_python_tag
-            if isinstance(runtime_python_tag, str) and runtime_python_tag.strip():
-                payload["runtime_python_tag"] = runtime_python_tag
-        return payload
-
-    def _summary_cache_payload(
-        self,
-        summary: Mapping[str, object],
-    ) -> dict[str, object]:
-        cache = dict(self._as_mapping(summary.get("cache")))
-        if not cache:
-            return {}
-        return {
-            "used": bool(cache.get("used", False)),
-            "freshness": self._effective_freshness(summary),
-        }
-
-    def _effective_freshness(
-        self,
-        summary: Mapping[str, object],
-    ) -> FreshnessKind:
-        inventory = self._as_mapping(summary.get("inventory"))
-        files = self._as_mapping(inventory.get("files"))
-        analyzed = max(0, _as_int(files.get("analyzed", 0), 0))
-        cached = max(0, _as_int(files.get("cached", 0), 0))
-        cache = self._as_mapping(summary.get("cache"))
-        cache_used = bool(cache.get("used"))
-        if cache_used and cached > 0 and analyzed == 0:
-            return "reused"
-        if cache_used and cached > 0 and analyzed > 0:
-            return "mixed"
-        return "fresh"
-
-    def _summary_inventory_payload(
-        self,
-        inventory: Mapping[str, object],
-    ) -> dict[str, object]:
-        if not inventory:
-            return {}
-        files = self._as_mapping(inventory.get("files"))
-        code = self._as_mapping(inventory.get("code"))
-        total_files = _as_int(
-            files.get(
-                "total_found",
-                files.get(
-                    "analyzed",
-                    len(
-                        self._as_sequence(
-                            self._as_mapping(inventory.get("file_registry")).get(
-                                "items"
-                            )
-                        )
-                    ),
-                ),
-            ),
-            0,
-        )
-        functions = _as_int(code.get("functions", 0), 0) + _as_int(
-            code.get("methods", 0),
-            0,
-        )
-        return {
-            "files": total_files,
-            "lines": _as_int(code.get("parsed_lines", 0), 0),
-            "functions": functions,
-            "classes": _as_int(code.get("classes", 0), 0),
-        }
-
-    def _summary_findings_payload(
-        self,
-        summary: Mapping[str, object],
-        *,
-        record: MCPRunRecord | None,
-    ) -> dict[str, object]:
-        findings_summary = self._as_mapping(summary.get("findings_summary"))
-        if record is None:
-            return {
-                "total": _as_int(findings_summary.get("total", 0), 0),
-                "new": 0,
-                "known": 0,
-                "by_family": {},
-                "production": 0,
-                "new_by_source_kind": self._source_kind_breakdown(()),
-            }
-        findings = self._base_findings(record)
-        by_family: dict[str, int] = {
-            "clones": 0,
-            "structural": 0,
-            "dead_code": 0,
-            "design": 0,
-        }
-        new_count = 0
-        known_count = 0
-        production_count = 0
-        new_by_source_kind = self._source_kind_breakdown(
-            self._finding_source_kind(finding)
-            for finding in findings
-            if str(finding.get("novelty", "")).strip() == "new"
-        )
-        for finding in findings:
-            family = str(finding.get("family", "")).strip()
-            family_key = "clones" if family == FAMILY_CLONE else family
-            if family_key in by_family:
-                by_family[family_key] += 1
-            if str(finding.get("novelty", "")).strip() == "new":
-                new_count += 1
-            else:
-                known_count += 1
-            if self._finding_source_kind(finding) == SOURCE_KIND_PRODUCTION:
-                production_count += 1
-        return {
-            "total": len(findings),
-            "new": new_count,
-            "known": known_count,
-            "by_family": {key: value for key, value in by_family.items() if value > 0},
-            "production": production_count,
-            "new_by_source_kind": new_by_source_kind,
-        }
-
-    def _summary_diff_payload(
-        self,
-        summary: Mapping[str, object],
-    ) -> dict[str, object]:
-        baseline_diff = self._as_mapping(summary.get("baseline_diff"))
-        metrics_diff = self._as_mapping(summary.get("metrics_diff"))
-        return {
-            "new_clones": _as_int(baseline_diff.get("new_clone_groups_total", 0), 0),
-            "health_delta": (
-                _as_int(metrics_diff.get("health_delta", 0), 0)
-                if metrics_diff
-                and self._summary_health_payload(summary).get("available") is not False
-                else None
-            ),
-            "typing_param_permille_delta": _as_int(
-                metrics_diff.get("typing_param_permille_delta", 0),
-                0,
-            ),
-            "typing_return_permille_delta": _as_int(
-                metrics_diff.get("typing_return_permille_delta", 0),
-                0,
-            ),
-            "docstring_permille_delta": _as_int(
-                metrics_diff.get("docstring_permille_delta", 0),
-                0,
-            ),
-            "api_breaking_changes": _as_int(
-                metrics_diff.get("api_breaking_changes", 0),
-                0,
-            ),
-            "new_api_symbols": _as_int(
-                metrics_diff.get("new_api_symbols", 0),
-                0,
-            ),
-        }
-
-    def _summary_coverage_join_payload(
-        self,
-        record: MCPRunRecord,
-    ) -> dict[str, object]:
-        metrics = self._as_mapping(record.report_document.get("metrics"))
-        families = self._as_mapping(metrics.get("families"))
-        coverage_join = self._as_mapping(families.get("coverage_join"))
-        summary = self._as_mapping(coverage_join.get("summary"))
-        if not summary:
-            return {}
-        payload: dict[str, object] = {
-            "status": str(summary.get("status", "")).strip(),
-            "overall_permille": _as_int(summary.get("overall_permille", 0), 0),
-            "coverage_hotspots": _as_int(summary.get("coverage_hotspots", 0), 0),
-            "scope_gap_hotspots": _as_int(summary.get("scope_gap_hotspots", 0), 0),
-            "hotspot_threshold_percent": _as_int(
-                summary.get("hotspot_threshold_percent", 0),
-                0,
-            ),
-        }
-        source_value = summary.get("source")
-        source = source_value.strip() if isinstance(source_value, str) else ""
-        if source:
-            payload["source"] = source
-        invalid_reason_value = summary.get("invalid_reason")
-        invalid_reason = (
-            invalid_reason_value.strip()
-            if isinstance(invalid_reason_value, str)
-            else ""
-        )
-        if invalid_reason:
-            payload["invalid_reason"] = invalid_reason
-        return payload
-
-    def _metrics_detail_payload(
-        self,
-        *,
-        metrics: Mapping[str, object],
-        family: MetricsDetailFamily | None,
-        path: str | None,
-        offset: int,
-        limit: int,
-    ) -> dict[str, object]:
-        summary = dict(self._as_mapping(metrics.get("summary")))
-        families = self._as_mapping(metrics.get("families"))
-        normalized_path = self._normalize_relative_path(path or "")
-        if family is None and not normalized_path:
-            return {
-                "summary": summary,
-                "_hint": "Use family and/or path parameters to access per-item detail.",
-            }
-        normalized_offset = max(0, offset)
-        normalized_limit = max(1, min(limit, 200))
-        family_names: Sequence[str] = (
-            (family,) if family is not None else tuple(sorted(families))
-        )
-        items: list[dict[str, object]] = []
-        for family_name in family_names:
-            family_payload = self._as_mapping(families.get(family_name))
-            for item in self._as_sequence(family_payload.get("items")):
-                item_map = self._as_mapping(item)
-                if normalized_path and not self._metric_item_matches_path(
-                    item_map,
-                    normalized_path,
-                ):
-                    continue
-                compact_item = self._compact_metrics_item(item_map)
-                if family is None:
-                    compact_item = {"family": family_name, **compact_item}
-                items.append(compact_item)
-        if family is None:
-            items.sort(
-                key=lambda item: (
-                    str(item.get("family", "")),
-                    str(item.get("path", "")),
-                    str(item.get("qualname", "")),
-                    _as_int(item.get("start_line", 0), 0),
-                )
-            )
-        page = items[normalized_offset : normalized_offset + normalized_limit]
-        return {
-            "family": family,
-            "path": normalized_path or None,
-            "offset": normalized_offset,
-            "limit": normalized_limit,
-            "returned": len(page),
-            "total": len(items),
-            "has_more": normalized_offset + len(page) < len(items),
-            "items": page,
-        }
-
-    def _metric_item_matches_path(
-        self,
-        item: Mapping[str, object],
-        normalized_path: str,
-    ) -> bool:
-        path_value = (
-            str(item.get("relative_path", "")).strip()
-            or str(item.get("path", "")).strip()
-            or str(item.get("filepath", "")).strip()
-            or str(item.get("file", "")).strip()
-        )
-        if not path_value:
-            return False
-        return self._path_matches(path_value, (normalized_path,))
-
-    @staticmethod
-    def _compact_metrics_item(
-        item: Mapping[str, object],
-    ) -> dict[str, object]:
-        compact: dict[str, object] = {}
-        path_value = (
-            str(item.get("relative_path", "")).strip()
-            or str(item.get("path", "")).strip()
-            or str(item.get("filepath", "")).strip()
-            or str(item.get("file", "")).strip()
-        )
-        if path_value:
-            compact["path"] = path_value
-        for key, value in item.items():
-            if (
-                key not in _COMPACT_ITEM_PATH_KEYS
-                and value not in _COMPACT_ITEM_EMPTY_VALUES
-            ):
-                compact[str(key)] = value
-        return compact
-
-    @staticmethod
-    def _metrics_diff_payload(
-        metrics_diff: MetricsDiff | None,
-    ) -> dict[str, object] | None:
-        if metrics_diff is None:
-            return None
-        new_high_risk_functions = tuple(
-            cast(Sequence[str], getattr(metrics_diff, "new_high_risk_functions", ()))
-        )
-        new_high_coupling_classes = tuple(
-            cast(Sequence[str], getattr(metrics_diff, "new_high_coupling_classes", ()))
-        )
-        new_cycles = tuple(
-            cast(Sequence[object], getattr(metrics_diff, "new_cycles", ()))
-        )
-        new_dead_code = tuple(
-            cast(Sequence[str], getattr(metrics_diff, "new_dead_code", ()))
-        )
-        health_delta = getattr(metrics_diff, "health_delta", 0)
-        return {
-            "new_high_risk_functions": len(new_high_risk_functions),
-            "new_high_coupling_classes": len(new_high_coupling_classes),
-            "new_cycles": len(new_cycles),
-            "new_dead_code": len(new_dead_code),
-            "health_delta": _as_int(health_delta, 0),
-            "typing_param_permille_delta": _as_int(
-                getattr(metrics_diff, "typing_param_permille_delta", 0),
-                0,
-            ),
-            "typing_return_permille_delta": _as_int(
-                getattr(metrics_diff, "typing_return_permille_delta", 0),
-                0,
-            ),
-            "docstring_permille_delta": _as_int(
-                getattr(metrics_diff, "docstring_permille_delta", 0),
-                0,
-            ),
-            "api_breaking_changes": len(
-                tuple(
-                    cast(
-                        Sequence[object],
-                        getattr(metrics_diff, "new_api_breaking_changes", ()),
-                    )
-                )
-            ),
-            "new_api_symbols": len(tuple(getattr(metrics_diff, "new_api_symbols", ()))),
-        }
-
-    def _dict_list(self, value: object) -> list[dict[str, object]]:
-        return [dict(self._as_mapping(item)) for item in self._as_sequence(value)]
-
-    @staticmethod
-    def _as_mapping(value: object) -> Mapping[str, object]:
-        return value if isinstance(value, Mapping) else {}
-
-    @staticmethod
-    def _as_sequence(value: object) -> Sequence[object]:
-        if isinstance(value, Sequence) and not isinstance(
-            value,
-            (str, bytes, bytearray),
-        ):
-            return value
-        return ()
diff --git a/codeclone/meta_markers.py b/codeclone/meta_markers/__init__.py
similarity index 100%
rename from codeclone/meta_markers.py
rename to codeclone/meta_markers/__init__.py
diff --git a/codeclone/metrics/__init__.py b/codeclone/metrics/__init__.py
index 0551b7d..9135843 100644
--- a/codeclone/metrics/__init__.py
+++ b/codeclone/metrics/__init__.py
@@ -3,42 +3,3 @@
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -from .cohesion import cohesion_risk, compute_lcom4 -from .complexity import cyclomatic_complexity, nesting_depth, risk_level -from .coupling import compute_cbo, coupling_risk -from .coverage_join import CoverageJoinParseError, build_coverage_join -from .dead_code import find_suppressed_unused, find_unused -from .dependencies import ( - build_dep_graph, - build_import_graph, - find_cycles, - longest_chains, - max_depth, -) -from .health import HealthInputs, compute_health -from .overloaded_modules import build_overloaded_modules_payload - -__all__ = [ - "CoverageJoinParseError", - "HealthInputs", - "build_coverage_join", - "build_dep_graph", - "build_import_graph", - "build_overloaded_modules_payload", - "cohesion_risk", - "compute_cbo", - "compute_health", - "compute_lcom4", - "coupling_risk", - "cyclomatic_complexity", - "find_cycles", - "find_suppressed_unused", - "find_unused", - "longest_chains", - "max_depth", - "nesting_depth", - "risk_level", -] diff --git a/codeclone/metrics/_base.py b/codeclone/metrics/_base.py new file mode 100644 index 0000000..e34da0e --- /dev/null +++ b/codeclone/metrics/_base.py @@ -0,0 +1,64 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Callable + + from ..models import ( + ClassMetrics, + DeadCandidate, + GroupItemLike, + ModuleApiSurface, + ModuleDep, + ModuleDocstringCoverage, + ModuleTypingCoverage, + SecuritySurface, + ) + +MetricResult = dict[str, object] + + +@dataclass(frozen=True, slots=True) +class MetricAggregate: + project_fields: dict[str, object] + artifacts: dict[str, object] = field(default_factory=dict) + + +@dataclass(slots=True) +class MetricProjectContext: + units: tuple[GroupItemLike, ...] + class_metrics: tuple[ClassMetrics, ...] + module_deps: tuple[ModuleDep, ...] + dead_candidates: tuple[DeadCandidate, ...] + referenced_names: frozenset[str] + referenced_qualnames: frozenset[str] + security_surfaces: tuple[SecuritySurface, ...] = () + typing_modules: tuple[ModuleTypingCoverage, ...] = () + docstring_modules: tuple[ModuleDocstringCoverage, ...] = () + api_modules: tuple[ModuleApiSurface, ...] = () + files_found: int = 0 + files_analyzed_or_cached: int = 0 + function_clone_groups: int = 0 + block_clone_groups: int = 0 + skip_dependencies: bool = False + skip_dead_code: bool = False + memo: dict[str, MetricResult] = field(default_factory=dict) + + +@dataclass(frozen=True, slots=True) +class MetricFamily: + name: str + compute: Callable[[MetricProjectContext], MetricResult] + aggregate: Callable[[list[MetricResult]], MetricAggregate] + report_section: str + baseline_key: str | None + gate_keys: tuple[str, ...] 
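+    # Names the switch that can disable this family; None would mean it always runs.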
+ skippable_flag: str | None diff --git a/codeclone/metrics/complexity.py b/codeclone/metrics/complexity.py index fa98f9d..97808e3 100644 --- a/codeclone/metrics/complexity.py +++ b/codeclone/metrics/complexity.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: from collections.abc import Iterable - from ..cfg_model import CFG + from ..analysis.cfg_model import CFG ControlNode = ( ast.If diff --git a/codeclone/metrics/coverage_join.py b/codeclone/metrics/coverage_join.py index 08c8278..386b16b 100644 --- a/codeclone/metrics/coverage_join.py +++ b/codeclone/metrics/coverage_join.py @@ -10,11 +10,11 @@ from collections.abc import Sequence from dataclasses import dataclass from pathlib import Path -from typing import Literal, cast +from typing import Literal from xml.etree import ElementTree -from .._coerce import as_int, as_str from ..models import CoverageJoinResult, GroupItemLike, UnitCoverageFact +from ..utils.coerce import as_int, as_str __all__ = [ "CoverageJoinParseError", @@ -221,8 +221,10 @@ def _resolve_unit_path(filepath: str) -> str: def _risk_level(value: object) -> _Risk: risk = as_str(value, "low") - if risk in {"low", "medium", "high"}: - return cast(_Risk, risk) + if risk == "medium": + return "medium" + if risk == "high": + return "high" return "low" diff --git a/codeclone/metrics/dead_code.py b/codeclone/metrics/dead_code.py index b6306d9..599dc1c 100644 --- a/codeclone/metrics/dead_code.py +++ b/codeclone/metrics/dead_code.py @@ -9,11 +9,11 @@ from dataclasses import replace from typing import Literal +from ..analysis.suppressions import DEAD_CODE_RULE_ID from ..domain.findings import SYMBOL_KIND_FUNCTION, SYMBOL_KIND_METHOD from ..domain.quality import CONFIDENCE_HIGH, CONFIDENCE_MEDIUM from ..models import DeadCandidate, DeadItem from ..paths import is_test_filepath -from ..suppressions import DEAD_CODE_RULE_ID _TEST_NAME_PREFIXES = ("test_", "pytest_") _DYNAMIC_METHOD_PREFIXES = ("visit_",) diff --git a/codeclone/metrics/dependencies.py b/codeclone/metrics/dependencies.py index 48ba032..573cc9e 100644 --- a/codeclone/metrics/dependencies.py +++ b/codeclone/metrics/dependencies.py @@ -6,6 +6,7 @@ from __future__ import annotations +from math import ceil from typing import TYPE_CHECKING from ..models import DepGraph, ModuleDep @@ -16,6 +17,37 @@ DepAdjacency = dict[str, set[str]] +def _internal_roots( + modules: Iterable[str], + deps: Sequence[ModuleDep], +) -> frozenset[str]: + roots: set[str] = set() + for module_name in modules: + if module_name: + roots.add(module_name.split(".", 1)[0]) + for dep in deps: + if dep.source: + roots.add(dep.source.split(".", 1)[0]) + return frozenset(sorted(roots)) + + +def _is_internal_target(target: str, *, internal_roots: frozenset[str]) -> bool: + if not target: + return False + return target.split(".", 1)[0] in internal_roots + + +def _unique_sorted_edges(deps: Sequence[ModuleDep]) -> tuple[ModuleDep, ...]: + return tuple( + sorted( + { + (dep.source, dep.target, dep.import_type, dep.line): dep for dep in deps + }.values(), + key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line), + ) + ) + + def build_import_graph( *, modules: Iterable[str], @@ -123,6 +155,23 @@ def max_depth(graph: DepAdjacency) -> int: return best +def depth_profile(graph: DepAdjacency) -> tuple[float, int]: + if not graph: + return 0.0, 0 + + memo: dict[str, int] = {} + depths = sorted( + _longest_path_from(node, graph=graph, visiting=set(), memo=memo) + for node in sorted(graph) + ) + if not depths: + return 0.0, 0 + + avg_depth = sum(depths) / len(depths) + 
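+    # Worked example (illustrative): for depths = [1, 1, 2, 2, 3, 7],
+    # avg_depth = 16 / 6 ≈ 2.67 and ceil(6 * 0.95) - 1 = 5, so p95 = depths[5] = 7.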
percentile_index = max(0, ceil(len(depths) * 0.95) - 1) + return avg_depth, int(depths[percentile_index]) + + def _longest_path_nodes_from( node: str, *, @@ -180,22 +229,44 @@ def longest_chains( def build_dep_graph(*, modules: Iterable[str], deps: Sequence[ModuleDep]) -> DepGraph: - graph = build_import_graph(modules=modules, deps=deps) - cycles = find_cycles(graph) - depth = max_depth(graph) - chains = longest_chains(graph) - unique_edges = tuple( + base_modules = frozenset( sorted( { - (dep.source, dep.target, dep.import_type, dep.line): dep for dep in deps - }.values(), - key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line), + str(module_name).strip() + for module_name in modules + if str(module_name).strip() + } ) ) + internal_roots = _internal_roots(base_modules, deps) + internal_edges = _unique_sorted_edges( + tuple( + dep + for dep in deps + if dep.source + and _is_internal_target(dep.target, internal_roots=internal_roots) + ) + ) + graph_modules = frozenset( + sorted( + { + *base_modules, + *(dep.source for dep in internal_edges if dep.source), + *(dep.target for dep in internal_edges if dep.target), + } + ) + ) + graph = build_import_graph(modules=graph_modules, deps=internal_edges) + cycles = find_cycles(graph) + depth = max_depth(graph) + avg_depth, p95_depth = depth_profile(graph) + chains = longest_chains(graph) return DepGraph( modules=frozenset(graph.keys()), - edges=unique_edges, + edges=internal_edges, cycles=cycles, max_depth=depth, + avg_depth=avg_depth, + p95_depth=p95_depth, longest_chains=chains, ) diff --git a/codeclone/metrics/health.py b/codeclone/metrics/health.py index 9f0ab67..354bb5a 100644 --- a/codeclone/metrics/health.py +++ b/codeclone/metrics/health.py @@ -7,9 +7,16 @@ from __future__ import annotations from dataclasses import dataclass +from math import ceil from typing import Literal -from ..contracts import HEALTH_WEIGHTS +from ..contracts import ( + HEALTH_DEPENDENCY_CYCLE_PENALTY, + HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER, + HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY, + HEALTH_DEPENDENCY_DEPTH_P95_MARGIN, + HEALTH_WEIGHTS, +) from ..models import HealthScore @@ -29,6 +36,8 @@ class HealthInputs: low_cohesion_classes: int dependency_cycles: int dependency_max_depth: int + dependency_avg_depth: float + dependency_p95_depth: int dead_code_items: int @@ -54,6 +63,26 @@ def _safe_div(numerator: float, denominator: float) -> float: return numerator / denominator +def _dependency_expected_tail(*, avg_depth: float, p95_depth: int) -> int: + avg_based = ceil(max(0.0, avg_depth) * HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER) + p95_based = max(0, p95_depth) + HEALTH_DEPENDENCY_DEPTH_P95_MARGIN + return max(avg_based, p95_based) + + +def _dependency_tail_pressure( + *, + max_depth: int, + avg_depth: float, + p95_depth: int, +) -> int: + if max_depth <= 0: + return 0 + return max( + 0, + max_depth - _dependency_expected_tail(avg_depth=avg_depth, p95_depth=p95_depth), + ) + + # Piecewise clone-density curve: mild penalty for low density, # steep in the structural-debt zone, brutal when it's systemic. _CLONE_BREAKPOINTS: tuple[tuple[float, float], ...] 
= ( @@ -104,8 +133,13 @@ def compute_health(inputs: HealthInputs) -> HealthScore: dead_code_score = _clamp_score(100 - inputs.dead_code_items * 8) dependency_score = _clamp_score( 100 - - inputs.dependency_cycles * 25 - - max(0, inputs.dependency_max_depth - 6) * 4 + - inputs.dependency_cycles * HEALTH_DEPENDENCY_CYCLE_PENALTY + - _dependency_tail_pressure( + max_depth=inputs.dependency_max_depth, + avg_depth=inputs.dependency_avg_depth, + p95_depth=inputs.dependency_p95_depth, + ) + * HEALTH_DEPENDENCY_DEPTH_LEVEL_PENALTY ) coverage_score = _clamp_score( _safe_div(inputs.files_analyzed_or_cached * 100.0, max(1, inputs.files_found)) diff --git a/codeclone/metrics/overloaded_modules.py b/codeclone/metrics/overloaded_modules.py index 46b414b..e151879 100644 --- a/codeclone/metrics/overloaded_modules.py +++ b/codeclone/metrics/overloaded_modules.py @@ -11,7 +11,6 @@ from collections.abc import Sequence from math import floor -from .._coerce import as_float, as_int, as_sequence, as_str from ..domain.source_scope import ( SOURCE_KIND_FIXTURES, SOURCE_KIND_OTHER, @@ -20,6 +19,7 @@ ) from ..models import ClassMetrics, GroupItemLike, ModuleDep from ..scanner import module_name_from_path +from ..utils.coerce import as_float, as_int, as_sequence, as_str _CANDIDATE = "candidate" _NON_CANDIDATE = "non_candidate" diff --git a/codeclone/metrics/registry.py b/codeclone/metrics/registry.py new file mode 100644 index 0000000..23c6df8 --- /dev/null +++ b/codeclone/metrics/registry.py @@ -0,0 +1,755 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Callable +from typing import TypeGuard + +from ..domain.findings import CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING +from ..domain.quality import RISK_HIGH +from ..models import ( + ApiSurfaceSnapshot, + DeadItem, + DepGraph, + HealthScore, + ModuleDep, + ModuleDocstringCoverage, + ModuleTypingCoverage, + ProjectMetrics, +) +from ..utils.coerce import as_int as _as_int +from ..utils.coerce import as_str as _as_str +from ._base import MetricAggregate, MetricFamily, MetricProjectContext, MetricResult +from .dead_code import find_unused +from .dependencies import build_dep_graph +from .health import HealthInputs, compute_health + + +def _group_item_sort_key(item: object) -> tuple[str, int, int, str]: + if not isinstance(item, dict): + return "", 0, 0, "" + return ( + _as_str(item.get("filepath")), + _as_int(item.get("start_line")), + _as_int(item.get("end_line")), + _as_str(item.get("qualname")), + ) + + +def _class_metric_sort_key(metric: object) -> tuple[str, int, int, str]: + filepath = getattr(metric, "filepath", "") + start_line = getattr(metric, "start_line", 0) + end_line = getattr(metric, "end_line", 0) + qualname = getattr(metric, "qualname", "") + return str(filepath), int(start_line), int(end_line), str(qualname) + + +def _module_names_from_units(units: tuple[object, ...]) -> frozenset[str]: + modules: set[str] = set() + for item in units: + if not isinstance(item, dict): + continue + qualname = _as_str(item.get("qualname")) + module_name = qualname.split(":", 1)[0] if ":" in qualname else qualname + if module_name: + modules.add(module_name) + return frozenset(sorted(modules)) + + +def _empty_dep_graph() -> DepGraph: + return DepGraph( + 
modules=frozenset(), + edges=(), + cycles=(), + max_depth=0, + avg_depth=0.0, + p95_depth=0, + longest_chains=(), + ) + + +_EMPTY_HEALTH_SCORE = compute_health( + HealthInputs( + files_found=0, + files_analyzed_or_cached=0, + function_clone_groups=0, + block_clone_groups=0, + complexity_avg=0.0, + complexity_max=0, + high_risk_functions=0, + coupling_avg=0.0, + coupling_max=0, + high_risk_classes=0, + cohesion_avg=0.0, + low_cohesion_classes=0, + dependency_cycles=0, + dependency_max_depth=0, + dependency_avg_depth=0.0, + dependency_p95_depth=0, + dead_code_items=0, + ) +) + + +def _is_tuple_of_str(value: object) -> TypeGuard[tuple[str, ...]]: + return isinstance(value, tuple) and all(isinstance(item, str) for item in value) + + +def _is_tuple_of_tuple_str(value: object) -> TypeGuard[tuple[tuple[str, ...], ...]]: + return isinstance(value, tuple) and all(_is_tuple_of_str(item) for item in value) + + +def _is_tuple_of_dead_items(value: object) -> TypeGuard[tuple[DeadItem, ...]]: + return isinstance(value, tuple) and all( + isinstance(item, DeadItem) for item in value + ) + + +def _is_tuple_of_module_deps(value: object) -> TypeGuard[tuple[ModuleDep, ...]]: + return isinstance(value, tuple) and all( + isinstance(item, ModuleDep) for item in value + ) + + +def _is_tuple_of_typing_modules( + value: object, +) -> TypeGuard[tuple[ModuleTypingCoverage, ...]]: + return isinstance(value, tuple) and all( + isinstance(item, ModuleTypingCoverage) for item in value + ) + + +def _is_tuple_of_docstring_modules( + value: object, +) -> TypeGuard[tuple[ModuleDocstringCoverage, ...]]: + return isinstance(value, tuple) and all( + isinstance(item, ModuleDocstringCoverage) for item in value + ) + + +def project_metrics_defaults() -> dict[str, object]: + return { + "complexity_avg": 0.0, + "complexity_max": 0, + "high_risk_functions": (), + "coupling_avg": 0.0, + "coupling_max": 0, + "high_risk_classes": (), + "cohesion_avg": 0.0, + "cohesion_max": 0, + "low_cohesion_classes": (), + "dependency_modules": 0, + "dependency_edges": 0, + "dependency_edge_list": (), + "dependency_cycles": (), + "dependency_max_depth": 0, + "dependency_longest_chains": (), + "dead_code": (), + "health": _EMPTY_HEALTH_SCORE, + "typing_param_total": 0, + "typing_param_annotated": 0, + "typing_return_total": 0, + "typing_return_annotated": 0, + "typing_any_count": 0, + "docstring_public_total": 0, + "docstring_public_documented": 0, + "typing_modules": (), + "docstring_modules": (), + "api_surface": None, + } + + +def build_project_metrics(project_fields: dict[str, object]) -> ProjectMetrics: + return ProjectMetrics( + complexity_avg=_result_float(project_fields, "complexity_avg"), + complexity_max=_result_int(project_fields, "complexity_max"), + high_risk_functions=_result_tuple_str(project_fields, "high_risk_functions"), + coupling_avg=_result_float(project_fields, "coupling_avg"), + coupling_max=_result_int(project_fields, "coupling_max"), + high_risk_classes=_result_tuple_str(project_fields, "high_risk_classes"), + cohesion_avg=_result_float(project_fields, "cohesion_avg"), + cohesion_max=_result_int(project_fields, "cohesion_max"), + low_cohesion_classes=_result_tuple_str(project_fields, "low_cohesion_classes"), + dependency_modules=_result_int(project_fields, "dependency_modules"), + dependency_edges=_result_int(project_fields, "dependency_edges"), + dependency_edge_list=_result_module_deps( + project_fields, + "dependency_edge_list", + ), + dependency_cycles=_result_nested_tuple_str( + project_fields, + "dependency_cycles", + ), + 
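+        # Every field passes through a typed _result_* coercer, so malformed
+        # aggregate output degrades to a safe default instead of raising.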
dependency_max_depth=_result_int(project_fields, "dependency_max_depth"), + dependency_longest_chains=_result_nested_tuple_str( + project_fields, + "dependency_longest_chains", + ), + dead_code=_result_dead_items(project_fields, "dead_code"), + health=_result_health(project_fields, "health"), + typing_param_total=_result_int(project_fields, "typing_param_total"), + typing_param_annotated=_result_int(project_fields, "typing_param_annotated"), + typing_return_total=_result_int(project_fields, "typing_return_total"), + typing_return_annotated=_result_int( + project_fields, + "typing_return_annotated", + ), + typing_any_count=_result_int(project_fields, "typing_any_count"), + docstring_public_total=_result_int(project_fields, "docstring_public_total"), + docstring_public_documented=_result_int( + project_fields, + "docstring_public_documented", + ), + typing_modules=_result_typing_modules(project_fields, "typing_modules"), + docstring_modules=_result_docstring_modules( + project_fields, + "docstring_modules", + ), + api_surface=_result_api_surface(project_fields, "api_surface"), + ) + + +def _result_float(result: dict[str, object], key: str) -> float: + value = result.get(key) + return float(value) if isinstance(value, int | float) else 0.0 + + +def _result_int(result: dict[str, object], key: str) -> int: + return _as_int(result.get(key), 0) + + +def _result_tuple_str(result: dict[str, object], key: str) -> tuple[str, ...]: + value = result.get(key, ()) + return value if _is_tuple_of_str(value) else () + + +def _result_nested_tuple_str( + result: dict[str, object], + key: str, +) -> tuple[tuple[str, ...], ...]: + value = result.get(key, ()) + return value if _is_tuple_of_tuple_str(value) else () + + +def _result_dead_items( + result: dict[str, object], + key: str, +) -> tuple[DeadItem, ...]: + value = result.get(key, ()) + return value if _is_tuple_of_dead_items(value) else () + + +def _result_module_deps( + result: dict[str, object], + key: str, +) -> tuple[ModuleDep, ...]: + value = result.get(key, ()) + return value if _is_tuple_of_module_deps(value) else () + + +def _result_health(result: dict[str, object], key: str) -> HealthScore: + value = result.get(key) + return value if isinstance(value, HealthScore) else _EMPTY_HEALTH_SCORE + + +def _result_typing_modules( + result: dict[str, object], + key: str, +) -> tuple[ModuleTypingCoverage, ...]: + value = result.get(key, ()) + return value if _is_tuple_of_typing_modules(value) else () + + +def _result_docstring_modules( + result: dict[str, object], + key: str, +) -> tuple[ModuleDocstringCoverage, ...]: + value = result.get(key, ()) + return value if _is_tuple_of_docstring_modules(value) else () + + +def _result_api_surface( + result: dict[str, object], + key: str, +) -> ApiSurfaceSnapshot | None: + value = result.get(key) + return value if isinstance(value, ApiSurfaceSnapshot) else None + + +def _memoized_result( + context: MetricProjectContext, + *, + family_name: str, + builder: Callable[[MetricProjectContext], MetricResult], +) -> MetricResult: + cached = context.memo.get(family_name) + if cached is not None: + return cached + result = builder(context) + context.memo[family_name] = result + return result + + +def _first_result(results: list[MetricResult]) -> MetricResult: + return results[0] if results else {} + + +def _build_complexity_result(context: MetricProjectContext) -> MetricResult: + unit_rows = tuple(sorted(context.units, key=_group_item_sort_key)) + complexities = tuple( + max(1, _as_int(row.get("cyclomatic_complexity"), 1)) for 
row in unit_rows + ) + complexity_max = max(complexities) if complexities else 0 + complexity_avg = ( + float(sum(complexities)) / float(len(complexities)) if complexities else 0.0 + ) + high_risk_functions = tuple( + sorted( + { + _as_str(row.get("qualname")) + for row in unit_rows + if _as_str(row.get("risk")) == RISK_HIGH + } + ) + ) + return { + "complexity_avg": complexity_avg, + "complexity_max": complexity_max, + "high_risk_functions": high_risk_functions, + } + + +def _summarize_class_metric_family( + context: MetricProjectContext, + *, + value_attr: str, + risk_attr: str, +) -> tuple[float, int, tuple[str, ...]]: + classes_sorted = tuple(sorted(context.class_metrics, key=_class_metric_sort_key)) + values = tuple( + _as_int(getattr(metric, value_attr, 0), 0) for metric in classes_sorted + ) + value_max = max(values) if values else 0 + value_avg = float(sum(values)) / float(len(values)) if values else 0.0 + high_risk_symbols = tuple( + sorted( + { + metric.qualname + for metric in classes_sorted + if str(getattr(metric, risk_attr, "")) == RISK_HIGH + } + ) + ) + return value_avg, value_max, high_risk_symbols + + +def _compute_complexity_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name=CATEGORY_COMPLEXITY, + builder=_build_complexity_result, + ) + + +def _aggregate_complexity_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + return MetricAggregate( + project_fields={ + "complexity_avg": _result_float(result, "complexity_avg"), + "complexity_max": _result_int(result, "complexity_max"), + "high_risk_functions": _result_tuple_str(result, "high_risk_functions"), + } + ) + + +def _build_coupling_result(context: MetricProjectContext) -> MetricResult: + coupling_avg, coupling_max, high_risk_classes = _summarize_class_metric_family( + context, + value_attr="cbo", + risk_attr="risk_coupling", + ) + return { + "coupling_avg": coupling_avg, + "coupling_max": coupling_max, + "high_risk_classes": high_risk_classes, + } + + +def _compute_coupling_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name=CATEGORY_COUPLING, + builder=_build_coupling_result, + ) + + +def _aggregate_coupling_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + return MetricAggregate( + project_fields={ + "coupling_avg": _result_float(result, "coupling_avg"), + "coupling_max": _result_int(result, "coupling_max"), + "high_risk_classes": _result_tuple_str(result, "high_risk_classes"), + } + ) + + +def _build_cohesion_result(context: MetricProjectContext) -> MetricResult: + cohesion_avg, cohesion_max, low_cohesion_classes = _summarize_class_metric_family( + context, + value_attr="lcom4", + risk_attr="risk_cohesion", + ) + return { + "cohesion_avg": cohesion_avg, + "cohesion_max": cohesion_max, + "low_cohesion_classes": low_cohesion_classes, + } + + +def _compute_cohesion_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name=CATEGORY_COHESION, + builder=_build_cohesion_result, + ) + + +def _aggregate_cohesion_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + return MetricAggregate( + project_fields={ + "cohesion_avg": _result_float(result, "cohesion_avg"), + "cohesion_max": _result_int(result, "cohesion_max"), + "low_cohesion_classes": _result_tuple_str(result, "low_cohesion_classes"), + } + ) + + +def _build_dependencies_result(context: 
MetricProjectContext) -> MetricResult: + dep_graph = _empty_dep_graph() + if not context.skip_dependencies: + dep_graph = build_dep_graph( + modules=_module_names_from_units(tuple(context.units)), + deps=context.module_deps, + ) + return { + "dependency_modules": len(dep_graph.modules), + "dependency_edges": len(dep_graph.edges), + "dependency_edge_list": dep_graph.edges, + "dependency_cycles": dep_graph.cycles, + "dependency_max_depth": dep_graph.max_depth, + "dependency_avg_depth": dep_graph.avg_depth, + "dependency_p95_depth": dep_graph.p95_depth, + "dependency_longest_chains": dep_graph.longest_chains, + "dep_graph": dep_graph, + } + + +def _compute_dependencies_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name="dependencies", + builder=_build_dependencies_result, + ) + + +def _aggregate_dependencies_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + dep_graph = result.get("dep_graph") + return MetricAggregate( + project_fields={ + "dependency_modules": _result_int(result, "dependency_modules"), + "dependency_edges": _result_int(result, "dependency_edges"), + "dependency_edge_list": _result_module_deps(result, "dependency_edge_list"), + "dependency_cycles": _result_nested_tuple_str(result, "dependency_cycles"), + "dependency_max_depth": _result_int(result, "dependency_max_depth"), + "dependency_longest_chains": _result_nested_tuple_str( + result, + "dependency_longest_chains", + ), + }, + artifacts=({"dep_graph": dep_graph} if isinstance(dep_graph, DepGraph) else {}), + ) + + +def _build_dead_code_result(context: MetricProjectContext) -> MetricResult: + dead_items: tuple[DeadItem, ...] = () + if not context.skip_dead_code: + dead_items = find_unused( + definitions=tuple(context.dead_candidates), + referenced_names=context.referenced_names, + referenced_qualnames=context.referenced_qualnames, + ) + return { + "dead_code": dead_items, + "dead_items": dead_items, + } + + +def _compute_dead_code_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name="dead_code", + builder=_build_dead_code_result, + ) + + +def _aggregate_dead_code_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + dead_items = result.get("dead_items") + return MetricAggregate( + project_fields={ + "dead_code": _result_dead_items(result, "dead_code"), + }, + artifacts=({"dead_items": dead_items} if isinstance(dead_items, tuple) else {}), + ) + + +def _build_health_result(context: MetricProjectContext) -> MetricResult: + complexity = _compute_complexity_family(context) + coupling = _compute_coupling_family(context) + cohesion = _compute_cohesion_family(context) + dependencies = _compute_dependencies_family(context) + dead_code = _compute_dead_code_family(context) + health = compute_health( + HealthInputs( + files_found=context.files_found, + files_analyzed_or_cached=context.files_analyzed_or_cached, + function_clone_groups=context.function_clone_groups, + block_clone_groups=context.block_clone_groups, + complexity_avg=_result_float(complexity, "complexity_avg"), + complexity_max=_result_int(complexity, "complexity_max"), + high_risk_functions=len( + _result_tuple_str(complexity, "high_risk_functions") + ), + coupling_avg=_result_float(coupling, "coupling_avg"), + coupling_max=_result_int(coupling, "coupling_max"), + high_risk_classes=len(_result_tuple_str(coupling, "high_risk_classes")), + cohesion_avg=_result_float(cohesion, 
"cohesion_avg"), + low_cohesion_classes=len( + _result_tuple_str(cohesion, "low_cohesion_classes") + ), + dependency_cycles=len( + _result_nested_tuple_str(dependencies, "dependency_cycles") + ), + dependency_max_depth=_result_int(dependencies, "dependency_max_depth"), + dependency_avg_depth=_result_float(dependencies, "dependency_avg_depth"), + dependency_p95_depth=_result_int(dependencies, "dependency_p95_depth"), + dead_code_items=len(_result_dead_items(dead_code, "dead_code")), + ) + ) + return {"health": health} + + +def _compute_health_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name="health", + builder=_build_health_result, + ) + + +def _aggregate_health_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + return MetricAggregate(project_fields={"health": _result_health(result, "health")}) + + +def _build_coverage_adoption_result(context: MetricProjectContext) -> MetricResult: + typing_rows = tuple( + sorted(context.typing_modules, key=lambda item: (item.filepath, item.module)) + ) + docstring_rows = tuple( + sorted(context.docstring_modules, key=lambda item: (item.filepath, item.module)) + ) + return { + "typing_param_total": sum(item.params_total for item in typing_rows), + "typing_param_annotated": sum(item.params_annotated for item in typing_rows), + "typing_return_total": sum(item.returns_total for item in typing_rows), + "typing_return_annotated": sum(item.returns_annotated for item in typing_rows), + "typing_any_count": sum(item.any_annotation_count for item in typing_rows), + "docstring_public_total": sum( + item.public_symbol_total for item in docstring_rows + ), + "docstring_public_documented": sum( + item.public_symbol_documented for item in docstring_rows + ), + "typing_modules": typing_rows, + "docstring_modules": docstring_rows, + } + + +def _compute_coverage_adoption_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name="coverage_adoption", + builder=_build_coverage_adoption_result, + ) + + +def _aggregate_coverage_adoption_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + return MetricAggregate( + project_fields={ + "typing_param_total": _result_int(result, "typing_param_total"), + "typing_param_annotated": _result_int(result, "typing_param_annotated"), + "typing_return_total": _result_int(result, "typing_return_total"), + "typing_return_annotated": _result_int( + result, + "typing_return_annotated", + ), + "typing_any_count": _result_int(result, "typing_any_count"), + "docstring_public_total": _result_int(result, "docstring_public_total"), + "docstring_public_documented": _result_int( + result, + "docstring_public_documented", + ), + "typing_modules": _result_typing_modules(result, "typing_modules"), + "docstring_modules": _result_docstring_modules( + result, + "docstring_modules", + ), + } + ) + + +def _build_api_surface_result(context: MetricProjectContext) -> MetricResult: + api_rows = tuple( + sorted(context.api_modules, key=lambda item: (item.filepath, item.module)) + ) + return { + "api_surface": ApiSurfaceSnapshot(modules=api_rows) if api_rows else None, + } + + +def _compute_api_surface_family(context: MetricProjectContext) -> MetricResult: + return _memoized_result( + context, + family_name="api_surface", + builder=_build_api_surface_result, + ) + + +def _aggregate_api_surface_family(results: list[MetricResult]) -> MetricAggregate: + result = _first_result(results) + 
return MetricAggregate(project_fields={"api_surface": result.get("api_surface")}) + + +def _compute_report_only_family(_context: MetricProjectContext) -> MetricResult: + return {} + + +def _aggregate_empty_family(_results: list[MetricResult]) -> MetricAggregate: + return MetricAggregate(project_fields={}) + + +METRIC_FAMILIES: dict[str, MetricFamily] = { + CATEGORY_COMPLEXITY: MetricFamily( + name=CATEGORY_COMPLEXITY, + compute=_compute_complexity_family, + aggregate=_aggregate_complexity_family, + report_section=CATEGORY_COMPLEXITY, + baseline_key="max_complexity", + gate_keys=("complexity_threshold", "new_high_risk_functions"), + skippable_flag="skip_metrics", + ), + CATEGORY_COUPLING: MetricFamily( + name=CATEGORY_COUPLING, + compute=_compute_coupling_family, + aggregate=_aggregate_coupling_family, + report_section=CATEGORY_COUPLING, + baseline_key="max_coupling", + gate_keys=("coupling_threshold", "new_high_coupling_classes"), + skippable_flag="skip_metrics", + ), + CATEGORY_COHESION: MetricFamily( + name=CATEGORY_COHESION, + compute=_compute_cohesion_family, + aggregate=_aggregate_cohesion_family, + report_section=CATEGORY_COHESION, + baseline_key="max_cohesion", + gate_keys=("cohesion_threshold",), + skippable_flag="skip_metrics", + ), + "dependencies": MetricFamily( + name="dependencies", + compute=_compute_dependencies_family, + aggregate=_aggregate_dependencies_family, + report_section="dependencies", + baseline_key="dependency_cycles", + gate_keys=("dependency_cycles", "new_dependency_cycles"), + skippable_flag="skip_metrics", + ), + "dead_code": MetricFamily( + name="dead_code", + compute=_compute_dead_code_family, + aggregate=_aggregate_dead_code_family, + report_section="dead_code", + baseline_key="dead_code_items", + gate_keys=("dead_code_high_confidence", "new_dead_code"), + skippable_flag="skip_metrics", + ), + "health": MetricFamily( + name="health", + compute=_compute_health_family, + aggregate=_aggregate_health_family, + report_section="health", + baseline_key="health_score", + gate_keys=("health_threshold", "health_regression"), + skippable_flag="skip_metrics", + ), + "coverage_adoption": MetricFamily( + name="coverage_adoption", + compute=_compute_coverage_adoption_family, + aggregate=_aggregate_coverage_adoption_family, + report_section="coverage_adoption", + baseline_key="typing_param_permille", + gate_keys=( + "typing_coverage_threshold", + "docstring_coverage_threshold", + "typing_regression", + "docstring_regression", + ), + skippable_flag="skip_metrics", + ), + "api_surface": MetricFamily( + name="api_surface", + compute=_compute_api_surface_family, + aggregate=_aggregate_api_surface_family, + report_section="api_surface", + baseline_key=None, + gate_keys=("api_breaking_changes",), + skippable_flag="skip_metrics", + ), + "overloaded_modules": MetricFamily( + name="overloaded_modules", + compute=_compute_report_only_family, + aggregate=_aggregate_empty_family, + report_section="overloaded_modules", + baseline_key=None, + gate_keys=(), + skippable_flag="skip_metrics", + ), + "security_surfaces": MetricFamily( + name="security_surfaces", + compute=_compute_report_only_family, + aggregate=_aggregate_empty_family, + report_section="security_surfaces", + baseline_key=None, + gate_keys=(), + skippable_flag="skip_metrics", + ), + "coverage_join": MetricFamily( + name="coverage_join", + compute=_compute_report_only_family, + aggregate=_aggregate_empty_family, + report_section="coverage_join", + baseline_key=None, + gate_keys=("coverage_hotspots",), + 
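+        # Report-only family: its payload is produced elsewhere, but it still
+        # contributes the coverage_hotspots gate.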
skippable_flag="skip_metrics", + ), +} diff --git a/codeclone/metrics_baseline.py b/codeclone/metrics_baseline.py deleted file mode 100644 index ed4197e..0000000 --- a/codeclone/metrics_baseline.py +++ /dev/null @@ -1,1317 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. -# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import hashlib -import hmac -from datetime import datetime, timezone -from enum import Enum -from json import JSONDecodeError -from pathlib import Path -from typing import TYPE_CHECKING, Any, Final, Literal, cast - -import orjson - -from . import __version__ -from ._json_io import read_json_object as _read_json_object -from ._json_io import write_json_document_atomically as _write_json_document_atomically -from ._schema_validation import validate_top_level_structure -from .baseline import current_python_tag -from .cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime -from .contracts import BASELINE_SCHEMA_VERSION, METRICS_BASELINE_SCHEMA_VERSION -from .errors import BaselineValidationError -from .metrics.api_surface import compare_api_surfaces -from .models import ( - ApiBreakingChange, - ApiParamSpec, - ApiSurfaceSnapshot, - MetricsDiff, - MetricsSnapshot, - ModuleApiSurface, - ProjectMetrics, - PublicSymbol, -) - -if TYPE_CHECKING: - from collections.abc import Mapping - -METRICS_BASELINE_GENERATOR: Final = "codeclone" -MAX_METRICS_BASELINE_SIZE_BYTES: Final = 5 * 1024 * 1024 - - -class MetricsBaselineStatus(str, Enum): - OK = "ok" - MISSING = "missing" - TOO_LARGE = "too_large" - INVALID_JSON = "invalid_json" - INVALID_TYPE = "invalid_type" - MISSING_FIELDS = "missing_fields" - MISMATCH_SCHEMA_VERSION = "mismatch_schema_version" - MISMATCH_PYTHON_VERSION = "mismatch_python_version" - GENERATOR_MISMATCH = "generator_mismatch" - INTEGRITY_MISSING = "integrity_missing" - INTEGRITY_FAILED = "integrity_failed" - - -METRICS_BASELINE_UNTRUSTED_STATUSES: Final[frozenset[MetricsBaselineStatus]] = ( - frozenset( - { - MetricsBaselineStatus.MISSING, - MetricsBaselineStatus.TOO_LARGE, - MetricsBaselineStatus.INVALID_JSON, - MetricsBaselineStatus.INVALID_TYPE, - MetricsBaselineStatus.MISSING_FIELDS, - MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION, - MetricsBaselineStatus.MISMATCH_PYTHON_VERSION, - MetricsBaselineStatus.GENERATOR_MISMATCH, - MetricsBaselineStatus.INTEGRITY_MISSING, - MetricsBaselineStatus.INTEGRITY_FAILED, - } - ) -) - -_TOP_LEVEL_REQUIRED_KEYS = frozenset({"meta", "metrics"}) -_TOP_LEVEL_ALLOWED_KEYS = _TOP_LEVEL_REQUIRED_KEYS | frozenset( - {"clones", "api_surface"} -) -_META_REQUIRED_KEYS = frozenset( - {"generator", "schema_version", "python_tag", "created_at", "payload_sha256"} -) -_METRICS_REQUIRED_KEYS = frozenset( - { - "max_complexity", - "high_risk_functions", - "max_coupling", - "high_coupling_classes", - "max_cohesion", - "low_cohesion_classes", - "dependency_cycles", - "dependency_max_depth", - "dead_code_items", - "health_score", - "health_grade", - } -) -_METRICS_OPTIONAL_KEYS = frozenset( - { - "typing_param_permille", - "typing_return_permille", - "docstring_permille", - "typing_any_count", - } -) -_METRICS_PAYLOAD_SHA256_KEY = "metrics_payload_sha256" -_API_SURFACE_PAYLOAD_SHA256_KEY = "api_surface_payload_sha256" - - -def coerce_metrics_baseline_status( - raw_status: str | MetricsBaselineStatus | None, -) 
-> MetricsBaselineStatus: - if isinstance(raw_status, MetricsBaselineStatus): - return raw_status - if isinstance(raw_status, str): - try: - return MetricsBaselineStatus(raw_status) - except ValueError: - return MetricsBaselineStatus.INVALID_TYPE - return MetricsBaselineStatus.INVALID_TYPE - - -def snapshot_from_project_metrics(project_metrics: ProjectMetrics) -> MetricsSnapshot: - return MetricsSnapshot( - max_complexity=int(project_metrics.complexity_max), - high_risk_functions=tuple(sorted(set(project_metrics.high_risk_functions))), - max_coupling=int(project_metrics.coupling_max), - high_coupling_classes=tuple(sorted(set(project_metrics.high_risk_classes))), - max_cohesion=int(project_metrics.cohesion_max), - low_cohesion_classes=tuple(sorted(set(project_metrics.low_cohesion_classes))), - dependency_cycles=tuple( - sorted({tuple(cycle) for cycle in project_metrics.dependency_cycles}) - ), - dependency_max_depth=int(project_metrics.dependency_max_depth), - dead_code_items=tuple( - sorted({item.qualname for item in project_metrics.dead_code}) - ), - health_score=int(project_metrics.health.total), - health_grade=project_metrics.health.grade, - typing_param_permille=_permille( - project_metrics.typing_param_annotated, - project_metrics.typing_param_total, - ), - typing_return_permille=_permille( - project_metrics.typing_return_annotated, - project_metrics.typing_return_total, - ), - docstring_permille=_permille( - project_metrics.docstring_public_documented, - project_metrics.docstring_public_total, - ), - typing_any_count=int(project_metrics.typing_any_count), - ) - - -def _permille(numerator: int, denominator: int) -> int: - if denominator <= 0: - return 0 - return round((1000.0 * float(numerator)) / float(denominator)) - - -def _canonical_json(payload: object) -> str: - return orjson.dumps(payload, option=orjson.OPT_SORT_KEYS).decode("utf-8") - - -def _snapshot_payload( - snapshot: MetricsSnapshot, - *, - include_adoption: bool = True, -) -> dict[str, object]: - payload: dict[str, object] = { - "max_complexity": int(snapshot.max_complexity), - "high_risk_functions": list(snapshot.high_risk_functions), - "max_coupling": int(snapshot.max_coupling), - "high_coupling_classes": list(snapshot.high_coupling_classes), - "max_cohesion": int(snapshot.max_cohesion), - "low_cohesion_classes": list(snapshot.low_cohesion_classes), - "dependency_cycles": [list(cycle) for cycle in snapshot.dependency_cycles], - "dependency_max_depth": int(snapshot.dependency_max_depth), - "dead_code_items": list(snapshot.dead_code_items), - "health_score": int(snapshot.health_score), - "health_grade": snapshot.health_grade, - } - if include_adoption: - payload.update( - { - "typing_param_permille": int(snapshot.typing_param_permille), - "typing_return_permille": int(snapshot.typing_return_permille), - "docstring_permille": int(snapshot.docstring_permille), - "typing_any_count": int(snapshot.typing_any_count), - } - ) - return payload - - -def _compute_payload_sha256( - snapshot: MetricsSnapshot, - *, - include_adoption: bool = True, -) -> str: - canonical = _canonical_json( - _snapshot_payload(snapshot, include_adoption=include_adoption) - ) - return hashlib.sha256(canonical.encode("utf-8")).hexdigest() - - -def _now_utc_z() -> str: - return ( - datetime.now(timezone.utc) - .replace(microsecond=0) - .isoformat() - .replace( - "+00:00", - "Z", - ) - ) - - -class MetricsBaseline: - __slots__ = ( - "api_surface_payload_sha256", - "api_surface_snapshot", - "created_at", - "generator_name", - "generator_version", - 
"has_coverage_adoption_snapshot", - "is_embedded_in_clone_baseline", - "path", - "payload_sha256", - "python_tag", - "schema_version", - "snapshot", - ) - - def __init__(self, path: str | Path) -> None: - self.path = Path(path) - self.generator_name: str | None = None - self.generator_version: str | None = None - self.schema_version: str | None = None - self.python_tag: str | None = None - self.created_at: str | None = None - self.payload_sha256: str | None = None - self.snapshot: MetricsSnapshot | None = None - self.has_coverage_adoption_snapshot = False - self.api_surface_payload_sha256: str | None = None - self.api_surface_snapshot: ApiSurfaceSnapshot | None = None - self.is_embedded_in_clone_baseline = False - - def load( - self, - *, - max_size_bytes: int | None = None, - preloaded_payload: dict[str, object] | None = None, - ) -> None: - try: - exists = self.path.exists() - except OSError as e: - raise BaselineValidationError( - f"Cannot stat metrics baseline file at {self.path}: {e}", - status=MetricsBaselineStatus.INVALID_TYPE, - ) from e - if not exists: - return - - size_limit = ( - MAX_METRICS_BASELINE_SIZE_BYTES - if max_size_bytes is None - else max_size_bytes - ) - try: - file_size = self.path.stat().st_size - except OSError as e: - raise BaselineValidationError( - f"Cannot stat metrics baseline file at {self.path}: {e}", - status=MetricsBaselineStatus.INVALID_TYPE, - ) from e - if file_size > size_limit: - raise BaselineValidationError( - "Metrics baseline file is too large " - f"({file_size} bytes, max {size_limit} bytes) at {self.path}.", - status=MetricsBaselineStatus.TOO_LARGE, - ) - - if preloaded_payload is None: - payload = _load_json_object(self.path) - else: - if not isinstance(preloaded_payload, dict): - raise BaselineValidationError( - f"Metrics baseline payload must be an object at {self.path}", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - payload = preloaded_payload - _validate_top_level_structure(payload, path=self.path) - self.is_embedded_in_clone_baseline = "clones" in payload - - meta_obj = payload.get("meta") - metrics_obj = payload.get("metrics") - if not isinstance(meta_obj, dict): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {self.path}: " - "'meta' must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - if not isinstance(metrics_obj, dict): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {self.path}: " - "'metrics' must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - _validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path) - _validate_required_keys(metrics_obj, _METRICS_REQUIRED_KEYS, path=self.path) - _validate_exact_keys( - metrics_obj, - _METRICS_REQUIRED_KEYS | _METRICS_OPTIONAL_KEYS, - path=self.path, - ) - - generator_name, generator_version = _parse_generator(meta_obj, path=self.path) - schema_version = _require_str(meta_obj, "schema_version", path=self.path) - python_tag = _require_str(meta_obj, "python_tag", path=self.path) - created_at = _require_str(meta_obj, "created_at", path=self.path) - payload_sha256 = _extract_metrics_payload_sha256(meta_obj, path=self.path) - api_surface_payload_sha256 = _extract_optional_payload_sha256( - meta_obj, - key=_API_SURFACE_PAYLOAD_SHA256_KEY, - ) - - self.generator_name = generator_name - self.generator_version = generator_version - self.schema_version = schema_version - self.python_tag = python_tag - self.created_at = created_at - self.payload_sha256 = payload_sha256 - self.api_surface_payload_sha256 = 
api_surface_payload_sha256 - self.snapshot = _parse_snapshot(metrics_obj, path=self.path) - self.has_coverage_adoption_snapshot = _has_coverage_adoption_snapshot( - metrics_obj, - ) - self.api_surface_snapshot = _parse_api_surface_snapshot( - payload.get("api_surface"), - path=self.path, - root=self.path.parent, - ) - - def save(self) -> None: - if self.snapshot is None: - raise BaselineValidationError( - "Metrics baseline snapshot is missing.", - status=MetricsBaselineStatus.MISSING_FIELDS, - ) - payload = _build_payload( - snapshot=self.snapshot, - schema_version=self.schema_version or METRICS_BASELINE_SCHEMA_VERSION, - python_tag=self.python_tag or current_python_tag(), - generator_name=self.generator_name or METRICS_BASELINE_GENERATOR, - generator_version=self.generator_version or __version__, - created_at=self.created_at or _now_utc_z(), - include_adoption=self.has_coverage_adoption_snapshot, - api_surface_snapshot=self.api_surface_snapshot, - api_surface_root=self.path.parent, - ) - payload_meta = cast("Mapping[str, Any]", payload["meta"]) - payload_metrics_hash = _require_str( - payload_meta, - "payload_sha256", - path=self.path, - ) - payload_api_surface_hash = _optional_require_str( - payload_meta, - _API_SURFACE_PAYLOAD_SHA256_KEY, - path=self.path, - ) - existing: dict[str, Any] | None = None - try: - if self.path.exists(): - loaded = _load_json_object(self.path) - if "clones" in loaded: - existing = loaded - except BaselineValidationError as e: - raise BaselineValidationError( - f"Cannot read existing baseline file at {self.path}: {e}", - status=MetricsBaselineStatus.INVALID_JSON, - ) from e - - if existing is not None: - existing_meta, clones_obj = _require_embedded_clone_baseline_payload( - existing, path=self.path - ) - merged_schema_version = _resolve_embedded_schema_version( - existing_meta, path=self.path - ) - merged_meta = dict(existing_meta) - merged_meta["schema_version"] = merged_schema_version - merged_meta[_METRICS_PAYLOAD_SHA256_KEY] = payload_metrics_hash - if payload_api_surface_hash is None: - merged_meta.pop(_API_SURFACE_PAYLOAD_SHA256_KEY, None) - else: - merged_meta[_API_SURFACE_PAYLOAD_SHA256_KEY] = payload_api_surface_hash - merged_payload: dict[str, object] = { - "meta": merged_meta, - "clones": clones_obj, - "metrics": payload["metrics"], - } - api_surface_payload = payload.get("api_surface") - if api_surface_payload is not None: - merged_payload["api_surface"] = api_surface_payload - self.path.parent.mkdir(parents=True, exist_ok=True) - _atomic_write_json(self.path, merged_payload) - self.is_embedded_in_clone_baseline = True - self.schema_version = merged_schema_version - self.python_tag = _require_str(merged_meta, "python_tag", path=self.path) - self.created_at = _require_str(merged_meta, "created_at", path=self.path) - self.payload_sha256 = _require_str( - merged_meta, _METRICS_PAYLOAD_SHA256_KEY, path=self.path - ) - self.api_surface_payload_sha256 = _optional_require_str( - merged_meta, - _API_SURFACE_PAYLOAD_SHA256_KEY, - path=self.path, - ) - self.generator_name, self.generator_version = _parse_generator( - merged_meta, path=self.path - ) - return - - self.path.parent.mkdir(parents=True, exist_ok=True) - _atomic_write_json(self.path, payload) - self.is_embedded_in_clone_baseline = False - self.schema_version = _require_str( - payload_meta, "schema_version", path=self.path - ) - self.python_tag = _require_str(payload_meta, "python_tag", path=self.path) - self.created_at = _require_str(payload_meta, "created_at", path=self.path) - 
self.payload_sha256 = payload_metrics_hash - self.api_surface_payload_sha256 = payload_api_surface_hash - - def verify_compatibility(self, *, runtime_python_tag: str) -> None: - if self.generator_name != METRICS_BASELINE_GENERATOR: - raise BaselineValidationError( - "Metrics baseline generator mismatch: expected 'codeclone'.", - status=MetricsBaselineStatus.GENERATOR_MISMATCH, - ) - expected_schema = ( - BASELINE_SCHEMA_VERSION - if self.is_embedded_in_clone_baseline - else METRICS_BASELINE_SCHEMA_VERSION - ) - if not _is_compatible_metrics_schema( - baseline_version=self.schema_version, - expected_version=expected_schema, - ): - raise BaselineValidationError( - "Metrics baseline schema version mismatch: " - f"baseline={self.schema_version}, " - f"expected={expected_schema}.", - status=MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION, - ) - if self.python_tag != runtime_python_tag: - raise BaselineValidationError( - "Metrics baseline python tag mismatch: " - f"baseline={self.python_tag}, current={runtime_python_tag}.", - status=MetricsBaselineStatus.MISMATCH_PYTHON_VERSION, - ) - self.verify_integrity() - - def verify_integrity(self) -> None: - if self.snapshot is None: - raise BaselineValidationError( - "Metrics baseline snapshot is missing.", - status=MetricsBaselineStatus.MISSING_FIELDS, - ) - if not isinstance(self.payload_sha256, str): - raise BaselineValidationError( - "Metrics baseline integrity payload hash is missing.", - status=MetricsBaselineStatus.INTEGRITY_MISSING, - ) - if len(self.payload_sha256) != 64: - raise BaselineValidationError( - "Metrics baseline integrity payload hash is missing.", - status=MetricsBaselineStatus.INTEGRITY_MISSING, - ) - expected = _compute_payload_sha256( - self.snapshot, - include_adoption=self.has_coverage_adoption_snapshot, - ) - if not hmac.compare_digest(self.payload_sha256, expected): - raise BaselineValidationError( - "Metrics baseline integrity check failed: payload_sha256 mismatch.", - status=MetricsBaselineStatus.INTEGRITY_FAILED, - ) - if self.api_surface_snapshot is not None: - if ( - not isinstance(self.api_surface_payload_sha256, str) - or len(self.api_surface_payload_sha256) != 64 - ): - raise BaselineValidationError( - "Metrics baseline API surface integrity payload hash is missing.", - status=MetricsBaselineStatus.INTEGRITY_MISSING, - ) - expected_api = _compute_api_surface_payload_sha256( - self.api_surface_snapshot, - root=self.path.parent, - ) - legacy_absolute_expected_api = _compute_api_surface_payload_sha256( - self.api_surface_snapshot - ) - legacy_expected_api = _compute_legacy_api_surface_payload_sha256( - self.api_surface_snapshot, - root=self.path.parent, - ) - legacy_absolute_qualname_expected_api = ( - _compute_legacy_api_surface_payload_sha256(self.api_surface_snapshot) - ) - if not ( - hmac.compare_digest(self.api_surface_payload_sha256, expected_api) - or hmac.compare_digest( - self.api_surface_payload_sha256, - legacy_absolute_expected_api, - ) - or hmac.compare_digest( - self.api_surface_payload_sha256, - legacy_expected_api, - ) - or hmac.compare_digest( - self.api_surface_payload_sha256, - legacy_absolute_qualname_expected_api, - ) - ): - raise BaselineValidationError( - "Metrics baseline integrity check failed: " - "api_surface payload_sha256 mismatch.", - status=MetricsBaselineStatus.INTEGRITY_FAILED, - ) - - @staticmethod - def from_project_metrics( - *, - project_metrics: ProjectMetrics, - path: str | Path, - schema_version: str | None = None, - python_tag: str | None = None, - generator_version: str | None = 
None, - include_adoption: bool = True, - include_api_surface: bool = True, - ) -> MetricsBaseline: - baseline = MetricsBaseline(path) - baseline.generator_name = METRICS_BASELINE_GENERATOR - baseline.generator_version = generator_version or __version__ - baseline.schema_version = schema_version or METRICS_BASELINE_SCHEMA_VERSION - baseline.python_tag = python_tag or current_python_tag() - baseline.created_at = _now_utc_z() - baseline.snapshot = snapshot_from_project_metrics(project_metrics) - baseline.payload_sha256 = _compute_payload_sha256( - baseline.snapshot, - include_adoption=include_adoption, - ) - baseline.has_coverage_adoption_snapshot = include_adoption - baseline.api_surface_snapshot = ( - project_metrics.api_surface if include_api_surface else None - ) - baseline.api_surface_payload_sha256 = ( - _compute_api_surface_payload_sha256( - baseline.api_surface_snapshot, - root=baseline.path.parent, - ) - if baseline.api_surface_snapshot is not None - else None - ) - return baseline - - def diff(self, current: ProjectMetrics) -> MetricsDiff: - if self.snapshot is None: - snapshot = MetricsSnapshot( - max_complexity=0, - high_risk_functions=(), - max_coupling=0, - high_coupling_classes=(), - max_cohesion=0, - low_cohesion_classes=(), - dependency_cycles=(), - dependency_max_depth=0, - dead_code_items=(), - health_score=0, - health_grade="F", - typing_param_permille=0, - typing_return_permille=0, - docstring_permille=0, - typing_any_count=0, - ) - else: - snapshot = self.snapshot - - current_snapshot = snapshot_from_project_metrics(current) - - new_high_risk_functions = tuple( - sorted( - set(current_snapshot.high_risk_functions) - - set(snapshot.high_risk_functions) - ) - ) - new_high_coupling_classes = tuple( - sorted( - set(current_snapshot.high_coupling_classes) - - set(snapshot.high_coupling_classes) - ) - ) - new_cycles = tuple( - sorted( - set(current_snapshot.dependency_cycles) - - set(snapshot.dependency_cycles) - ) - ) - new_dead_code = tuple( - sorted( - set(current_snapshot.dead_code_items) - set(snapshot.dead_code_items) - ) - ) - added_api_symbols: tuple[str, ...] - api_breaking_changes: tuple[ApiBreakingChange, ...] 
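Each regression list computed just above follows the same pattern: the set difference of current findings against the baseline snapshot, sorted back into a tuple so report output stays deterministic across runs. A condensed, runnable sketch of that pattern (function name is illustrative):

```python
# "New findings vs. baseline": only items present now but absent from the
# baseline count as regressions; items that disappeared are simply dropped.
def new_items(baseline: tuple[str, ...], current: tuple[str, ...]) -> tuple[str, ...]:
    return tuple(sorted(set(current) - set(baseline)))


assert new_items(("a.f", "b.g"), ("b.g", "c.h")) == ("c.h",)
# "a.f" was fixed since the baseline was recorded, so nothing gates on it.
```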
- if self.api_surface_snapshot is None: - added_api_symbols = () - api_breaking_changes = () - else: - added_api_symbols, api_breaking_changes = compare_api_surfaces( - baseline=self.api_surface_snapshot, - current=current.api_surface, - strict_types=False, - ) - - return MetricsDiff( - new_high_risk_functions=new_high_risk_functions, - new_high_coupling_classes=new_high_coupling_classes, - new_cycles=new_cycles, - new_dead_code=new_dead_code, - health_delta=current_snapshot.health_score - snapshot.health_score, - typing_param_permille_delta=( - current_snapshot.typing_param_permille - snapshot.typing_param_permille - ), - typing_return_permille_delta=( - current_snapshot.typing_return_permille - - snapshot.typing_return_permille - ), - docstring_permille_delta=( - current_snapshot.docstring_permille - snapshot.docstring_permille - ), - new_api_symbols=added_api_symbols, - new_api_breaking_changes=api_breaking_changes, - ) - - -def _is_compatible_metrics_schema( - *, - baseline_version: str | None, - expected_version: str, -) -> bool: - if baseline_version is None: - return False - baseline_major_minor = _parse_major_minor(baseline_version) - expected_major_minor = _parse_major_minor(expected_version) - if baseline_major_minor is None or expected_major_minor is None: - return baseline_version == expected_version - baseline_major, baseline_minor = baseline_major_minor - expected_major, expected_minor = expected_major_minor - return baseline_major == expected_major and baseline_minor <= expected_minor - - -def _has_coverage_adoption_snapshot(metrics_obj: Mapping[str, object]) -> bool: - return all( - key in metrics_obj - for key in ( - "typing_param_permille", - "typing_return_permille", - "docstring_permille", - ) - ) - - -def _parse_major_minor(version: str) -> tuple[int, int] | None: - parts = version.split(".") - if len(parts) != 2 or not all(part.isdigit() for part in parts): - return None - return int(parts[0]), int(parts[1]) - - -def _atomic_write_json(path: Path, payload: dict[str, object]) -> None: - _write_json_document_atomically( - path, - payload, - indent=True, - trailing_newline=True, - ) - - -def _load_json_object(path: Path) -> dict[str, Any]: - try: - return _read_json_object(path) - except OSError as e: - raise BaselineValidationError( - f"Cannot read metrics baseline file at {path}: {e}", - status=MetricsBaselineStatus.INVALID_JSON, - ) from e - except JSONDecodeError as e: - raise BaselineValidationError( - f"Corrupted metrics baseline file at {path}: {e}", - status=MetricsBaselineStatus.INVALID_JSON, - ) from e - except TypeError: - raise BaselineValidationError( - f"Metrics baseline payload must be an object at {path}", - status=MetricsBaselineStatus.INVALID_TYPE, - ) from None - - -def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None: - validate_top_level_structure( - payload, - path=path, - required_keys=_TOP_LEVEL_REQUIRED_KEYS, - allowed_keys=_TOP_LEVEL_ALLOWED_KEYS, - schema_label="metrics baseline", - missing_status=MetricsBaselineStatus.MISSING_FIELDS, - extra_status=MetricsBaselineStatus.INVALID_TYPE, - ) - - -def _validate_required_keys( - payload: Mapping[str, Any], - required: frozenset[str], - *, - path: Path, -) -> None: - missing = required - set(payload.keys()) - if missing: - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: missing required fields: {', '.join(sorted(missing))}", - status=MetricsBaselineStatus.MISSING_FIELDS, - ) - - -def _validate_exact_keys( - payload: Mapping[str, Any], - 
required: frozenset[str], - *, - path: Path, -) -> None: - extra = set(payload.keys()) - set(required) - if extra: - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: unexpected fields: {', '.join(sorted(extra))}", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - -def _require_str(payload: Mapping[str, Any], key: str, *, path: Path) -> str: - value = payload.get(key) - if isinstance(value, str): - return value - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: {key!r} must be str", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - -def _extract_metrics_payload_sha256( - payload: Mapping[str, Any], - *, - path: Path, -) -> str: - direct = payload.get(_METRICS_PAYLOAD_SHA256_KEY) - if isinstance(direct, str): - return direct - return _require_str(payload, "payload_sha256", path=path) - - -def _extract_optional_payload_sha256( - payload: Mapping[str, Any], - *, - key: str, -) -> str | None: - value = payload.get(key) - return value if isinstance(value, str) else None - - -def _require_int(payload: Mapping[str, Any], key: str, *, path: Path) -> int: - value = payload.get(key) - if isinstance(value, bool): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: {key!r} must be int", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - if isinstance(value, int): - return value - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: {key!r} must be int", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - -def _optional_require_str( - payload: Mapping[str, Any], - key: str, - *, - path: Path, -) -> str | None: - value = payload.get(key) - if value is None: - return None - if isinstance(value, str): - return value - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: {key!r} must be str", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - -def _require_str_list(payload: Mapping[str, Any], key: str, *, path: Path) -> list[str]: - value = payload.get(key) - if not isinstance(value, list): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - if not all(isinstance(item, str) for item in value): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - return value - - -def _parse_cycles( - payload: Mapping[str, Any], - *, - key: str, - path: Path, -) -> tuple[tuple[str, ...], ...]: - value = payload.get(key) - if not isinstance(value, list): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: {key!r} must be list", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - cycles: list[tuple[str, ...]] = [] - for cycle in value: - if not isinstance(cycle, list): - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: {key!r} cycle item must be list[str]", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - if not all(isinstance(item, str) for item in cycle): - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: {key!r} cycle item must be list[str]", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - cycles.append(tuple(cycle)) - return tuple(sorted(set(cycles))) - - -def _parse_generator( - meta: Mapping[str, Any], - *, - path: Path, -) -> tuple[str, str | None]: - generator = meta.get("generator") - if isinstance(generator, str): - version_value = meta.get("generator_version") 
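One detail in the validator family above is worth spelling out: `_require_int` rejects `bool` before accepting `int` because `bool` is a subclass of `int` in Python, so a bare `isinstance(value, int)` check would silently accept `true`/`false` from the JSON. A self-contained sketch of the guard (simplified error handling, illustrative name):

```python
# isinstance(True, int) is True, so an int-typed schema field must reject
# bool explicitly or a JSON boolean would pass as a valid count.
from typing import Any


def require_int(payload: dict[str, Any], key: str) -> int:
    value = payload.get(key)
    if isinstance(value, bool) or not isinstance(value, int):
        raise ValueError(f"{key!r} must be int")
    return value


assert require_int({"health_score": 87}, "health_score") == 87
# require_int({"health_score": True}, "health_score")  # raises ValueError
```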
- if version_value is None: - version_value = meta.get("codeclone_version") - if version_value is None: - return generator, None - if not isinstance(version_value, str): - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: generator_version must be str", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - return generator, version_value - - if isinstance(generator, dict): - allowed_keys = {"name", "version"} - extra = set(generator.keys()) - allowed_keys - if extra: - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - f"unexpected generator keys: {', '.join(sorted(extra))}", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - name = generator.get("name") - version = generator.get("version") - if not isinstance(name, str): - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: generator.name must be str", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - if version is not None and not isinstance(version, str): - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: generator.version must be str", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - return name, version if isinstance(version, str) else None - - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: generator must be object or str", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - -def _require_embedded_clone_baseline_payload( - payload: Mapping[str, Any], - *, - path: Path, -) -> tuple[dict[str, Any], dict[str, Any]]: - meta_obj = payload.get("meta") - clones_obj = payload.get("clones") - if not isinstance(meta_obj, dict): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: 'meta' must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - if not isinstance(clones_obj, dict): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: 'clones' must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - _require_str(meta_obj, "payload_sha256", path=path) - _require_str(meta_obj, "python_tag", path=path) - _require_str(meta_obj, "created_at", path=path) - functions = clones_obj.get("functions") - blocks = clones_obj.get("blocks") - if not isinstance(functions, list) or not all( - isinstance(item, str) for item in functions - ): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: 'clones.functions' must be list[str]", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - if not isinstance(blocks, list) or not all( - isinstance(item, str) for item in blocks - ): - raise BaselineValidationError( - f"Invalid baseline schema at {path}: 'clones.blocks' must be list[str]", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - return meta_obj, clones_obj - - -def _resolve_embedded_schema_version(meta: Mapping[str, Any], *, path: Path) -> str: - raw_version = _require_str(meta, "schema_version", path=path) - parts = raw_version.split(".") - if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts): - raise BaselineValidationError( - "Invalid baseline schema at " - f"{path}: 'schema_version' must be semver string", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - major = int(parts[0]) - if major >= 2: - return raw_version - return BASELINE_SCHEMA_VERSION - - -def _parse_snapshot( - payload: Mapping[str, Any], - *, - path: Path, -) -> MetricsSnapshot: - grade = _require_str(payload, "health_grade", path=path) - if grade not in {"A", "B", "C", "D", "F"}: - raise BaselineValidationError( - "Invalid metrics baseline 
schema at " - f"{path}: 'health_grade' must be one of A/B/C/D/F", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - - return MetricsSnapshot( - max_complexity=_require_int(payload, "max_complexity", path=path), - high_risk_functions=tuple( - sorted(set(_require_str_list(payload, "high_risk_functions", path=path))) - ), - max_coupling=_require_int(payload, "max_coupling", path=path), - high_coupling_classes=tuple( - sorted(set(_require_str_list(payload, "high_coupling_classes", path=path))) - ), - max_cohesion=_require_int(payload, "max_cohesion", path=path), - low_cohesion_classes=tuple( - sorted(set(_require_str_list(payload, "low_cohesion_classes", path=path))) - ), - dependency_cycles=_parse_cycles(payload, key="dependency_cycles", path=path), - dependency_max_depth=_require_int(payload, "dependency_max_depth", path=path), - dead_code_items=tuple( - sorted(set(_require_str_list(payload, "dead_code_items", path=path))) - ), - health_score=_require_int(payload, "health_score", path=path), - health_grade=cast("Literal['A', 'B', 'C', 'D', 'F']", grade), - typing_param_permille=_optional_int( - payload, - "typing_param_permille", - path=path, - ), - typing_return_permille=_optional_int( - payload, - "typing_return_permille", - path=path, - ), - docstring_permille=_optional_int(payload, "docstring_permille", path=path), - typing_any_count=_optional_int(payload, "typing_any_count", path=path), - ) - - -def _optional_int(payload: Mapping[str, Any], key: str, *, path: Path) -> int: - value = payload.get(key) - if value is None: - return 0 - return _require_int(payload, key, path=path) - - -def _parse_api_surface_snapshot( - payload: object, - *, - path: Path, - root: Path | None = None, -) -> ApiSurfaceSnapshot | None: - if payload is None: - return None - if not isinstance(payload, dict): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: 'api_surface' must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - raw_modules = payload.get("modules", []) - if not isinstance(raw_modules, list): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - "'api_surface.modules' must be list", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - modules: list[ModuleApiSurface] = [] - for raw_module in raw_modules: - if not isinstance(raw_module, dict): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - "api surface module must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - module = _require_str(raw_module, "module", path=path) - wire_filepath = _require_str(raw_module, "filepath", path=path) - filepath = runtime_filepath_from_wire(wire_filepath, root=root) - all_declared = _require_str_list_or_none(raw_module, "all_declared", path=path) - raw_symbols = raw_module.get("symbols", []) - if not isinstance(raw_symbols, list): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - "api surface symbols must be list", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - symbols: list[PublicSymbol] = [] - for raw_symbol in raw_symbols: - if not isinstance(raw_symbol, dict): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - "api surface symbol must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - local_name = _optional_require_str(raw_symbol, "local_name", path=path) - legacy_qualname = _optional_require_str(raw_symbol, "qualname", path=path) - if local_name is None and legacy_qualname is None: - raise BaselineValidationError( - 
f"Invalid metrics baseline schema at {path}: " - "api surface symbol requires 'local_name' or 'qualname'", - status=MetricsBaselineStatus.MISSING_FIELDS, - ) - if local_name is None: - assert legacy_qualname is not None - qualname = legacy_qualname - else: - qualname = _compose_api_surface_qualname( - module=module, - local_name=local_name, - ) - kind = _require_str(raw_symbol, "kind", path=path) - exported_via = _require_str(raw_symbol, "exported_via", path=path) - params_raw = raw_symbol.get("params", []) - if not isinstance(params_raw, list): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - "api surface params must be list", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - params: list[ApiParamSpec] = [] - for raw_param in params_raw: - if not isinstance(raw_param, dict): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - "api param must be object", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - name = _require_str(raw_param, "name", path=path) - param_kind = _require_str(raw_param, "kind", path=path) - has_default = raw_param.get("has_default") - annotation_hash = _optional_require_str( - raw_param, - "annotation_hash", - path=path, - ) - if not isinstance(has_default, bool): - raise BaselineValidationError( - f"Invalid metrics baseline schema at {path}: " - "api param 'has_default' must be bool", - status=MetricsBaselineStatus.INVALID_TYPE, - ) - params.append( - ApiParamSpec( - name=name, - kind=cast( - ( - "Literal['pos_only', 'pos_or_kw', " - "'vararg', 'kw_only', 'kwarg']" - ), - param_kind, - ), - has_default=has_default, - annotation_hash=annotation_hash or "", - ) - ) - symbols.append( - PublicSymbol( - qualname=qualname, - kind=cast( - "Literal['function', 'class', 'method', 'constant']", - kind, - ), - start_line=_require_int(raw_symbol, "start_line", path=path), - end_line=_require_int(raw_symbol, "end_line", path=path), - params=tuple(params), - returns_hash=_optional_require_str( - raw_symbol, - "returns_hash", - path=path, - ) - or "", - exported_via=cast("Literal['all', 'name']", exported_via), - ) - ) - modules.append( - ModuleApiSurface( - module=module, - filepath=filepath, - symbols=tuple(sorted(symbols, key=lambda item: item.qualname)), - all_declared=tuple(all_declared) if all_declared is not None else None, - ) - ) - return ApiSurfaceSnapshot( - modules=tuple(sorted(modules, key=lambda item: (item.filepath, item.module))) - ) - - -def _require_str_list_or_none( - payload: Mapping[str, Any], - key: str, - *, - path: Path, -) -> list[str] | None: - value = payload.get(key) - if value is None: - return None - return _require_str_list(payload, key, path=path) - - -def _api_surface_snapshot_payload( - snapshot: ApiSurfaceSnapshot, - *, - root: Path | None = None, - legacy_qualname: bool = False, -) -> dict[str, object]: - return { - "modules": [ - { - "module": module.module, - "filepath": wire_filepath_from_runtime(module.filepath, root=root), - "all_declared": list(module.all_declared or ()), - "symbols": [ - { - ("qualname" if legacy_qualname else "local_name"): ( - symbol.qualname - if legacy_qualname - else _local_name_from_qualname( - module=module.module, - qualname=symbol.qualname, - ) - ), - "kind": symbol.kind, - "start_line": symbol.start_line, - "end_line": symbol.end_line, - "params": [ - { - "name": param.name, - "kind": param.kind, - "has_default": param.has_default, - "annotation_hash": param.annotation_hash, - } - for param in symbol.params - ], - "returns_hash": 
symbol.returns_hash, - "exported_via": symbol.exported_via, - } - for symbol in sorted( - module.symbols, - key=lambda item: item.qualname, - ) - ], - } - for module in sorted( - snapshot.modules, - key=lambda item: (item.filepath, item.module), - ) - ] - } - - -def _compute_api_surface_payload_sha256( - snapshot: ApiSurfaceSnapshot, - *, - root: Path | None = None, -) -> str: - canonical = _canonical_json(_api_surface_snapshot_payload(snapshot, root=root)) - return hashlib.sha256(canonical.encode("utf-8")).hexdigest() - - -def _compute_legacy_api_surface_payload_sha256( - snapshot: ApiSurfaceSnapshot, - *, - root: Path | None = None, -) -> str: - canonical = _canonical_json( - _api_surface_snapshot_payload(snapshot, root=root, legacy_qualname=True) - ) - return hashlib.sha256(canonical.encode("utf-8")).hexdigest() - - -def _compose_api_surface_qualname(*, module: str, local_name: str) -> str: - return f"{module}:{local_name}" - - -def _local_name_from_qualname(*, module: str, qualname: str) -> str: - prefix = f"{module}:" - if qualname.startswith(prefix): - return qualname[len(prefix) :] - return qualname - - -def _build_payload( - *, - snapshot: MetricsSnapshot, - schema_version: str, - python_tag: str, - generator_name: str, - generator_version: str, - created_at: str, - include_adoption: bool = True, - api_surface_snapshot: ApiSurfaceSnapshot | None = None, - api_surface_root: Path | None = None, -) -> dict[str, Any]: - payload_sha256 = _compute_payload_sha256( - snapshot, - include_adoption=include_adoption, - ) - payload: dict[str, Any] = { - "meta": { - "generator": { - "name": generator_name, - "version": generator_version, - }, - "schema_version": schema_version, - "python_tag": python_tag, - "created_at": created_at, - "payload_sha256": payload_sha256, - }, - "metrics": _snapshot_payload( - snapshot, - include_adoption=include_adoption, - ), - } - if api_surface_snapshot is not None: - payload["meta"][_API_SURFACE_PAYLOAD_SHA256_KEY] = ( - _compute_api_surface_payload_sha256( - api_surface_snapshot, - root=api_surface_root, - ) - ) - payload["api_surface"] = _api_surface_snapshot_payload( - api_surface_snapshot, - root=api_surface_root, - ) - return payload - - -__all__ = [ - "BASELINE_SCHEMA_VERSION", - "MAX_METRICS_BASELINE_SIZE_BYTES", - "METRICS_BASELINE_GENERATOR", - "METRICS_BASELINE_SCHEMA_VERSION", - "METRICS_BASELINE_UNTRUSTED_STATUSES", - "MetricsBaseline", - "MetricsBaselineStatus", - "coerce_metrics_baseline_status", - "current_python_tag", - "snapshot_from_project_metrics", -] diff --git a/codeclone/models.py b/codeclone/models.py index 4814fc1..26cd95e 100644 --- a/codeclone/models.py +++ b/codeclone/models.py @@ -93,6 +93,8 @@ class DepGraph: edges: tuple[ModuleDep, ...] cycles: tuple[tuple[str, ...], ...] max_depth: int + avg_depth: float + p95_depth: int longest_chains: tuple[tuple[str, ...], ...] @@ -117,6 +119,42 @@ class DeadCandidate: suppressed_rules: tuple[str, ...] 
= field(default_factory=tuple) +SecuritySurfaceCategory = Literal[ + "archive_extraction", + "crypto_transport", + "database_boundary", + "deserialization", + "dynamic_execution", + "dynamic_loading", + "filesystem_mutation", + "identity_token", + "network_boundary", + "process_boundary", +] +SecuritySurfaceLocationScope = Literal["module", "class", "callable"] +SecuritySurfaceClassificationMode = Literal[ + "exact_builtin", + "exact_call", + "exact_import", +] +SecuritySurfaceEvidenceKind = Literal["builtin", "call", "import"] + + +@dataclass(frozen=True, slots=True) +class SecuritySurface: + category: SecuritySurfaceCategory + capability: str + module: str + filepath: str + qualname: str + start_line: int + end_line: int + location_scope: SecuritySurfaceLocationScope + classification_mode: SecuritySurfaceClassificationMode + evidence_kind: SecuritySurfaceEvidenceKind + evidence_symbol: str + + @dataclass(frozen=True, slots=True) class FileMetrics: class_metrics: tuple[ClassMetrics, ...] @@ -125,6 +163,7 @@ class FileMetrics: referenced_names: frozenset[str] import_names: frozenset[str] class_names: frozenset[str] + security_surfaces: tuple[SecuritySurface, ...] = () referenced_qualnames: frozenset[str] = field(default_factory=frozenset) typing_coverage: ModuleTypingCoverage | None = None docstring_coverage: ModuleDocstringCoverage | None = None diff --git a/codeclone/paths.py b/codeclone/paths/__init__.py similarity index 98% rename from codeclone/paths.py rename to codeclone/paths/__init__.py index d93428f..f12522a 100644 --- a/codeclone/paths.py +++ b/codeclone/paths/__init__.py @@ -8,7 +8,7 @@ from pathlib import Path -from .domain.source_scope import ( +from ..domain.source_scope import ( SOURCE_KIND_FIXTURES, SOURCE_KIND_OTHER, SOURCE_KIND_PRODUCTION, diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py deleted file mode 100644 index 50d2c58..0000000 --- a/codeclone/pipeline.py +++ /dev/null @@ -1,2773 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
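Before the long `pipeline.py` deletion continues, a note on the `SecuritySurface` model added to `codeclone/models.py` above: the `Literal` aliases pin the category, scope, and evidence taxonomies at type-check time, and the frozen, slotted dataclass keeps findings hashable and cheap. A trimmed sketch using two of the categories from the diff (the example finding itself is invented):

```python
# Trimmed illustration of the SecuritySurface shape added above; only two
# of the ten categories are shown, and the finding values are made up.
from dataclasses import dataclass
from typing import Literal

SecuritySurfaceCategory = Literal["dynamic_execution", "network_boundary"]


@dataclass(frozen=True, slots=True)
class SecuritySurface:
    category: SecuritySurfaceCategory
    capability: str
    qualname: str
    start_line: int


finding = SecuritySurface(
    category="dynamic_execution",
    capability="eval",
    qualname="pkg.mod:load_config",
    start_line=42,
)
# frozen=True makes findings usable in sets for deduplication;
# slots=True keeps per-finding memory overhead low across large scans.
```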
-# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -import inspect -import os -from collections.abc import Mapping -from concurrent.futures import ProcessPoolExecutor, as_completed -from dataclasses import dataclass -from hashlib import sha256 -from pathlib import Path -from typing import TYPE_CHECKING, Literal, cast - -import orjson - -from ._coerce import as_int, as_str -from .cache import ( - ApiParamSpecDict, - Cache, - CacheEntry, - ClassMetricsDict, - DeadCandidateDict, - FileStat, - ModuleDepDict, - PublicSymbolDict, - SegmentReportProjection, - SourceStatsDict, - StructuralFindingGroupDict, - file_stat_signature, -) -from .contracts import ExitCode -from .domain.findings import CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING -from .domain.quality import CONFIDENCE_HIGH, RISK_HIGH, RISK_LOW -from .extractor import extract_units_and_stats_from_source -from .golden_fixtures import ( - build_suppressed_clone_groups, - split_clone_groups_for_golden_fixtures, -) -from .grouping import build_block_groups, build_groups, build_segment_groups -from .metrics import ( - CoverageJoinParseError, - HealthInputs, - build_coverage_join, - build_dep_graph, - build_overloaded_modules_payload, - compute_health, - find_suppressed_unused, - find_unused, -) -from .models import ( - ApiBreakingChange, - ApiParamSpec, - ApiSurfaceSnapshot, - BlockUnit, - ClassMetrics, - CoverageJoinResult, - DeadCandidate, - DeadItem, - DepGraph, - FileMetrics, - GroupItem, - GroupItemLike, - GroupMap, - MetricsDiff, - ModuleApiSurface, - ModuleDep, - ModuleDocstringCoverage, - ModuleTypingCoverage, - ProjectMetrics, - PublicSymbol, - SegmentUnit, - StructuralFindingGroup, - StructuralFindingOccurrence, - Suggestion, - SuppressedCloneGroup, - Unit, -) -from .normalize import NormalizationConfig -from .paths import is_test_filepath -from .report.blocks import prepare_block_report_groups -from .report.explain import build_block_group_facts -from .report.json_contract import build_report_document -from .report.segments import prepare_segment_report_groups -from .report.serialize import render_json_report_document, render_text_report_document -from .report.suggestions import generate_suggestions -from .scanner import iter_py_files, module_name_from_path -from .structural_findings import build_clone_cohort_structural_findings -from .suppressions import DEAD_CODE_RULE_ID, INLINE_CODECLONE_SUPPRESSION_SOURCE - -if TYPE_CHECKING: - from argparse import Namespace - from collections.abc import Callable, Collection, Mapping, Sequence - -MAX_FILE_SIZE = 10 * 1024 * 1024 -DEFAULT_BATCH_SIZE = 100 -PARALLEL_MIN_FILES_PER_WORKER = 8 -PARALLEL_MIN_FILES_FLOOR = 16 -DEFAULT_RUNTIME_PROCESSES = 4 - -_as_int = as_int -_as_str = as_str - - -@dataclass(frozen=True, slots=True) -class OutputPaths: - html: Path | None = None - json: Path | None = None - text: Path | None = None - md: Path | None = None - sarif: Path | None = None - - -@dataclass(frozen=True, slots=True) -class BootstrapResult: - root: Path - config: NormalizationConfig - args: Namespace - output_paths: OutputPaths - cache_path: Path - - -@dataclass(frozen=True, slots=True) -class DiscoveryResult: - files_found: int - cache_hits: int - files_skipped: int - all_file_paths: tuple[str, ...] - cached_units: tuple[GroupItem, ...] - cached_blocks: tuple[GroupItem, ...] - cached_segments: tuple[GroupItem, ...] - cached_class_metrics: tuple[ClassMetrics, ...] - cached_module_deps: tuple[ModuleDep, ...] 
- cached_dead_candidates: tuple[DeadCandidate, ...] - cached_referenced_names: frozenset[str] - files_to_process: tuple[str, ...] - skipped_warnings: tuple[str, ...] - cached_referenced_qualnames: frozenset[str] = frozenset() - cached_typing_modules: tuple[ModuleTypingCoverage, ...] = () - cached_docstring_modules: tuple[ModuleDocstringCoverage, ...] = () - cached_api_modules: tuple[ModuleApiSurface, ...] = () - cached_structural_findings: tuple[StructuralFindingGroup, ...] = () - cached_segment_report_projection: SegmentReportProjection | None = None - cached_lines: int = 0 - cached_functions: int = 0 - cached_methods: int = 0 - cached_classes: int = 0 - cached_source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = () - - -@dataclass(frozen=True, slots=True) -class FileProcessResult: - filepath: str - success: bool - error: str | None = None - units: list[Unit] | None = None - blocks: list[BlockUnit] | None = None - segments: list[SegmentUnit] | None = None - lines: int = 0 - functions: int = 0 - methods: int = 0 - classes: int = 0 - stat: FileStat | None = None - error_kind: str | None = None - file_metrics: FileMetrics | None = None - structural_findings: list[StructuralFindingGroup] | None = None - - -@dataclass(frozen=True, slots=True) -class ProcessingResult: - units: tuple[GroupItem, ...] - blocks: tuple[GroupItem, ...] - segments: tuple[GroupItem, ...] - class_metrics: tuple[ClassMetrics, ...] - module_deps: tuple[ModuleDep, ...] - dead_candidates: tuple[DeadCandidate, ...] - referenced_names: frozenset[str] - files_analyzed: int - files_skipped: int - analyzed_lines: int - analyzed_functions: int - analyzed_methods: int - analyzed_classes: int - failed_files: tuple[str, ...] - source_read_failures: tuple[str, ...] - referenced_qualnames: frozenset[str] = frozenset() - typing_modules: tuple[ModuleTypingCoverage, ...] = () - docstring_modules: tuple[ModuleDocstringCoverage, ...] = () - api_modules: tuple[ModuleApiSurface, ...] = () - structural_findings: tuple[StructuralFindingGroup, ...] = () - source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = () - - -@dataclass(frozen=True, slots=True) -class AnalysisResult: - func_groups: GroupMap - block_groups: GroupMap - block_groups_report: GroupMap - segment_groups: GroupMap - suppressed_segment_groups: int - block_group_facts: dict[str, dict[str, str]] - func_clones_count: int - block_clones_count: int - segment_clones_count: int - files_analyzed_or_cached: int - project_metrics: ProjectMetrics | None - metrics_payload: dict[str, object] | None - suggestions: tuple[Suggestion, ...] - segment_groups_raw_digest: str - suppressed_clone_groups: tuple[SuppressedCloneGroup, ...] = () - coverage_join: CoverageJoinResult | None = None - suppressed_dead_code_items: int = 0 - structural_findings: tuple[StructuralFindingGroup, ...] = () - - -@dataclass(frozen=True, slots=True) -class GatingResult: - exit_code: int - reasons: tuple[str, ...] 
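The `GatingResult` shape above captures the contract of the CI gate: a process exit code plus the human-readable reasons that produced it. A condensed sketch of how threshold checks fold into that pair (thresholds and messages here are illustrative, not the deleted pipeline's exact gates):

```python
# Each gate either contributes a reason or passes; the exit code is
# nonzero only when at least one gate fired.
def evaluate_gates(
    health_score: int,
    new_clones: int,
    *,
    fail_health: int = 60,
) -> tuple[int, tuple[str, ...]]:
    reasons: list[str] = []
    if health_score < fail_health:
        reasons.append(f"health score {health_score} is below {fail_health}")
    if new_clones > 0:
        reasons.append(f"{new_clones} new clone group(s) vs. baseline")
    return (1 if reasons else 0, tuple(reasons))


code, why = evaluate_gates(health_score=54, new_clones=2)
assert code == 1 and len(why) == 2
```

Returning the reasons alongside the code keeps rendering concerns out of the gate itself: the CLI can print them, while the SARIF or JSON writers can embed them verbatim.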
- - -@dataclass(frozen=True, slots=True) -class ReportArtifacts: - html: str | None = None - json: str | None = None - text: str | None = None - md: str | None = None - sarif: str | None = None - report_document: dict[str, object] | None = None - - -@dataclass(frozen=True, slots=True) -class MetricGateConfig: - fail_complexity: int - fail_coupling: int - fail_cohesion: int - fail_cycles: bool - fail_dead_code: bool - fail_health: int - fail_on_new_metrics: bool - fail_on_typing_regression: bool = False - fail_on_docstring_regression: bool = False - fail_on_api_break: bool = False - fail_on_untested_hotspots: bool = False - min_typing_coverage: int = -1 - min_docstring_coverage: int = -1 - coverage_min: int = 50 - - -def _as_sorted_str_tuple(value: object) -> tuple[str, ...]: - if not isinstance(value, list): - return () - return tuple(sorted({item for item in value if isinstance(item, str) and item})) - - -def _group_item_sort_key(item: GroupItemLike) -> tuple[str, int, int, str]: - return ( - _as_str(item.get("filepath")), - _as_int(item.get("start_line")), - _as_int(item.get("end_line")), - _as_str(item.get("qualname")), - ) - - -def _segment_projection_item_sort_key(item: GroupItemLike) -> tuple[str, str, int, int]: - return ( - _as_str(item.get("filepath")), - _as_str(item.get("qualname")), - _as_int(item.get("start_line")), - _as_int(item.get("end_line")), - ) - - -def _segment_groups_digest(segment_groups: GroupMap) -> str: - normalized_rows: list[ - tuple[str, tuple[tuple[str, str, int, int, int, str, str], ...]] - ] = [] - for group_key in sorted(segment_groups): - items = sorted(segment_groups[group_key], key=_segment_projection_item_sort_key) - normalized_items: list[tuple[str, str, int, int, int, str, str]] = [ - ( - _as_str(item.get("filepath")), - _as_str(item.get("qualname")), - _as_int(item.get("start_line")), - _as_int(item.get("end_line")), - _as_int(item.get("size")), - _as_str(item.get("segment_hash")), - _as_str(item.get("segment_sig")), - ) - for item in items - ] - normalized_rows.append((group_key, tuple(normalized_items))) - payload = orjson.dumps(tuple(normalized_rows), option=orjson.OPT_SORT_KEYS) - return sha256(payload).hexdigest() - - -def _coerce_segment_report_projection( - value: object, -) -> SegmentReportProjection | None: - if not isinstance(value, dict): - return None - digest = value.get("digest") - suppressed = value.get("suppressed") - groups = value.get("groups") - if ( - not isinstance(digest, str) - or not isinstance(suppressed, int) - or not isinstance(groups, dict) - ): - return None - if not all( - isinstance(group_key, str) and isinstance(items, list) - for group_key, items in groups.items() - ): - return None - return cast("SegmentReportProjection", value) - - -def _module_dep_sort_key(dep: ModuleDep) -> tuple[str, str, str, int]: - return dep.source, dep.target, dep.import_type, dep.line - - -def _class_metric_sort_key(metric: ClassMetrics) -> tuple[str, int, int, str]: - return metric.filepath, metric.start_line, metric.end_line, metric.qualname - - -def _dead_candidate_sort_key(item: DeadCandidate) -> tuple[str, int, int, str]: - return item.filepath, item.start_line, item.end_line, item.qualname - - -def _unit_to_group_item(unit: Unit) -> GroupItem: - return { - "qualname": unit.qualname, - "filepath": unit.filepath, - "start_line": unit.start_line, - "end_line": unit.end_line, - "loc": unit.loc, - "stmt_count": unit.stmt_count, - "fingerprint": unit.fingerprint, - "loc_bucket": unit.loc_bucket, - "cyclomatic_complexity": 
unit.cyclomatic_complexity, - "nesting_depth": unit.nesting_depth, - "risk": unit.risk, - "raw_hash": unit.raw_hash, - "entry_guard_count": unit.entry_guard_count, - "entry_guard_terminal_profile": unit.entry_guard_terminal_profile, - "entry_guard_has_side_effect_before": unit.entry_guard_has_side_effect_before, - "terminal_kind": unit.terminal_kind, - "try_finally_profile": unit.try_finally_profile, - "side_effect_order_profile": unit.side_effect_order_profile, - } - - -def _block_to_group_item(block: BlockUnit) -> GroupItem: - return { - "block_hash": block.block_hash, - "filepath": block.filepath, - "qualname": block.qualname, - "start_line": block.start_line, - "end_line": block.end_line, - "size": block.size, - } - - -def _segment_to_group_item(segment: SegmentUnit) -> GroupItem: - return { - "segment_hash": segment.segment_hash, - "segment_sig": segment.segment_sig, - "filepath": segment.filepath, - "qualname": segment.qualname, - "start_line": segment.start_line, - "end_line": segment.end_line, - "size": segment.size, - } - - -def _parallel_min_files(processes: int) -> int: - return max(PARALLEL_MIN_FILES_FLOOR, processes * PARALLEL_MIN_FILES_PER_WORKER) - - -def _resolve_process_count(processes: object) -> int: - if processes is None: - return DEFAULT_RUNTIME_PROCESSES - return max(1, _as_int(processes, DEFAULT_RUNTIME_PROCESSES)) - - -def _should_collect_structural_findings(output_paths: OutputPaths) -> bool: - return any( - path is not None - for path in ( - output_paths.html, - output_paths.json, - output_paths.md, - output_paths.sarif, - output_paths.text, - ) - ) - - -def _should_use_parallel(files_count: int, processes: int) -> bool: - if processes <= 1: - return False - return files_count >= _parallel_min_files(processes) - - -def _new_discovery_buffers() -> tuple[ - list[GroupItem], - list[GroupItem], - list[GroupItem], - list[ClassMetrics], - list[ModuleDep], - list[DeadCandidate], - set[str], - set[str], - list[ModuleTypingCoverage], - list[ModuleDocstringCoverage], - list[ModuleApiSurface], - list[str], - list[str], -]: - return [], [], [], [], [], [], set(), set(), [], [], [], [], [] - - -def _decode_cached_structural_finding_group( - group_dict: StructuralFindingGroupDict, - filepath: str, -) -> StructuralFindingGroup: - """Convert a StructuralFindingGroupDict (from cache) to a StructuralFindingGroup.""" - finding_kind = group_dict["finding_kind"] - finding_key = group_dict["finding_key"] - signature = group_dict["signature"] - items = tuple( - StructuralFindingOccurrence( - finding_kind=finding_kind, - finding_key=finding_key, - file_path=filepath, - qualname=item["qualname"], - start=item["start"], - end=item["end"], - signature=signature, - ) - for item in group_dict["items"] - ) - return StructuralFindingGroup( - finding_kind=finding_kind, - finding_key=finding_key, - signature=signature, - items=items, - ) - - -def bootstrap( - *, - args: Namespace, - root: Path, - output_paths: OutputPaths, - cache_path: Path, -) -> BootstrapResult: - return BootstrapResult( - root=root, - config=NormalizationConfig(), - args=args, - output_paths=output_paths, - cache_path=cache_path, - ) - - -def _resolve_optional_runtime_path(value: object, *, root: Path) -> Path | None: - text = str(value).strip() if value is not None else "" - if not text: - return None - candidate = Path(text).expanduser() - resolved = candidate if candidate.is_absolute() else root / candidate - try: - return resolved.resolve() - except OSError: - return resolved.absolute() - - -def 
_cache_entry_has_metrics(entry: CacheEntry) -> bool: - metric_keys = ( - "class_metrics", - "module_deps", - "dead_candidates", - "referenced_names", - "referenced_qualnames", - "import_names", - "class_names", - ) - return all(key in entry and isinstance(entry.get(key), list) for key in metric_keys) - - -def _cache_entry_has_structural_findings(entry: CacheEntry) -> bool: - return "structural_findings" in entry - - -def _cache_entry_source_stats(entry: CacheEntry) -> tuple[int, int, int, int] | None: - stats_obj = entry.get("source_stats") - if not isinstance(stats_obj, dict): - return None - lines = stats_obj.get("lines") - functions = stats_obj.get("functions") - methods = stats_obj.get("methods") - classes = stats_obj.get("classes") - if not ( - isinstance(lines, int) - and isinstance(functions, int) - and isinstance(methods, int) - and isinstance(classes, int) - and lines >= 0 - and functions >= 0 - and methods >= 0 - and classes >= 0 - ): - return None - return lines, functions, methods, classes - - -def _usable_cached_source_stats( - entry: CacheEntry, - *, - skip_metrics: bool, - collect_structural_findings: bool, -) -> tuple[int, int, int, int] | None: - if not skip_metrics and not _cache_entry_has_metrics(entry): - return None - if collect_structural_findings and not _cache_entry_has_structural_findings(entry): - return None - return _cache_entry_source_stats(entry) - - -def _cache_dict_module_fields( - value: object, -) -> tuple[Mapping[str, object], str, str] | None: - if not isinstance(value, dict): - return None - row = cast("Mapping[str, object]", value) - module = row.get("module") - filepath = row.get("filepath") - if not isinstance(module, str) or not isinstance(filepath, str): - return None - return row, module, filepath - - -def _cache_dict_int_fields( - row: Mapping[str, object], - *keys: str, -) -> tuple[int, ...] 
| None: - values: list[int] = [] - for key in keys: - value = row.get(key) - if not isinstance(value, int): - return None - values.append(value) - return tuple(values) - - -def _typing_coverage_from_cache_dict( - value: object, -) -> ModuleTypingCoverage | None: - row_info = _cache_dict_module_fields(value) - if row_info is None: - return None - row, module, filepath = row_info - int_fields = _cache_dict_int_fields( - row, - "callable_count", - "params_total", - "params_annotated", - "returns_total", - "returns_annotated", - "any_annotation_count", - ) - if int_fields is None: - return None - ( - callable_count, - params_total, - params_annotated, - returns_total, - returns_annotated, - any_annotation_count, - ) = int_fields - return ModuleTypingCoverage( - module=module, - filepath=filepath, - callable_count=callable_count, - params_total=params_total, - params_annotated=params_annotated, - returns_total=returns_total, - returns_annotated=returns_annotated, - any_annotation_count=any_annotation_count, - ) - - -def _docstring_coverage_from_cache_dict( - value: object, -) -> ModuleDocstringCoverage | None: - row_info = _cache_dict_module_fields(value) - if row_info is None: - return None - row, module, filepath = row_info - totals = _cache_dict_int_fields( - row, - "public_symbol_total", - "public_symbol_documented", - ) - if totals is None: - return None - public_symbol_total, public_symbol_documented = totals - return ModuleDocstringCoverage( - module=module, - filepath=filepath, - public_symbol_total=public_symbol_total, - public_symbol_documented=public_symbol_documented, - ) - - -def _api_param_spec_from_cache_dict(value: ApiParamSpecDict) -> ApiParamSpec | None: - name = value.get("name") - kind = value.get("kind") - has_default = value.get("has_default") - annotation_hash = value.get("annotation_hash", "") - if ( - not isinstance(name, str) - or not isinstance(kind, str) - or not isinstance(has_default, bool) - or not isinstance(annotation_hash, str) - ): - return None - return ApiParamSpec( - name=name, - kind=cast( - "Literal['pos_only', 'pos_or_kw', 'vararg', 'kw_only', 'kwarg']", - kind, - ), - has_default=has_default, - annotation_hash=annotation_hash, - ) - - -def _public_symbol_from_cache_dict( - value: PublicSymbolDict, -) -> PublicSymbol | None: - qualname = value.get("qualname") - kind = value.get("kind") - start_line = value.get("start_line") - end_line = value.get("end_line") - exported_via = value.get("exported_via", "name") - returns_hash = value.get("returns_hash", "") - params_raw = value.get("params", []) - if ( - not isinstance(qualname, str) - or not isinstance(kind, str) - or not isinstance(start_line, int) - or not isinstance(end_line, int) - or not isinstance(exported_via, str) - or not isinstance(returns_hash, str) - or not isinstance(params_raw, list) - ): - return None - params = [] - for param in params_raw: - if not isinstance(param, dict): - return None - parsed = _api_param_spec_from_cache_dict(param) - if parsed is None: - return None - params.append(parsed) - return PublicSymbol( - qualname=qualname, - kind=cast("Literal['function', 'class', 'method', 'constant']", kind), - start_line=start_line, - end_line=end_line, - params=tuple(params), - returns_hash=returns_hash, - exported_via=cast("Literal['all', 'name']", exported_via), - ) - - -def _api_surface_from_cache_dict(value: object) -> ModuleApiSurface | None: - row_info = _cache_dict_module_fields(value) - if row_info is None: - return None - row, module, filepath = row_info - all_declared_raw = 
row.get("all_declared", []) - symbols_raw = row.get("symbols", []) - if ( - not isinstance(all_declared_raw, list) - or not isinstance(symbols_raw, list) - or not all(isinstance(item, str) for item in all_declared_raw) - ): - return None - symbols: list[PublicSymbol] = [] - for item in symbols_raw: - if not isinstance(item, dict): - return None - parsed = _public_symbol_from_cache_dict(cast("PublicSymbolDict", item)) - if parsed is None: - return None - symbols.append(parsed) - return ModuleApiSurface( - module=module, - filepath=filepath, - all_declared=tuple(sorted(set(all_declared_raw))) or None, - symbols=tuple(sorted(symbols, key=lambda item: item.qualname)), - ) - - -def _load_cached_metrics_extended( - entry: CacheEntry, - *, - filepath: str, -) -> tuple[ - tuple[ClassMetrics, ...], - tuple[ModuleDep, ...], - tuple[DeadCandidate, ...], - frozenset[str], - frozenset[str], - ModuleTypingCoverage | None, - ModuleDocstringCoverage | None, - ModuleApiSurface | None, -]: - class_metrics_rows: list[ClassMetricsDict] = entry.get("class_metrics", []) - class_metrics = tuple( - ClassMetrics( - qualname=row["qualname"], - filepath=row["filepath"], - start_line=row["start_line"], - end_line=row["end_line"], - cbo=row["cbo"], - lcom4=row["lcom4"], - method_count=row["method_count"], - instance_var_count=row["instance_var_count"], - risk_coupling=cast( - "Literal['low', 'medium', 'high']", - row["risk_coupling"], - ), - risk_cohesion=cast( - "Literal['low', 'medium', 'high']", - row["risk_cohesion"], - ), - coupled_classes=_as_sorted_str_tuple(row.get("coupled_classes", [])), - ) - for row in class_metrics_rows - if row.get("qualname") and row.get("filepath") - ) - - module_dep_rows: list[ModuleDepDict] = entry.get("module_deps", []) - module_deps = tuple( - ModuleDep( - source=row["source"], - target=row["target"], - import_type=cast("Literal['import', 'from_import']", row["import_type"]), - line=row["line"], - ) - for row in module_dep_rows - if row.get("source") and row.get("target") - ) - - dead_rows: list[DeadCandidateDict] = entry.get("dead_candidates", []) - dead_candidates = tuple( - DeadCandidate( - qualname=row["qualname"], - local_name=row["local_name"], - filepath=row["filepath"], - start_line=row["start_line"], - end_line=row["end_line"], - kind=cast( - "Literal['function', 'class', 'method', 'import']", - row["kind"], - ), - suppressed_rules=tuple(sorted(set(row.get("suppressed_rules", [])))), - ) - for row in dead_rows - if row.get("qualname") and row.get("local_name") and row.get("filepath") - ) - - referenced_names = ( - frozenset() - if is_test_filepath(filepath) - else frozenset(entry.get("referenced_names", [])) - ) - referenced_qualnames = ( - frozenset() - if is_test_filepath(filepath) - else frozenset(entry.get("referenced_qualnames", [])) - ) - typing_coverage = _typing_coverage_from_cache_dict(entry.get("typing_coverage")) - docstring_coverage = _docstring_coverage_from_cache_dict( - entry.get("docstring_coverage") - ) - api_surface = _api_surface_from_cache_dict(entry.get("api_surface")) - return ( - class_metrics, - module_deps, - dead_candidates, - referenced_names, - referenced_qualnames, - typing_coverage, - docstring_coverage, - api_surface, - ) - - -def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: - files_found = 0 - cache_hits = 0 - files_skipped = 0 - collect_structural_findings = _should_collect_structural_findings(boot.output_paths) - cached_segment_projection = _coerce_segment_report_projection( - getattr(cache, 
"segment_report_projection", None) - ) - - ( - cached_units, - cached_blocks, - cached_segments, - cached_class_metrics, - cached_module_deps, - cached_dead_candidates, - cached_referenced_names, - cached_referenced_qualnames, - cached_typing_modules, - cached_docstring_modules, - cached_api_modules, - files_to_process, - skipped_warnings, - ) = _new_discovery_buffers() - cached_sf: list[StructuralFindingGroup] = [] - cached_source_stats_by_file: list[tuple[str, int, int, int, int]] = [] - cached_lines = 0 - cached_functions = 0 - cached_methods = 0 - cached_classes = 0 - all_file_paths: list[str] = [] - - for filepath in iter_py_files(str(boot.root)): - files_found += 1 - all_file_paths.append(filepath) - try: - stat = file_stat_signature(filepath) - except OSError as exc: - files_skipped += 1 - skipped_warnings.append(f"{filepath}: {exc}") - continue - - cached = cache.get_file_entry(filepath) - if cached and cached.get("stat") == stat: - cached_source_stats = _usable_cached_source_stats( - cached, - skip_metrics=boot.args.skip_metrics, - collect_structural_findings=collect_structural_findings, - ) - if cached_source_stats is None: - files_to_process.append(filepath) - continue - - cache_hits += 1 - lines, functions, methods, classes = cached_source_stats - cached_lines += lines - cached_functions += functions - cached_methods += methods - cached_classes += classes - cached_source_stats_by_file.append( - (filepath, lines, functions, methods, classes) - ) - cached_units.extend(cast("list[GroupItem]", cast(object, cached["units"]))) - cached_blocks.extend( - cast("list[GroupItem]", cast(object, cached["blocks"])) - ) - cached_segments.extend( - cast("list[GroupItem]", cast(object, cached["segments"])) - ) - - if not boot.args.skip_metrics: - ( - class_metrics, - module_deps, - dead_candidates, - referenced_names, - referenced_qualnames, - typing_coverage, - docstring_coverage, - api_surface, - ) = _load_cached_metrics_extended(cached, filepath=filepath) - cached_class_metrics.extend(class_metrics) - cached_module_deps.extend(module_deps) - cached_dead_candidates.extend(dead_candidates) - cached_referenced_names.update(referenced_names) - cached_referenced_qualnames.update(referenced_qualnames) - if typing_coverage is not None: - cached_typing_modules.append(typing_coverage) - if docstring_coverage is not None: - cached_docstring_modules.append(docstring_coverage) - if api_surface is not None: - cached_api_modules.append(api_surface) - if collect_structural_findings: - cached_sf.extend( - _decode_cached_structural_finding_group(group_dict, filepath) - for group_dict in cached.get("structural_findings") or [] - ) - continue - - files_to_process.append(filepath) - - return DiscoveryResult( - files_found=files_found, - cache_hits=cache_hits, - files_skipped=files_skipped, - all_file_paths=tuple(all_file_paths), - cached_units=tuple(sorted(cached_units, key=_group_item_sort_key)), - cached_blocks=tuple(sorted(cached_blocks, key=_group_item_sort_key)), - cached_segments=tuple(sorted(cached_segments, key=_group_item_sort_key)), - cached_class_metrics=tuple( - sorted(cached_class_metrics, key=_class_metric_sort_key) - ), - cached_module_deps=tuple(sorted(cached_module_deps, key=_module_dep_sort_key)), - cached_dead_candidates=tuple( - sorted(cached_dead_candidates, key=_dead_candidate_sort_key) - ), - cached_referenced_names=frozenset(cached_referenced_names), - cached_referenced_qualnames=frozenset(cached_referenced_qualnames), - cached_typing_modules=tuple( - sorted(cached_typing_modules, 
key=lambda item: (item.filepath, item.module)) - ), - cached_docstring_modules=tuple( - sorted( - cached_docstring_modules, - key=lambda item: (item.filepath, item.module), - ) - ), - cached_api_modules=tuple( - sorted(cached_api_modules, key=lambda item: (item.filepath, item.module)) - ), - files_to_process=tuple(files_to_process), - skipped_warnings=tuple(sorted(skipped_warnings)), - cached_structural_findings=tuple(cached_sf), - cached_segment_report_projection=cached_segment_projection, - cached_lines=cached_lines, - cached_functions=cached_functions, - cached_methods=cached_methods, - cached_classes=cached_classes, - cached_source_stats_by_file=tuple( - sorted(cached_source_stats_by_file, key=lambda row: row[0]) - ), - ) - - -def process_file( - filepath: str, - root: str, - cfg: NormalizationConfig, - min_loc: int, - min_stmt: int, - collect_structural_findings: bool = True, - collect_api_surface: bool = False, - api_include_private_modules: bool = False, - block_min_loc: int = 20, - block_min_stmt: int = 8, - segment_min_loc: int = 20, - segment_min_stmt: int = 10, -) -> FileProcessResult: - try: - try: - stat_result = os.stat(filepath) - if stat_result.st_size > MAX_FILE_SIZE: - return FileProcessResult( - filepath=filepath, - success=False, - error=( - f"File too large: {stat_result.st_size} bytes " - f"(max {MAX_FILE_SIZE})" - ), - error_kind="file_too_large", - ) - except OSError as exc: - return FileProcessResult( - filepath=filepath, - success=False, - error=f"Cannot stat file: {exc}", - error_kind="stat_error", - ) - - stat: FileStat = { - "mtime_ns": stat_result.st_mtime_ns, - "size": stat_result.st_size, - } - - try: - source = Path(filepath).read_text("utf-8") - except UnicodeDecodeError as exc: - return FileProcessResult( - filepath=filepath, - success=False, - error=f"Encoding error: {exc}", - error_kind="source_read_error", - ) - except OSError as exc: - return FileProcessResult( - filepath=filepath, - success=False, - error=f"Cannot read file: {exc}", - error_kind="source_read_error", - ) - - module_name = module_name_from_path(root, filepath) - units, blocks, segments, source_stats, file_metrics, sf = ( - extract_units_and_stats_from_source( - source=source, - filepath=filepath, - module_name=module_name, - cfg=cfg, - min_loc=min_loc, - min_stmt=min_stmt, - block_min_loc=block_min_loc, - block_min_stmt=block_min_stmt, - segment_min_loc=segment_min_loc, - segment_min_stmt=segment_min_stmt, - collect_structural_findings=collect_structural_findings, - collect_api_surface=collect_api_surface, - api_include_private_modules=api_include_private_modules, - ) - ) - - return FileProcessResult( - filepath=filepath, - success=True, - units=units, - blocks=blocks, - segments=segments, - lines=source_stats.lines, - functions=source_stats.functions, - methods=source_stats.methods, - classes=source_stats.classes, - stat=stat, - file_metrics=file_metrics, - structural_findings=sf, - ) - except Exception as exc: # pragma: no cover - defensive shell around workers - return FileProcessResult( - filepath=filepath, - success=False, - error=f"Unexpected error: {type(exc).__name__}: {exc}", - error_kind="unexpected_error", - ) - - -def _invoke_process_file( - filepath: str, - root: str, - cfg: NormalizationConfig, - min_loc: int, - min_stmt: int, - *, - collect_structural_findings: bool, - collect_api_surface: bool, - api_include_private_modules: bool, - block_min_loc: int, - block_min_stmt: int, - segment_min_loc: int, - segment_min_stmt: int, -) -> FileProcessResult: - optional_kwargs: 
dict[str, object] = {
-        "collect_structural_findings": collect_structural_findings,
-        "collect_api_surface": collect_api_surface,
-        "api_include_private_modules": api_include_private_modules,
-        "block_min_loc": block_min_loc,
-        "block_min_stmt": block_min_stmt,
-        "segment_min_loc": segment_min_loc,
-        "segment_min_stmt": segment_min_stmt,
-    }
-    try:
-        signature = inspect.signature(process_file)
-    except (TypeError, ValueError):
-        supported_kwargs = optional_kwargs
-    else:
-        parameters = tuple(signature.parameters.values())
-        if any(
-            parameter.kind == inspect.Parameter.VAR_KEYWORD for parameter in parameters
-        ):
-            supported_kwargs = optional_kwargs
-        else:
-            supported_names = {parameter.name for parameter in parameters}
-            supported_kwargs = {
-                key: value
-                for key, value in optional_kwargs.items()
-                if key in supported_names
-            }
-    process_callable = cast("Callable[..., FileProcessResult]", process_file)
-    return process_callable(
-        filepath,
-        root,
-        cfg,
-        min_loc,
-        min_stmt,
-        **supported_kwargs,
-    )
-
-
-def process(
-    *,
-    boot: BootstrapResult,
-    discovery: DiscoveryResult,
-    cache: Cache,
-    on_advance: Callable[[], None] | None = None,
-    on_worker_error: Callable[[str], None] | None = None,
-    on_parallel_fallback: Callable[[Exception], None] | None = None,
-    batch_size: int = DEFAULT_BATCH_SIZE,
-) -> ProcessingResult:
-    files_to_process = discovery.files_to_process
-    if not files_to_process:
-        return ProcessingResult(
-            units=discovery.cached_units,
-            blocks=discovery.cached_blocks,
-            segments=discovery.cached_segments,
-            class_metrics=discovery.cached_class_metrics,
-            module_deps=discovery.cached_module_deps,
-            dead_candidates=discovery.cached_dead_candidates,
-            referenced_names=discovery.cached_referenced_names,
-            referenced_qualnames=discovery.cached_referenced_qualnames,
-            typing_modules=discovery.cached_typing_modules,
-            docstring_modules=discovery.cached_docstring_modules,
-            api_modules=discovery.cached_api_modules,
-            files_analyzed=0,
-            files_skipped=discovery.files_skipped,
-            analyzed_lines=0,
-            analyzed_functions=0,
-            analyzed_methods=0,
-            analyzed_classes=0,
-            failed_files=(),
-            source_read_failures=(),
-            structural_findings=discovery.cached_structural_findings,
-            source_stats_by_file=discovery.cached_source_stats_by_file,
-        )
-
-    all_units: list[GroupItem] = list(discovery.cached_units)
-    all_blocks: list[GroupItem] = list(discovery.cached_blocks)
-    all_segments: list[GroupItem] = list(discovery.cached_segments)
-
-    all_class_metrics: list[ClassMetrics] = list(discovery.cached_class_metrics)
-    all_module_deps: list[ModuleDep] = list(discovery.cached_module_deps)
-    all_dead_candidates: list[DeadCandidate] = list(discovery.cached_dead_candidates)
-    all_referenced_names: set[str] = set(discovery.cached_referenced_names)
-    all_referenced_qualnames: set[str] = set(discovery.cached_referenced_qualnames)
-    all_typing_modules: list[ModuleTypingCoverage] = list(
-        discovery.cached_typing_modules
-    )
-    all_docstring_modules: list[ModuleDocstringCoverage] = list(
-        discovery.cached_docstring_modules
-    )
-    all_api_modules: list[ModuleApiSurface] = list(discovery.cached_api_modules)
-    collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
-    collect_api_surface = not boot.args.skip_metrics and bool(
-        getattr(boot.args, "api_surface", False)
-    )
-    api_include_private_modules = bool(
-        getattr(boot.args, "api_include_private_modules", False)
-    )
-
-    files_analyzed = 0
-    files_skipped = discovery.files_skipped
-    analyzed_lines = 0
-    analyzed_functions = 0
-    analyzed_methods = 0
-    analyzed_classes = 0
-
-    all_structural_findings: list[StructuralFindingGroup] = list(
-        discovery.cached_structural_findings
-    )
-    source_stats_by_file: dict[str, tuple[int, int, int, int]] = {
-        filepath: (lines, functions, methods, classes)
-        for filepath, lines, functions, methods, classes in (
-            discovery.cached_source_stats_by_file
-        )
-    }
-    failed_files: list[str] = []
-    source_read_failures: list[str] = []
-    root_str = str(boot.root)
-    # Keep process-count fallback in the core runtime so non-CLI callers such as
-    # the MCP service do not need to guess or mirror parallelism policy.
-    processes = _resolve_process_count(boot.args.processes)
-    min_loc = int(boot.args.min_loc)
-    min_stmt = int(boot.args.min_stmt)
-    block_min_loc = int(boot.args.block_min_loc)
-    block_min_stmt = int(boot.args.block_min_stmt)
-    segment_min_loc = int(boot.args.segment_min_loc)
-    segment_min_stmt = int(boot.args.segment_min_stmt)
-    collect_structural_findings = _should_collect_structural_findings(boot.output_paths)
-
-    def _accept_result(result: FileProcessResult) -> None:
-        nonlocal files_analyzed
-        nonlocal files_skipped
-        nonlocal analyzed_lines
-        nonlocal analyzed_functions
-        nonlocal analyzed_methods
-        nonlocal analyzed_classes
-
-        if result.success and result.stat is not None:
-            source_stats_payload = SourceStatsDict(
-                lines=result.lines,
-                functions=result.functions,
-                methods=result.methods,
-                classes=result.classes,
-            )
-            structural_payload = (
-                result.structural_findings if collect_structural_findings else None
-            )
-            try:
-                cache.put_file_entry(
-                    result.filepath,
-                    result.stat,
-                    result.units or [],
-                    result.blocks or [],
-                    result.segments or [],
-                    source_stats=source_stats_payload,
-                    file_metrics=result.file_metrics,
-                    structural_findings=structural_payload,
-                )
-            except TypeError as exc:
-                if "source_stats" not in str(exc):
-                    raise
-                cache.put_file_entry(
-                    result.filepath,
-                    result.stat,
-                    result.units or [],
-                    result.blocks or [],
-                    result.segments or [],
-                    file_metrics=result.file_metrics,
-                    structural_findings=structural_payload,
-                )
-            files_analyzed += 1
-            analyzed_lines += result.lines
-            analyzed_functions += result.functions
-            analyzed_methods += result.methods
-            analyzed_classes += result.classes
-            source_stats_by_file[result.filepath] = (
-                result.lines,
-                result.functions,
-                result.methods,
-                result.classes,
-            )
-
-            if result.units:
-                all_units.extend(_unit_to_group_item(unit) for unit in result.units)
-            if result.blocks:
-                all_blocks.extend(
-                    _block_to_group_item(block) for block in result.blocks
-                )
-            if result.segments:
-                all_segments.extend(
-                    _segment_to_group_item(segment) for segment in result.segments
-                )
-            if result.structural_findings:
-                all_structural_findings.extend(result.structural_findings)
-
-            if not boot.args.skip_metrics and result.file_metrics is not None:
-                all_class_metrics.extend(result.file_metrics.class_metrics)
-                all_module_deps.extend(result.file_metrics.module_deps)
-                all_dead_candidates.extend(result.file_metrics.dead_candidates)
-                all_referenced_names.update(result.file_metrics.referenced_names)
-                all_referenced_qualnames.update(
-                    result.file_metrics.referenced_qualnames
-                )
-                if result.file_metrics.typing_coverage is not None:
-                    all_typing_modules.append(result.file_metrics.typing_coverage)
-                if result.file_metrics.docstring_coverage is not None:
-                    all_docstring_modules.append(result.file_metrics.docstring_coverage)
-                if result.file_metrics.api_surface is not None:
-                    all_api_modules.append(result.file_metrics.api_surface)
-            return
-
-        files_skipped += 1
-        failure = f"{result.filepath}: {result.error}"
-        failed_files.append(failure)
-        if result.error_kind == "source_read_error":
-            source_read_failures.append(failure)
-
-    def _run_sequential(files: Sequence[str]) -> None:
-        for filepath in files:
-            _accept_result(
-                _invoke_process_file(
-                    filepath,
-                    root_str,
-                    boot.config,
-                    min_loc,
-                    min_stmt,
-                    collect_structural_findings=collect_structural_findings,
-                    collect_api_surface=collect_api_surface,
-                    api_include_private_modules=api_include_private_modules,
-                    block_min_loc=block_min_loc,
-                    block_min_stmt=block_min_stmt,
-                    segment_min_loc=segment_min_loc,
-                    segment_min_stmt=segment_min_stmt,
-                )
-            )
-            if on_advance is not None:
-                on_advance()
-
-    if _should_use_parallel(len(files_to_process), processes):
-        try:
-            with ProcessPoolExecutor(max_workers=processes) as executor:
-                for idx in range(0, len(files_to_process), batch_size):
-                    batch = files_to_process[idx : idx + batch_size]
-                    futures = [
-                        executor.submit(
-                            _invoke_process_file,
-                            filepath,
-                            root_str,
-                            boot.config,
-                            min_loc,
-                            min_stmt,
-                            collect_structural_findings=collect_structural_findings,
-                            collect_api_surface=collect_api_surface,
-                            api_include_private_modules=api_include_private_modules,
-                            block_min_loc=block_min_loc,
-                            block_min_stmt=block_min_stmt,
-                            segment_min_loc=segment_min_loc,
-                            segment_min_stmt=segment_min_stmt,
-                        )
-                        for filepath in batch
-                    ]
-                    future_to_path = {
-                        id(future): filepath
-                        for future, filepath in zip(futures, batch, strict=True)
-                    }
-                    for future in as_completed(futures):
-                        filepath = future_to_path[id(future)]
-                        try:
-                            _accept_result(future.result())
-                        except Exception as exc:  # pragma: no cover - worker crash
-                            files_skipped += 1
-                            failed_files.append(f"{filepath}: {exc}")
-                            if on_worker_error is not None:
-                                on_worker_error(str(exc))
-                        if on_advance is not None:
-                            on_advance()
-        except (OSError, RuntimeError, PermissionError) as exc:
-            if on_parallel_fallback is not None:
-                on_parallel_fallback(exc)
-            _run_sequential(files_to_process)
-    else:
-        _run_sequential(files_to_process)
-
-    return ProcessingResult(
-        units=tuple(sorted(all_units, key=_group_item_sort_key)),
-        blocks=tuple(sorted(all_blocks, key=_group_item_sort_key)),
-        segments=tuple(sorted(all_segments, key=_group_item_sort_key)),
-        class_metrics=tuple(sorted(all_class_metrics, key=_class_metric_sort_key)),
-        module_deps=tuple(sorted(all_module_deps, key=_module_dep_sort_key)),
-        dead_candidates=tuple(
-            sorted(all_dead_candidates, key=_dead_candidate_sort_key)
-        ),
-        referenced_names=frozenset(all_referenced_names),
-        referenced_qualnames=frozenset(all_referenced_qualnames),
-        typing_modules=tuple(
-            sorted(all_typing_modules, key=lambda item: (item.filepath, item.module))
-        ),
-        docstring_modules=tuple(
-            sorted(all_docstring_modules, key=lambda item: (item.filepath, item.module))
-        ),
-        api_modules=tuple(
-            sorted(all_api_modules, key=lambda item: (item.filepath, item.module))
-        ),
-        files_analyzed=files_analyzed,
-        files_skipped=files_skipped,
-        analyzed_lines=analyzed_lines,
-        analyzed_functions=analyzed_functions,
-        analyzed_methods=analyzed_methods,
-        analyzed_classes=analyzed_classes,
-        failed_files=tuple(sorted(failed_files)),
-        source_read_failures=tuple(sorted(source_read_failures)),
-        structural_findings=tuple(all_structural_findings),
-        source_stats_by_file=tuple(
-            (filepath, *stats)
-            for filepath, stats in sorted(source_stats_by_file.items())
-        ),
-    )
-
-
-def _module_names_from_units(units: Sequence[GroupItemLike]) -> frozenset[str]:
-    modules: set[str] = set()
-    for unit in units:
-        qualname = _as_str(unit.get("qualname"))
-        module_name = qualname.split(":", 1)[0] if ":" in qualname else qualname
-        if module_name:
-            modules.add(module_name)
-    return frozenset(sorted(modules))
-
-
-def compute_project_metrics(
-    *,
-    units: Sequence[GroupItemLike],
-    class_metrics: Sequence[ClassMetrics],
-    module_deps: Sequence[ModuleDep],
-    dead_candidates: Sequence[DeadCandidate],
-    referenced_names: frozenset[str],
-    referenced_qualnames: frozenset[str],
-    typing_modules: Sequence[ModuleTypingCoverage] = (),
-    docstring_modules: Sequence[ModuleDocstringCoverage] = (),
-    api_modules: Sequence[ModuleApiSurface] = (),
-    files_found: int,
-    files_analyzed_or_cached: int,
-    function_clone_groups: int,
-    block_clone_groups: int,
-    skip_dependencies: bool,
-    skip_dead_code: bool,
-) -> tuple[ProjectMetrics, DepGraph, tuple[DeadItem, ...]]:
-    unit_rows = sorted(units, key=_group_item_sort_key)
-    complexities = tuple(
-        max(1, _as_int(row.get("cyclomatic_complexity"), 1)) for row in unit_rows
-    )
-    complexity_max = max(complexities) if complexities else 0
-    complexity_avg = (
-        float(sum(complexities)) / float(len(complexities)) if complexities else 0.0
-    )
-    high_risk_functions = tuple(
-        sorted(
-            {
-                _as_str(row.get("qualname"))
-                for row in unit_rows
-                if _as_str(row.get("risk")) == RISK_HIGH
-            }
-        )
-    )
-
-    classes_sorted = tuple(sorted(class_metrics, key=_class_metric_sort_key))
-    coupling_values = tuple(metric.cbo for metric in classes_sorted)
-    coupling_max = max(coupling_values) if coupling_values else 0
-    coupling_avg = (
-        float(sum(coupling_values)) / float(len(coupling_values))
-        if coupling_values
-        else 0.0
-    )
-    high_risk_classes = tuple(
-        sorted(
-            {
-                metric.qualname
-                for metric in classes_sorted
-                if metric.risk_coupling == RISK_HIGH
-            }
-        )
-    )
-
-    cohesion_values = tuple(metric.lcom4 for metric in classes_sorted)
-    cohesion_max = max(cohesion_values) if cohesion_values else 0
-    cohesion_avg = (
-        float(sum(cohesion_values)) / float(len(cohesion_values))
-        if cohesion_values
-        else 0.0
-    )
-    low_cohesion_classes = tuple(
-        sorted(
-            {
-                metric.qualname
-                for metric in classes_sorted
-                if metric.risk_cohesion == RISK_HIGH
-            }
-        )
-    )
-
-    dep_graph = DepGraph(
-        modules=frozenset(),
-        edges=(),
-        cycles=(),
-        max_depth=0,
-        longest_chains=(),
-    )
-    if not skip_dependencies:
-        dep_graph = build_dep_graph(
-            modules=_module_names_from_units(unit_rows),
-            deps=module_deps,
-        )
-
-    dead_items: tuple[DeadItem, ...] = ()
-    if not skip_dead_code:
-        dead_items = find_unused(
-            definitions=tuple(dead_candidates),
-            referenced_names=referenced_names,
-            referenced_qualnames=referenced_qualnames,
-        )
-
-    typing_rows = tuple(
-        sorted(typing_modules, key=lambda item: (item.filepath, item.module))
-    )
-    docstring_rows = tuple(
-        sorted(docstring_modules, key=lambda item: (item.filepath, item.module))
-    )
-    api_rows = tuple(sorted(api_modules, key=lambda item: (item.filepath, item.module)))
-    typing_param_total = sum(item.params_total for item in typing_rows)
-    typing_param_annotated = sum(item.params_annotated for item in typing_rows)
-    typing_return_total = sum(item.returns_total for item in typing_rows)
-    typing_return_annotated = sum(item.returns_annotated for item in typing_rows)
-    typing_any_count = sum(item.any_annotation_count for item in typing_rows)
-    docstring_public_total = sum(item.public_symbol_total for item in docstring_rows)
-    docstring_public_documented = sum(
-        item.public_symbol_documented for item in docstring_rows
-    )
-
-    health = compute_health(
-        HealthInputs(
-            files_found=files_found,
-            files_analyzed_or_cached=files_analyzed_or_cached,
-            function_clone_groups=function_clone_groups,
-            block_clone_groups=block_clone_groups,
-            complexity_avg=complexity_avg,
-            complexity_max=complexity_max,
-            high_risk_functions=len(high_risk_functions),
-            coupling_avg=coupling_avg,
-            coupling_max=coupling_max,
-            high_risk_classes=len(high_risk_classes),
-            cohesion_avg=cohesion_avg,
-            low_cohesion_classes=len(low_cohesion_classes),
-            dependency_cycles=len(dep_graph.cycles),
-            dependency_max_depth=dep_graph.max_depth,
-            dead_code_items=len(dead_items),
-        )
-    )
-
-    project_metrics = ProjectMetrics(
-        complexity_avg=complexity_avg,
-        complexity_max=complexity_max,
-        high_risk_functions=high_risk_functions,
-        coupling_avg=coupling_avg,
-        coupling_max=coupling_max,
-        high_risk_classes=high_risk_classes,
-        cohesion_avg=cohesion_avg,
-        cohesion_max=cohesion_max,
-        low_cohesion_classes=low_cohesion_classes,
-        dependency_modules=len(dep_graph.modules),
-        dependency_edges=len(dep_graph.edges),
-        dependency_edge_list=dep_graph.edges,
-        dependency_cycles=dep_graph.cycles,
-        dependency_max_depth=dep_graph.max_depth,
-        dependency_longest_chains=dep_graph.longest_chains,
-        dead_code=dead_items,
-        health=health,
-        typing_param_total=typing_param_total,
-        typing_param_annotated=typing_param_annotated,
-        typing_return_total=typing_return_total,
-        typing_return_annotated=typing_return_annotated,
-        typing_any_count=typing_any_count,
-        docstring_public_total=docstring_public_total,
-        docstring_public_documented=docstring_public_documented,
-        typing_modules=typing_rows,
-        docstring_modules=docstring_rows,
-        api_surface=ApiSurfaceSnapshot(modules=api_rows) if api_rows else None,
-    )
-    return project_metrics, dep_graph, dead_items
-
-
-def compute_suggestions(
-    *,
-    project_metrics: ProjectMetrics,
-    units: Sequence[GroupItemLike],
-    class_metrics: Sequence[ClassMetrics],
-    func_groups: Mapping[str, Sequence[GroupItemLike]],
-    block_groups: Mapping[str, Sequence[GroupItemLike]],
-    segment_groups: Mapping[str, Sequence[GroupItemLike]],
-    block_group_facts: Mapping[str, Mapping[str, str]] | None = None,
-    structural_findings: Sequence[StructuralFindingGroup] | None = None,
-    scan_root: str = "",
-) -> tuple[Suggestion, ...]:
-    return generate_suggestions(
-        project_metrics=project_metrics,
-        units=units,
-        class_metrics=class_metrics,
-        func_groups=func_groups,
-        block_groups=block_groups,
-        segment_groups=segment_groups,
-        block_group_facts=block_group_facts,
-        structural_findings=structural_findings,
-        scan_root=scan_root,
-    )
-
-
-def _permille(numerator: int, denominator: int) -> int:
-    if denominator <= 0:
-        return 0
-    return round((1000.0 * float(numerator)) / float(denominator))
-
-
-def _coverage_join_summary(
-    coverage_join: CoverageJoinResult | None,
-) -> dict[str, object]:
-    if coverage_join is None:
-        return {}
-    return {
-        "status": coverage_join.status,
-        "source": coverage_join.coverage_xml,
-        "files": coverage_join.files,
-        "units": len(coverage_join.units),
-        "measured_units": coverage_join.measured_units,
-        "overall_executable_lines": coverage_join.overall_executable_lines,
-        "overall_covered_lines": coverage_join.overall_covered_lines,
-        "overall_permille": _permille(
-            coverage_join.overall_covered_lines,
-            coverage_join.overall_executable_lines,
-        ),
-        "missing_from_report_units": sum(
-            1
-            for fact in coverage_join.units
-            if fact.coverage_status == "missing_from_report"
-        ),
-        "coverage_hotspots": coverage_join.coverage_hotspots,
-        "scope_gap_hotspots": coverage_join.scope_gap_hotspots,
-        "hotspot_threshold_percent": coverage_join.hotspot_threshold_percent,
-        "invalid_reason": coverage_join.invalid_reason,
-    }
-
-
-def _coverage_join_rows(
-    coverage_join: CoverageJoinResult | None,
-) -> list[dict[str, object]]:
-    if coverage_join is None or coverage_join.status != "ok":
-        return []
-    return sorted(
-        (
-            {
-                "qualname": fact.qualname,
-                "filepath": fact.filepath,
-                "start_line": fact.start_line,
-                "end_line": fact.end_line,
-                "cyclomatic_complexity": fact.cyclomatic_complexity,
-                "risk": fact.risk,
-                "executable_lines": fact.executable_lines,
-                "covered_lines": fact.covered_lines,
-                "coverage_permille": fact.coverage_permille,
-                "coverage_status": fact.coverage_status,
-                "coverage_hotspot": (
-                    fact.risk in {"medium", "high"}
-                    and fact.coverage_status == "measured"
-                    and (fact.coverage_permille / 10.0)
-                    < float(coverage_join.hotspot_threshold_percent)
-                ),
-                "scope_gap_hotspot": (
-                    fact.risk in {"medium", "high"}
-                    and fact.coverage_status == "missing_from_report"
-                ),
-                "coverage_review_item": (
-                    (
-                        fact.risk in {"medium", "high"}
-                        and fact.coverage_status == "measured"
-                        and (fact.coverage_permille / 10.0)
-                        < float(coverage_join.hotspot_threshold_percent)
-                    )
-                    or (
-                        fact.risk in {"medium", "high"}
-                        and fact.coverage_status == "missing_from_report"
-                    )
-                ),
-            }
-            for fact in coverage_join.units
-        ),
-        key=lambda item: (
-            0 if bool(item.get("coverage_hotspot")) else 1,
-            0 if bool(item.get("scope_gap_hotspot")) else 1,
-            {"high": 0, "medium": 1, "low": 2}.get(_as_str(item.get("risk")), 3),
-            _as_int(item.get("coverage_permille"), 0),
-            -_as_int(item.get("cyclomatic_complexity"), 0),
-            _as_str(item.get("filepath")),
-            _as_int(item.get("start_line")),
-            _as_str(item.get("qualname")),
-        ),
-    )
-
-
-def _coverage_adoption_rows(
-    project_metrics: ProjectMetrics,
-) -> list[dict[str, object]]:
-    docstring_by_module = {
-        (item.filepath, item.module): item for item in project_metrics.docstring_modules
-    }
-    rows: list[dict[str, object]] = []
-    seen_keys: set[tuple[str, str]] = set()
-    for typing_item in project_metrics.typing_modules:
-        key = (typing_item.filepath, typing_item.module)
-        seen_keys.add(key)
-        docstring_item = docstring_by_module.get(key)
-        doc_total = docstring_item.public_symbol_total if docstring_item else 0
-        doc_documented = (
-            docstring_item.public_symbol_documented if docstring_item else 0
-        )
-        rows.append(
-            {
-                "module": typing_item.module,
-                "filepath": typing_item.filepath,
-                "callable_count": typing_item.callable_count,
-                "params_total": typing_item.params_total,
-                "params_annotated": typing_item.params_annotated,
-                "param_permille": _permille(
-                    typing_item.params_annotated,
-                    typing_item.params_total,
-                ),
-                "returns_total": typing_item.returns_total,
-                "returns_annotated": typing_item.returns_annotated,
-                "return_permille": _permille(
-                    typing_item.returns_annotated,
-                    typing_item.returns_total,
-                ),
-                "any_annotation_count": typing_item.any_annotation_count,
-                "public_symbol_total": doc_total,
-                "public_symbol_documented": doc_documented,
-                "docstring_permille": _permille(doc_documented, doc_total),
-            }
-        )
-    for docstring_item in project_metrics.docstring_modules:
-        key = (docstring_item.filepath, docstring_item.module)
-        if key in seen_keys:
-            continue
-        rows.append(
-            {
-                "module": docstring_item.module,
-                "filepath": docstring_item.filepath,
-                "callable_count": 0,
-                "params_total": 0,
-                "params_annotated": 0,
-                "param_permille": 0,
-                "returns_total": 0,
-                "returns_annotated": 0,
-                "return_permille": 0,
-                "any_annotation_count": 0,
-                "public_symbol_total": docstring_item.public_symbol_total,
-                "public_symbol_documented": docstring_item.public_symbol_documented,
-                "docstring_permille": _permille(
-                    docstring_item.public_symbol_documented,
-                    docstring_item.public_symbol_total,
-                ),
-            }
-        )
-    return sorted(
-        rows,
-        key=lambda item: (
-            _as_int(item.get("param_permille")),
-            _as_int(item.get("docstring_permille")),
-            _as_int(item.get("return_permille")),
-            _as_str(item.get("module")),
-        ),
-    )
-
-
-def _api_surface_summary(
-    api_surface: ApiSurfaceSnapshot | None,
-) -> dict[str, object]:
-    modules = api_surface.modules if api_surface is not None else ()
-    return {
-        "enabled": api_surface is not None,
-        "modules": len(modules),
-        "public_symbols": sum(len(module.symbols) for module in modules),
-        "added": 0,
-        "breaking": 0,
-        "strict_types": False,
-    }
-
-
-def _api_surface_rows(
-    api_surface: ApiSurfaceSnapshot | None,
-) -> list[dict[str, object]]:
-    if api_surface is None:
-        return []
-    rows: list[dict[str, object]] = []
-    for module in api_surface.modules:
-        rows.extend(
-            {
-                "record_kind": "symbol",
-                "module": module.module,
-                "filepath": module.filepath,
-                "qualname": symbol.qualname,
-                "start_line": symbol.start_line,
-                "end_line": symbol.end_line,
-                "symbol_kind": symbol.kind,
-                "exported_via": symbol.exported_via,
-                "params_total": len(symbol.params),
-                "params": [
-                    {
-                        "name": param.name,
-                        "kind": param.kind,
-                        "has_default": param.has_default,
-                        "annotated": bool(param.annotation_hash),
-                    }
-                    for param in symbol.params
-                ],
-                "returns_annotated": bool(symbol.returns_hash),
-            }
-            for symbol in module.symbols
-        )
-    return sorted(
-        rows,
-        key=lambda item: (
-            _as_str(item.get("filepath")),
-            _as_int(item.get("start_line")),
-            _as_int(item.get("end_line")),
-            _as_str(item.get("qualname")),
-            _as_str(item.get("record_kind")),
-        ),
-    )
-
-
-def _breaking_api_surface_rows(
-    changes: Sequence[object],
-) -> list[dict[str, object]]:
-    rows: list[dict[str, object]] = []
-    for change in changes:
-        if not isinstance(change, ApiBreakingChange):
-            continue
-        module_name, _, _local_name = change.qualname.partition(":")
-        rows.append(
-            {
-                "record_kind": "breaking_change",
-                "module": module_name,
-                "filepath": change.filepath,
-                "qualname": change.qualname,
-                "start_line": change.start_line,
-                "end_line": change.end_line,
-                "symbol_kind": change.symbol_kind,
-                "change_kind": change.change_kind,
-                "detail": change.detail,
-            }
-        )
-    return sorted(
-        rows,
-        key=lambda item: (
-            _as_str(item.get("filepath")),
-            _as_int(item.get("start_line")),
-            _as_int(item.get("end_line")),
-            _as_str(item.get("qualname")),
-            _as_str(item.get("change_kind")),
-        ),
-    )
-
-
-def _enrich_metrics_report_payload(
-    *,
-    metrics_payload: Mapping[str, object],
-    metrics_diff: MetricsDiff | None,
-    coverage_adoption_diff_available: bool,
-    api_surface_diff_available: bool,
-) -> dict[str, object]:
-    enriched = {
-        key: (dict(value) if isinstance(value, Mapping) else value)
-        for key, value in metrics_payload.items()
-    }
-    coverage_adoption = dict(
-        cast("Mapping[str, object]", enriched.get("coverage_adoption", {}))
-    )
-    coverage_summary = dict(
-        cast("Mapping[str, object]", coverage_adoption.get("summary", {}))
-    )
-    if coverage_summary:
-        coverage_summary["baseline_diff_available"] = coverage_adoption_diff_available
-        coverage_summary["param_delta"] = (
-            int(metrics_diff.typing_param_permille_delta)
-            if metrics_diff is not None and coverage_adoption_diff_available
-            else 0
-        )
-        coverage_summary["return_delta"] = (
-            int(metrics_diff.typing_return_permille_delta)
-            if metrics_diff is not None and coverage_adoption_diff_available
-            else 0
-        )
-        coverage_summary["docstring_delta"] = (
-            int(metrics_diff.docstring_permille_delta)
-            if metrics_diff is not None and coverage_adoption_diff_available
-            else 0
-        )
-        coverage_adoption["summary"] = coverage_summary
-        enriched["coverage_adoption"] = coverage_adoption
-
-    api_surface = dict(cast("Mapping[str, object]", enriched.get("api_surface", {})))
-    api_summary = dict(cast("Mapping[str, object]", api_surface.get("summary", {})))
-    api_items = list(cast("Sequence[object]", api_surface.get("items", ())))
-    if api_summary:
-        api_summary["baseline_diff_available"] = api_surface_diff_available
-        api_summary["added"] = (
-            len(metrics_diff.new_api_symbols)
-            if metrics_diff is not None and api_surface_diff_available
-            else 0
-        )
-        api_summary["breaking"] = (
-            len(metrics_diff.new_api_breaking_changes)
-            if metrics_diff is not None and api_surface_diff_available
-            else 0
-        )
-        api_surface["summary"] = api_summary
-    if (
-        metrics_diff is not None
-        and api_surface_diff_available
-        and metrics_diff.new_api_breaking_changes
-    ):
-        api_items.extend(
-            _breaking_api_surface_rows(metrics_diff.new_api_breaking_changes)
-        )
-        api_surface["items"] = api_items
-    if api_surface:
-        enriched["api_surface"] = api_surface
-    return enriched
-
-
-def build_metrics_report_payload(
-    *,
-    scan_root: str = "",
-    project_metrics: ProjectMetrics,
-    coverage_join: CoverageJoinResult | None = None,
-    units: Sequence[GroupItemLike],
-    class_metrics: Sequence[ClassMetrics],
-    module_deps: Sequence[ModuleDep] = (),
-    source_stats_by_file: Sequence[tuple[str, int, int, int, int]] = (),
-    suppressed_dead_code: Sequence[DeadItem] = (),
-) -> dict[str, object]:
-    sorted_units = sorted(
-        units,
-        key=lambda item: (
-            _as_int(item.get("cyclomatic_complexity")),
-            _as_int(item.get("nesting_depth")),
-            _as_str(item.get("qualname")),
-        ),
-        reverse=True,
-    )
-    complexity_rows = [
-        {
-            "qualname": _as_str(item.get("qualname")),
-            "filepath": _as_str(item.get("filepath")),
-            "start_line": _as_int(item.get("start_line")),
-            "end_line": _as_int(item.get("end_line")),
-            "cyclomatic_complexity": _as_int(item.get("cyclomatic_complexity"), 1),
-            "nesting_depth": _as_int(item.get("nesting_depth")),
-            "risk": _as_str(item.get("risk"), RISK_LOW),
-        }
-        for item in sorted_units
-    ]
-    classes_sorted = sorted(
-        class_metrics,
-        key=lambda item: (item.cbo, item.lcom4, item.qualname),
-        reverse=True,
-    )
-    coupling_rows = [
-        {
-            "qualname": metric.qualname,
-            "filepath": metric.filepath,
-            "start_line": metric.start_line,
-            "end_line": metric.end_line,
-            "cbo": metric.cbo,
-            "risk": metric.risk_coupling,
-            "coupled_classes": list(metric.coupled_classes),
-        }
-        for metric in classes_sorted
-    ]
-    cohesion_rows = [
-        {
-            "qualname": metric.qualname,
-            "filepath": metric.filepath,
-            "start_line": metric.start_line,
-            "end_line": metric.end_line,
-            "lcom4": metric.lcom4,
-            "risk": metric.risk_cohesion,
-            "method_count": metric.method_count,
-            "instance_var_count": metric.instance_var_count,
-        }
-        for metric in classes_sorted
-    ]
-    active_dead_items = tuple(project_metrics.dead_code)
-    suppressed_dead_items = tuple(suppressed_dead_code)
-    coverage_adoption_rows = _coverage_adoption_rows(project_metrics)
-    api_surface_summary = _api_surface_summary(project_metrics.api_surface)
-    api_surface_items = _api_surface_rows(project_metrics.api_surface)
-    coverage_join_summary = _coverage_join_summary(coverage_join)
-    coverage_join_items = _coverage_join_rows(coverage_join)
-
-    def _serialize_dead_item(
-        item: DeadItem,
-        *,
-        suppressed: bool = False,
-    ) -> dict[str, object]:
-        payload: dict[str, object] = {
-            "qualname": item.qualname,
-            "filepath": item.filepath,
-            "start_line": item.start_line,
-            "end_line": item.end_line,
-            "kind": item.kind,
-            "confidence": item.confidence,
-        }
-        if suppressed:
-            payload["suppressed_by"] = [
-                {
-                    "rule": DEAD_CODE_RULE_ID,
-                    "source": INLINE_CODECLONE_SUPPRESSION_SOURCE,
-                }
-            ]
-        return payload
-
-    payload = {
-        CATEGORY_COMPLEXITY: {
-            "functions": complexity_rows,
-            "summary": {
-                "total": len(complexity_rows),
-                "average": round(project_metrics.complexity_avg, 2),
-                "max": project_metrics.complexity_max,
-                "high_risk": len(project_metrics.high_risk_functions),
-            },
-        },
-        CATEGORY_COUPLING: {
-            "classes": coupling_rows,
-            "summary": {
-                "total": len(coupling_rows),
-                "average": round(project_metrics.coupling_avg, 2),
-                "max": project_metrics.coupling_max,
-                "high_risk": len(project_metrics.high_risk_classes),
-            },
-        },
-        CATEGORY_COHESION: {
-            "classes": cohesion_rows,
-            "summary": {
-                "total": len(cohesion_rows),
-                "average": round(project_metrics.cohesion_avg, 2),
-                "max": project_metrics.cohesion_max,
-                "low_cohesion": len(project_metrics.low_cohesion_classes),
-            },
-        },
-        "dependencies": {
-            "modules": project_metrics.dependency_modules,
-            "edges": project_metrics.dependency_edges,
-            "max_depth": project_metrics.dependency_max_depth,
-            "cycles": [list(cycle) for cycle in project_metrics.dependency_cycles],
-            "longest_chains": [
-                list(chain) for chain in project_metrics.dependency_longest_chains
-            ],
-            "edge_list": [
-                {
-                    "source": edge.source,
-                    "target": edge.target,
-                    "import_type": edge.import_type,
-                    "line": edge.line,
-                }
-                for edge in project_metrics.dependency_edge_list
-            ],
-        },
-        "dead_code": {
-            "items": [_serialize_dead_item(item) for item in active_dead_items],
-            "suppressed_items": [
-                _serialize_dead_item(item, suppressed=True)
-                for item in suppressed_dead_items
-            ],
-            "summary": {
-                "total": len(active_dead_items),
-                "critical": sum(
-                    1
-                    for item in active_dead_items
-                    if item.confidence == CONFIDENCE_HIGH
-                ),
-                "high_confidence": sum(
-                    1
-                    for item in active_dead_items
-                    if item.confidence == CONFIDENCE_HIGH
-                ),
-                "suppressed": len(suppressed_dead_items),
-            },
-        },
-        "health": {
-            "score": project_metrics.health.total,
-            "grade": project_metrics.health.grade,
-            "dimensions": dict(project_metrics.health.dimensions),
-        },
-        "coverage_adoption": {
-            "summary": {
-                "modules": len(coverage_adoption_rows),
-                "params_total": project_metrics.typing_param_total,
-                "params_annotated": project_metrics.typing_param_annotated,
-                "param_permille": _permille(
-                    project_metrics.typing_param_annotated,
-                    project_metrics.typing_param_total,
-                ),
-                "returns_total": project_metrics.typing_return_total,
-                "returns_annotated": project_metrics.typing_return_annotated,
-                "return_permille": _permille(
-                    project_metrics.typing_return_annotated,
-                    project_metrics.typing_return_total,
-                ),
-                "public_symbol_total": project_metrics.docstring_public_total,
-                "public_symbol_documented": project_metrics.docstring_public_documented,
-                "docstring_permille": _permille(
-                    project_metrics.docstring_public_documented,
-                    project_metrics.docstring_public_total,
-                ),
-                "typing_any_count": project_metrics.typing_any_count,
-            },
-            "items": coverage_adoption_rows,
-        },
-        "api_surface": {
-            "summary": dict(api_surface_summary),
-            "items": api_surface_items,
-        },
-        "overloaded_modules": build_overloaded_modules_payload(
-            scan_root=scan_root,
-            source_stats_by_file=source_stats_by_file,
-            units=units,
-            class_metrics=class_metrics,
-            module_deps=module_deps,
-        ),
-    }
-    if coverage_join is not None:
-        payload["coverage_join"] = {
-            "summary": dict(coverage_join_summary),
-            "items": coverage_join_items,
-        }
-    return payload
-
-
-def analyze(
-    *,
-    boot: BootstrapResult,
-    discovery: DiscoveryResult,
-    processing: ProcessingResult,
-) -> AnalysisResult:
-    golden_fixture_paths = tuple(
-        str(pattern).strip()
-        for pattern in getattr(boot.args, "golden_fixture_paths", ())
-        if str(pattern).strip()
-    )
-
-    func_split = split_clone_groups_for_golden_fixtures(
-        groups=build_groups(processing.units),
-        kind="function",
-        golden_fixture_paths=golden_fixture_paths,
-        scan_root=str(boot.root),
-    )
-    block_split = split_clone_groups_for_golden_fixtures(
-        groups=build_block_groups(processing.blocks),
-        kind="block",
-        golden_fixture_paths=golden_fixture_paths,
-        scan_root=str(boot.root),
-    )
-    segment_split = split_clone_groups_for_golden_fixtures(
-        groups=build_segment_groups(processing.segments),
-        kind="segment",
-        golden_fixture_paths=golden_fixture_paths,
-        scan_root=str(boot.root),
-    )
-
-    func_groups = func_split.active_groups
-    block_groups = block_split.active_groups
-    segment_groups_raw = segment_split.active_groups
-    segment_groups_raw_digest = _segment_groups_digest(segment_groups_raw)
-    cached_projection = discovery.cached_segment_report_projection
-    if (
-        cached_projection is not None
-        and cached_projection.get("digest") == segment_groups_raw_digest
-    ):
-        projection_groups = cached_projection.get("groups", {})
-        segment_groups = {
-            group_key: [
-                {
-                    "segment_hash": str(item["segment_hash"]),
-                    "segment_sig": str(item["segment_sig"]),
-                    "filepath": str(item["filepath"]),
-                    "qualname": str(item["qualname"]),
-                    "start_line": int(item["start_line"]),
-                    "end_line": int(item["end_line"]),
-                    "size": int(item["size"]),
-                }
-                for item in projection_groups[group_key]
-            ]
-            for group_key in sorted(projection_groups)
-        }
-        suppressed_segment_groups = int(cached_projection.get("suppressed", 0))
-    else:
-        segment_groups, suppressed_segment_groups = prepare_segment_report_groups(
-            segment_groups_raw
-        )
-
-    block_groups_report = prepare_block_report_groups(block_groups)
-    suppressed_block_groups_report = prepare_block_report_groups(
-        block_split.suppressed_groups
-    )
-    if segment_split.suppressed_groups:
-        suppressed_segment_groups_report, _ = prepare_segment_report_groups(
-            segment_split.suppressed_groups
-        )
-    else:
-        suppressed_segment_groups_report = {}
-    suppressed_clone_groups = (
-        *build_suppressed_clone_groups(
-            kind="function",
-            groups=func_split.suppressed_groups,
-            matched_patterns=func_split.matched_patterns,
-        ),
-        *build_suppressed_clone_groups(
-            kind="block",
-            groups=suppressed_block_groups_report,
-            matched_patterns=block_split.matched_patterns,
-        ),
-        *build_suppressed_clone_groups(
-            kind="segment",
-            groups=suppressed_segment_groups_report,
-            matched_patterns=segment_split.matched_patterns,
-        ),
-    )
-    block_group_facts = build_block_group_facts(
-        {
-            **block_groups_report,
-            **suppressed_block_groups_report,
-        }
-    )
-
-    func_clones_count = len(func_groups)
-    block_clones_count = len(block_groups)
-    segment_clones_count = len(segment_groups)
-    files_analyzed_or_cached = processing.files_analyzed + discovery.cache_hits
-
-    project_metrics: ProjectMetrics | None = None
-    metrics_payload: dict[str, object] | None = None
-    suggestions: tuple[Suggestion, ...] = ()
-    suppressed_dead_items: tuple[DeadItem, ...] = ()
-    coverage_join: CoverageJoinResult | None = None
-    cohort_structural_findings: tuple[StructuralFindingGroup, ...] = ()
-    if _should_collect_structural_findings(boot.output_paths):
-        cohort_structural_findings = build_clone_cohort_structural_findings(
-            func_groups=func_groups,
-        )
-    combined_structural_findings = (
-        *processing.structural_findings,
-        *cohort_structural_findings,
-    )
-
-    if not boot.args.skip_metrics:
-        project_metrics, _, _ = compute_project_metrics(
-            units=processing.units,
-            class_metrics=processing.class_metrics,
-            module_deps=processing.module_deps,
-            dead_candidates=processing.dead_candidates,
-            referenced_names=processing.referenced_names,
-            referenced_qualnames=processing.referenced_qualnames,
-            typing_modules=processing.typing_modules,
-            docstring_modules=processing.docstring_modules,
-            api_modules=processing.api_modules,
-            files_found=discovery.files_found,
-            files_analyzed_or_cached=files_analyzed_or_cached,
-            function_clone_groups=func_clones_count,
-            block_clone_groups=block_clones_count,
-            skip_dependencies=boot.args.skip_dependencies,
-            skip_dead_code=boot.args.skip_dead_code,
-        )
-        if not boot.args.skip_dead_code:
-            suppressed_dead_items = find_suppressed_unused(
-                definitions=tuple(processing.dead_candidates),
-                referenced_names=processing.referenced_names,
-                referenced_qualnames=processing.referenced_qualnames,
-            )
-        suggestions = compute_suggestions(
-            project_metrics=project_metrics,
-            units=processing.units,
-            class_metrics=processing.class_metrics,
-            func_groups=func_groups,
-            block_groups=block_groups_report,
-            segment_groups=segment_groups,
-            block_group_facts=block_group_facts,
-            structural_findings=combined_structural_findings,
-            scan_root=str(boot.root),
-        )
-        coverage_xml_path = _resolve_optional_runtime_path(
-            getattr(boot.args, "coverage_xml", None),
-            root=boot.root,
-        )
-        if coverage_xml_path is not None:
-            try:
-                coverage_join = build_coverage_join(
-                    coverage_xml=coverage_xml_path,
-                    root_path=boot.root,
-                    units=processing.units,
-                    hotspot_threshold_percent=int(
-                        getattr(boot.args, "coverage_min", 50)
-                    ),
-                )
-            except CoverageJoinParseError as exc:
-                coverage_join = CoverageJoinResult(
-                    coverage_xml=str(coverage_xml_path),
-                    status="invalid",
-                    hotspot_threshold_percent=int(
-                        getattr(boot.args, "coverage_min", 50)
-                    ),
-                    invalid_reason=str(exc),
-                )
-        metrics_payload = build_metrics_report_payload(
-            scan_root=str(boot.root),
-            project_metrics=project_metrics,
-            coverage_join=coverage_join,
-            units=processing.units,
-            class_metrics=processing.class_metrics,
-            module_deps=processing.module_deps,
-            source_stats_by_file=processing.source_stats_by_file,
-            suppressed_dead_code=suppressed_dead_items,
-        )
-
-    return AnalysisResult(
-        func_groups=func_groups,
-        block_groups=block_groups,
-        block_groups_report=block_groups_report,
-        segment_groups=segment_groups,
-        suppressed_clone_groups=tuple(suppressed_clone_groups),
-        suppressed_segment_groups=suppressed_segment_groups,
-        block_group_facts=block_group_facts,
-        func_clones_count=func_clones_count,
-        block_clones_count=block_clones_count,
-        segment_clones_count=segment_clones_count,
-        files_analyzed_or_cached=files_analyzed_or_cached,
-        project_metrics=project_metrics,
-        metrics_payload=metrics_payload,
-        suggestions=suggestions,
-        segment_groups_raw_digest=segment_groups_raw_digest,
-        coverage_join=coverage_join,
-        suppressed_dead_code_items=len(suppressed_dead_items),
-        structural_findings=combined_structural_findings,
-    )
-
-
-def _load_markdown_report_renderer() -> Callable[..., str]:
-    from .report.markdown import to_markdown_report
-
-    return to_markdown_report
-
-
-def _load_sarif_report_renderer() -> Callable[..., str]:
-    from .report.sarif import to_sarif_report
-
-    return to_sarif_report
-
-
-def report(
-    *,
-    boot: BootstrapResult,
-    discovery: DiscoveryResult,
-    processing: ProcessingResult,
-    analysis: AnalysisResult,
-    report_meta: Mapping[str, object],
-    new_func: Collection[str],
-    new_block: Collection[str],
-    html_builder: Callable[..., str] | None = None,
-    metrics_diff: object | None = None,
-    coverage_adoption_diff_available: bool = False,
-    api_surface_diff_available: bool = False,
-    include_report_document: bool = False,
-) -> ReportArtifacts:
-    contents: dict[str, str | None] = {
-        "html": None,
-        "json": None,
-        "md": None,
-        "sarif": None,
-        "text": None,
-    }
-
-    sf = analysis.structural_findings if analysis.structural_findings else None
-    report_inventory = {
-        "files": {
-            "total_found": discovery.files_found,
-            "analyzed": processing.files_analyzed,
-            "cached": discovery.cache_hits,
-            "skipped": processing.files_skipped,
-            "source_io_skipped": len(processing.source_read_failures),
-        },
-        "code": {
-            "parsed_lines": processing.analyzed_lines + discovery.cached_lines,
-            "functions": processing.analyzed_functions + discovery.cached_functions,
-            "methods": processing.analyzed_methods + discovery.cached_methods,
-            "classes": processing.analyzed_classes + discovery.cached_classes,
-        },
-        "file_list": list(discovery.all_file_paths),
-    }
-    report_document: dict[str, object] | None = None
-    needs_report_document = (
-        include_report_document
-        or boot.output_paths.html is not None
-        or any(
-            path is not None
-            for path in (
-                boot.output_paths.json,
-                boot.output_paths.md,
-                boot.output_paths.sarif,
-                boot.output_paths.text,
-            )
-        )
-    )
-
-    if needs_report_document:
-        metrics_for_report = (
-            _enrich_metrics_report_payload(
-                metrics_payload=analysis.metrics_payload,
-                metrics_diff=cast("MetricsDiff | None", metrics_diff),
-                coverage_adoption_diff_available=coverage_adoption_diff_available,
-                api_surface_diff_available=api_surface_diff_available,
-            )
-            if analysis.metrics_payload is not None
-            else None
-        )
-        report_document = build_report_document(
-            func_groups=analysis.func_groups,
-            block_groups=analysis.block_groups_report,
-            segment_groups=analysis.segment_groups,
-            suppressed_clone_groups=analysis.suppressed_clone_groups,
-            meta=report_meta,
-            inventory=report_inventory,
-            block_facts=analysis.block_group_facts,
-            new_function_group_keys=new_func,
-            new_block_group_keys=new_block,
-            new_segment_group_keys=set(analysis.segment_groups.keys()),
-            metrics=metrics_for_report,
-            suggestions=analysis.suggestions,
-            structural_findings=sf,
-        )
-
-    if boot.output_paths.html and html_builder is not None:
-        metrics_for_html = (
-            _enrich_metrics_report_payload(
-                metrics_payload=analysis.metrics_payload,
-                metrics_diff=cast("MetricsDiff | None", metrics_diff),
-                coverage_adoption_diff_available=coverage_adoption_diff_available,
-                api_surface_diff_available=api_surface_diff_available,
-            )
-            if analysis.metrics_payload is not None
-            else None
-        )
-        contents["html"] = html_builder(
-            func_groups=analysis.func_groups,
-            block_groups=analysis.block_groups_report,
-            segment_groups=analysis.segment_groups,
-            block_group_facts=analysis.block_group_facts,
-            new_function_group_keys=new_func,
-            new_block_group_keys=new_block,
-            report_meta=report_meta,
-            metrics=metrics_for_html,
-            suggestions=analysis.suggestions,
-            structural_findings=sf,
-            report_document=report_document,
-            metrics_diff=metrics_diff,
-            title="CodeClone Report",
-            context_lines=3,
-            max_snippet_lines=220,
-        )
-
-    if any(
-        path is not None
-        for path in (
-            boot.output_paths.json,
-            boot.output_paths.md,
-            boot.output_paths.sarif,
-            boot.output_paths.text,
-        )
-    ):
-        assert report_document is not None
-
-    if boot.output_paths.json and report_document is not None:
-        contents["json"] = render_json_report_document(report_document)
-
-    def _render_projection_artifact(
-        renderer: Callable[..., str],
-    ) -> str:
-        assert report_document is not None
-        return renderer(
-            report_document=report_document,
-            meta=report_meta,
-            inventory=report_inventory,
-            func_groups=analysis.func_groups,
-            block_groups=analysis.block_groups_report,
-            segment_groups=analysis.segment_groups,
-            block_facts=analysis.block_group_facts,
-            new_function_group_keys=new_func,
-            new_block_group_keys=new_block,
-            new_segment_group_keys=set(analysis.segment_groups.keys()),
-            metrics=analysis.metrics_payload,
-            suggestions=analysis.suggestions,
-            structural_findings=sf,
-        )
-
-    for key, output_path, loader in (
-        ("md", boot.output_paths.md, _load_markdown_report_renderer),
-        ("sarif", boot.output_paths.sarif, _load_sarif_report_renderer),
-    ):
-        if output_path and report_document is not None:
-            contents[key] = _render_projection_artifact(loader())
-
-    if boot.output_paths.text and report_document is not None:
-        contents["text"] = render_text_report_document(report_document)
-
-    return ReportArtifacts(
-        html=contents["html"],
-        json=contents["json"],
-        md=contents["md"],
-        sarif=contents["sarif"],
-        text=contents["text"],
-        report_document=report_document,
-    )
-
-
-def metric_gate_reasons(
-    *,
-    project_metrics: ProjectMetrics,
-    coverage_join: CoverageJoinResult | None,
-    metrics_diff: MetricsDiff | None,
-    config: MetricGateConfig,
-) -> tuple[str, ...]:
-    reasons: list[str] = []
-    _append_threshold_metric_reasons(
-        reasons=reasons,
-        project_metrics=project_metrics,
-        config=config,
-    )
-    _append_new_metric_diff_reasons(
-        reasons=reasons,
-        metrics_diff=metrics_diff,
-        config=config,
-    )
-    _append_adoption_metric_reasons(
-        reasons=reasons,
-        metrics_diff=metrics_diff,
-        project_metrics=project_metrics,
-        config=config,
-    )
-    _append_coverage_join_reasons(
-        reasons=reasons,
-        coverage_join=coverage_join,
-        config=config,
-    )
-    return tuple(reasons)
-
-
-def _append_threshold_metric_reasons(
-    *,
-    reasons: list[str],
-    project_metrics: ProjectMetrics,
-    config: MetricGateConfig,
-) -> None:
-    threshold_rows = (
-        (
-            config.fail_complexity >= 0
-            and project_metrics.complexity_max > config.fail_complexity,
-            "Complexity threshold exceeded: "
-            f"max CC={project_metrics.complexity_max}, "
-            f"threshold={config.fail_complexity}.",
-        ),
-        (
-            config.fail_coupling >= 0
-            and project_metrics.coupling_max > config.fail_coupling,
-            "Coupling threshold exceeded: "
-            f"max CBO={project_metrics.coupling_max}, "
-            f"threshold={config.fail_coupling}.",
-        ),
-        (
-            config.fail_cohesion >= 0
-            and project_metrics.cohesion_max > config.fail_cohesion,
-            "Cohesion threshold exceeded: "
-            f"max LCOM4={project_metrics.cohesion_max}, "
-            f"threshold={config.fail_cohesion}.",
-        ),
-        (
-            config.fail_health >= 0
-            and project_metrics.health.total < config.fail_health,
-            "Health score below threshold: "
-            f"score={project_metrics.health.total}, threshold={config.fail_health}.",
-        ),
-    )
-    reasons.extend(message for triggered, message in threshold_rows if triggered)
-    if config.fail_cycles and project_metrics.dependency_cycles:
-        reasons.append(
-            "Dependency cycles detected: "
-            f"{len(project_metrics.dependency_cycles)} cycle(s)."
-        )
-    high_conf_dead = _high_confidence_dead_code_count(project_metrics.dead_code)
-    if config.fail_dead_code and high_conf_dead > 0:
-        reasons.append(
-            f"Dead code detected (high confidence): {high_conf_dead} item(s)."
-        )
-
-
-def _append_new_metric_diff_reasons(
-    *,
-    reasons: list[str],
-    metrics_diff: MetricsDiff | None,
-    config: MetricGateConfig,
-) -> None:
-    if not config.fail_on_new_metrics or metrics_diff is None:
-        return
-    if metrics_diff.new_high_risk_functions:
-        reasons.append(
-            "New high-risk functions vs metrics baseline: "
-            f"{len(metrics_diff.new_high_risk_functions)}."
-        )
-    if metrics_diff.new_high_coupling_classes:
-        reasons.append(
-            "New high-coupling classes vs metrics baseline: "
-            f"{len(metrics_diff.new_high_coupling_classes)}."
-        )
-    if metrics_diff.new_cycles:
-        reasons.append(
-            "New dependency cycles vs metrics baseline: "
-            f"{len(metrics_diff.new_cycles)}."
-        )
-    if metrics_diff.new_dead_code:
-        reasons.append(
-            "New dead code items vs metrics baseline: "
-            f"{len(metrics_diff.new_dead_code)}."
-        )
-    if metrics_diff.health_delta < 0:
-        reasons.append(
-            "Health score regressed vs metrics baseline: "
-            f"delta={metrics_diff.health_delta}."
-        )
-
-
-def _append_metric_gate_reason(
-    *,
-    reasons: list[str],
-    enabled: bool,
-    triggered: bool,
-    message: str,
-) -> None:
-    if enabled and triggered:
-        reasons.append(message)
-
-
-def _append_adoption_metric_reasons(
-    *,
-    reasons: list[str],
-    metrics_diff: MetricsDiff | None,
-    project_metrics: ProjectMetrics,
-    config: MetricGateConfig,
-) -> None:
-    typing_percent = (
-        _permille(
-            project_metrics.typing_param_annotated,
-            project_metrics.typing_param_total,
-        )
-        / 10.0
-    )
-    docstring_percent = (
-        _permille(
-            project_metrics.docstring_public_documented,
-            project_metrics.docstring_public_total,
-        )
-        / 10.0
-    )
-    if config.min_typing_coverage >= 0 and typing_percent < float(
-        config.min_typing_coverage
-    ):
-        reasons.append(
-            "Typing coverage below threshold: "
-            f"coverage={typing_percent:.1f}%, threshold={config.min_typing_coverage}%."
-        )
-    if config.min_docstring_coverage >= 0 and docstring_percent < float(
-        config.min_docstring_coverage
-    ):
-        reasons.append(
-            "Docstring coverage below threshold: "
-            "coverage="
-            f"{docstring_percent:.1f}%, "
-            f"threshold={config.min_docstring_coverage}%."
-        )
-    if metrics_diff is None:
-        return
-    if config.fail_on_typing_regression:
-        typing_delta = int(getattr(metrics_diff, "typing_param_permille_delta", 0))
-        return_delta = int(getattr(metrics_diff, "typing_return_permille_delta", 0))
-        if typing_delta < 0 or return_delta < 0:
-            reasons.append(
-                "Typing coverage regressed vs metrics baseline: "
-                f"params_delta={typing_delta}, returns_delta={return_delta}."
-            )
-    docstring_delta = int(getattr(metrics_diff, "docstring_permille_delta", 0))
-    _append_metric_gate_reason(
-        reasons=reasons,
-        enabled=config.fail_on_docstring_regression,
-        triggered=docstring_delta < 0,
-        message=(
-            "Docstring coverage regressed vs metrics baseline: "
-            f"delta={docstring_delta}."
-        ),
-    )
-    api_breaking = tuple(
-        cast(
-            "Sequence[object]",
-            getattr(metrics_diff, "new_api_breaking_changes", ()),
-        )
-    )
-    _append_metric_gate_reason(
-        reasons=reasons,
-        enabled=config.fail_on_api_break,
-        triggered=bool(api_breaking),
-        message=(
-            f"Public API breaking changes vs metrics baseline: {len(api_breaking)}."
-        ),
-    )
-
-
-def _append_coverage_join_reasons(
-    *,
-    reasons: list[str],
-    coverage_join: CoverageJoinResult | None,
-    config: MetricGateConfig,
-) -> None:
-    if not config.fail_on_untested_hotspots or coverage_join is None:
-        return
-    if coverage_join.status != "ok":
-        return
-    if coverage_join.coverage_hotspots > 0:
-        reasons.append(
-            "Coverage hotspots detected: "
-            f"hotspots={coverage_join.coverage_hotspots}, "
-            f"threshold={config.coverage_min}%."
-        )
-
-
-def _high_confidence_dead_code_count(items: Sequence[DeadItem]) -> int:
-    return sum(1 for item in items if item.confidence == "high")
-
-
-def gate(
-    *,
-    boot: BootstrapResult,
-    analysis: AnalysisResult,
-    new_func: Collection[str],
-    new_block: Collection[str],
-    metrics_diff: MetricsDiff | None,
-) -> GatingResult:
-    reasons: list[str] = []
-
-    if analysis.project_metrics is not None:
-        metric_reasons = metric_gate_reasons(
-            project_metrics=analysis.project_metrics,
-            coverage_join=analysis.coverage_join,
-            metrics_diff=metrics_diff,
-            config=MetricGateConfig(
-                fail_complexity=boot.args.fail_complexity,
-                fail_coupling=boot.args.fail_coupling,
-                fail_cohesion=boot.args.fail_cohesion,
-                fail_cycles=boot.args.fail_cycles,
-                fail_dead_code=boot.args.fail_dead_code,
-                fail_health=boot.args.fail_health,
-                fail_on_new_metrics=boot.args.fail_on_new_metrics,
-                fail_on_typing_regression=bool(
-                    getattr(boot.args, "fail_on_typing_regression", False)
-                ),
-                fail_on_docstring_regression=bool(
-                    getattr(boot.args, "fail_on_docstring_regression", False)
-                ),
-                fail_on_api_break=bool(getattr(boot.args, "fail_on_api_break", False)),
-                fail_on_untested_hotspots=bool(
-                    getattr(boot.args, "fail_on_untested_hotspots", False)
-                ),
-                min_typing_coverage=int(getattr(boot.args, "min_typing_coverage", -1)),
-                min_docstring_coverage=int(
-                    getattr(boot.args, "min_docstring_coverage", -1)
-                ),
-                coverage_min=int(getattr(boot.args, "coverage_min", 50)),
-            ),
-        )
-        reasons.extend(f"metric:{reason}" for reason in metric_reasons)
-
-    if boot.args.fail_on_new and (new_func or new_block):
-        reasons.append("clone:new")
-
-    total_clone_groups = analysis.func_clones_count + analysis.block_clones_count
-    if 0 <= boot.args.fail_threshold < total_clone_groups:
-        reasons.append(
-            f"clone:threshold:{total_clone_groups}:{boot.args.fail_threshold}"
-        )
-
-    if reasons:
-        return GatingResult(
-            exit_code=int(ExitCode.GATING_FAILURE),
-            reasons=tuple(reasons),
-        )
-
-    return GatingResult(exit_code=int(ExitCode.SUCCESS), reasons=())
diff --git a/codeclone/qualnames.py b/codeclone/qualnames/__init__.py
similarity index 100%
rename from codeclone/qualnames.py
rename to codeclone/qualnames/__init__.py
diff --git a/codeclone/report/__init__.py b/codeclone/report/__init__.py
index e5869f7..9135843 100644
--- a/codeclone/report/__init__.py
+++ b/codeclone/report/__init__.py
@@ -3,41 +3,3 @@
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
 # SPDX-License-Identifier: MPL-2.0
 # Copyright (c) 2026 Den Rozhnovskiy
-
-from __future__ import annotations
-
-from ..grouping import build_block_groups, build_groups, build_segment_groups
-from .blocks import prepare_block_report_groups
-from .explain import build_block_group_facts
-from .markdown import render_markdown_report_document, to_markdown_report
-from .sarif import render_sarif_report_document, to_sarif_report
-from .segments import (
-    SEGMENT_MIN_UNIQUE_STMT_TYPES,
-    prepare_segment_report_groups,
-)
-from .serialize import (
-    render_json_report_document,
-    render_text_report_document,
-)
-from .suggestions import classify_clone_type, generate_suggestions
-from .types import GroupItem, GroupMap
-
-__all__ = [
-    "SEGMENT_MIN_UNIQUE_STMT_TYPES",
-    "GroupItem",
-    "GroupMap",
-    "build_block_group_facts",
-    "build_block_groups",
-    "build_groups",
-    "build_segment_groups",
-    "classify_clone_type",
-    "generate_suggestions",
-    "prepare_block_report_groups",
-    "prepare_segment_report_groups",
-    "render_json_report_document",
-    "render_markdown_report_document",
-    "render_sarif_report_document",
-    "render_text_report_document",
-    "to_markdown_report",
-    "to_sarif_report",
-]
diff --git a/codeclone/report/derived.py b/codeclone/report/derived.py
index 6873a08..7b07e30 100644
--- a/codeclone/report/derived.py
+++ b/codeclone/report/derived.py
@@ -7,9 +7,8 @@ from __future__ import annotations
 
 from collections import Counter
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
 
-from .._coerce import as_int as _as_int
 from ..domain.source_scope import (
     IMPACT_SCOPE_MIXED,
     IMPACT_SCOPE_NON_RUNTIME,
@@ -31,6 +30,7 @@ from ..paths import (
     relative_repo_path as _relative_repo_path,
 )
+from ..utils.coerce import as_int as _as_int
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Mapping, Sequence
@@ -125,7 +125,7 @@ def normalized_source_kind(value: object) -> SourceKind:
 def source_scope_from_counts(
     counts: Mapping[SourceKind, int] | Mapping[str, int],
 ) -> dict[str, object]:
-    normalized_counts = cast("Mapping[str, int]", counts)
+    normalized_counts = {str(key): int(value) for key, value in counts.items()}
 
     def _count(kind: str) -> int:
         value = normalized_counts.get(kind, 0)
diff --git a/codeclone/report/document/__init__.py b/codeclone/report/document/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/document/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/report/document/_common.py b/codeclone/report/document/_common.py
new file mode 100644
index 0000000..832c664
--- /dev/null
+++ b/codeclone/report/document/_common.py
@@ -0,0 +1,414 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections import Counter +from collections.abc import Collection, Iterable, Mapping, Sequence +from typing import TYPE_CHECKING + +from ...contracts import ( + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, +) +from ...domain.findings import ( + CATEGORY_COHESION, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CLONE_NOVELTY_KNOWN, + CLONE_NOVELTY_NEW, + FAMILY_DEAD_CODE, +) +from ...domain.quality import ( + EFFORT_WEIGHT, + SEVERITY_RANK, +) +from ...findings.structural.detectors import normalize_structural_findings +from ...utils.coerce import as_int as _as_int +from ...utils.coerce import as_mapping as _as_mapping +from ...utils.coerce import as_sequence as _as_sequence +from ..derived import ( + normalized_source_kind as _normalized_source_kind, +) +from ..derived import ( + relative_report_path, + report_location_from_group_item, +) +from ..derived import ( + source_scope_from_counts as _report_source_scope_from_counts, +) +from ..derived import ( + source_scope_from_locations as _report_source_scope_from_locations, +) + +if TYPE_CHECKING: + from ...models import ( + GroupMapLike, + SourceKind, + StructuralFindingGroup, + SuppressedCloneGroup, + ) + +_OVERLOADED_MODULES_FAMILY = "overloaded_modules" +_COVERAGE_ADOPTION_FAMILY = "coverage_adoption" +_API_SURFACE_FAMILY = "api_surface" +_COVERAGE_JOIN_FAMILY = "coverage_join" +_SECURITY_SURFACES_FAMILY = "security_surfaces" + + +def _optional_str(value: object) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None + + +def _coerced_nonnegative_threshold(value: object, *, default: int) -> int: + threshold = _as_int(value, default) + return threshold if threshold >= 0 else default + + +def _design_findings_thresholds_payload( + raw_meta: Mapping[str, object] | None, +) -> dict[str, object]: + meta = dict(raw_meta or {}) + return { + "design_findings": { + CATEGORY_COMPLEXITY: { + "metric": "cyclomatic_complexity", + "operator": ">", + "value": _coerced_nonnegative_threshold( + meta.get("design_complexity_threshold"), + default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ), + }, + CATEGORY_COUPLING: { + "metric": "cbo", + "operator": ">", + "value": _coerced_nonnegative_threshold( + meta.get("design_coupling_threshold"), + default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + }, + CATEGORY_COHESION: { + "metric": "lcom4", + "operator": ">=", + "value": _coerced_nonnegative_threshold( + meta.get("design_cohesion_threshold"), + default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + }, + } + } + + +def _analysis_profile_payload( + raw_meta: Mapping[str, object] | None, +) -> dict[str, int] | None: + meta = dict(raw_meta or {}) + nested = _as_mapping(meta.get("analysis_profile")) + if nested: + meta = dict(nested) + keys = ( + "min_loc", + "min_stmt", + "block_min_loc", + "block_min_stmt", + "segment_min_loc", + "segment_min_stmt", + ) + if any(key not in meta for key in keys): + return None + payload = {key: _as_int(meta.get(key), -1) for key in keys} + if any(value < 0 for value in payload.values()): + return None + return payload + + +def _normalize_path(value: str) -> str: + return value.replace("\\", "/").strip() + + +def _is_absolute_path(value: str) -> bool: + normalized = _normalize_path(value) + if not normalized: + return False + if normalized.startswith("/"): + return True + 
return len(normalized) > 2 and normalized[1] == ":" and normalized[2] == "/" + + +def _contract_path( + value: object, + *, + scan_root: str, +) -> tuple[str | None, str | None, str | None]: + path_text = _optional_str(value) + if path_text is None: + return None, None, None + normalized_path = _normalize_path(path_text) + relative_path = relative_report_path(normalized_path, scan_root=scan_root) + if relative_path and relative_path != normalized_path: + return relative_path, "in_root", normalized_path + if _is_absolute_path(normalized_path): + return normalized_path.rsplit("/", maxsplit=1)[-1], "external", normalized_path + return normalized_path, "relative", None + + +def _contract_report_location_path(location_path: str, *, scan_root: str) -> str: + contract_path, _scope, _absolute = _contract_path( + location_path, + scan_root=scan_root, + ) + return contract_path or "" + + +def _priority( + severity: str, + effort: str, +) -> float: + severity_rank = SEVERITY_RANK.get(severity, 1) + effort_rank = EFFORT_WEIGHT.get(effort, 1) + return float(severity_rank) / float(effort_rank) + + +def _clone_novelty( + *, + group_key: str, + baseline_trusted: bool, + new_keys: Collection[str] | None, +) -> str: + if not baseline_trusted: + return CLONE_NOVELTY_NEW + if new_keys is None: + return CLONE_NOVELTY_NEW + return CLONE_NOVELTY_NEW if group_key in new_keys else CLONE_NOVELTY_KNOWN + + +def _item_sort_key(item: Mapping[str, object]) -> tuple[str, int, int, str]: + return ( + str(item.get("relative_path", "")), + _as_int(item.get("start_line")), + _as_int(item.get("end_line")), + str(item.get("qualname", "")), + ) + + +def _parse_bool_text(value: object) -> bool: + text = str(value).strip().lower() + return text in {"1", "true", "yes"} + + +def _parse_ratio_percent(value: object) -> float | None: + text = str(value).strip() + if not text: + return None + if text.endswith("%"): + try: + return float(text[:-1]) / 100.0 + except ValueError: + return None + try: + numeric = float(text) + except ValueError: + return None + return numeric if numeric <= 1.0 else numeric / 100.0 + + +def _normalize_block_machine_facts( + *, + group_key: str, + group_arity: int, + block_facts: Mapping[str, str], +) -> tuple[dict[str, object], dict[str, str]]: + facts: dict[str, object] = { + "group_key": group_key, + "group_arity": group_arity, + } + display_facts: dict[str, str] = {} + for key in sorted(block_facts): + value = str(block_facts[key]) + match key: + case "group_arity": + facts[key] = _as_int(value) + case "block_size" | "consecutive_asserts" | "instance_peer_count": + facts[key] = _as_int(value) + case "merged_regions": + facts[key] = _parse_bool_text(value) + case "assert_ratio": + ratio = _parse_ratio_percent(value) + if ratio is not None: + facts[key] = ratio + display_facts[key] = value + case ( + "match_rule" | "pattern" | "signature_kind" | "hint" | "hint_confidence" + ): + facts[key] = value + case _: + display_facts[key] = value + return facts, display_facts + + +def _source_scope_from_filepaths( + filepaths: Iterable[str], + *, + scan_root: str, +) -> dict[str, object]: + counts: Counter[SourceKind] = Counter() + for filepath in filepaths: + location = report_location_from_group_item( + {"filepath": filepath, "start_line": 0, "end_line": 0, "qualname": ""}, + scan_root=scan_root, + ) + counts[location.source_kind] += 1 + return _source_scope_from_counts(counts) + + +def _source_scope_from_counts( + counts: Mapping[SourceKind, int], +) -> dict[str, object]: + return 
_report_source_scope_from_counts(counts) + + +def _source_scope_from_locations( +
locations: Sequence[Mapping[str, object]], +
) -> dict[str, object]: +
normalized_locations = [ +
{"source_kind": _normalized_source_kind(location.get("source_kind"))} +
for location in locations +
] +
return _report_source_scope_from_locations(normalized_locations) + + +
def _collect_paths_from_metrics(metrics: Mapping[str, object]) -> set[str]: +
paths: set[str] = set() + +
def _add_filepaths(items: object) -> None: +
# Shared accumulator: every metrics family stores filepath-bearing rows. +
for item in _as_sequence(items): +
filepath = _optional_str(_as_mapping(item).get("filepath")) +
if filepath is not None: +
paths.add(filepath) + +
_add_filepaths(_as_mapping(metrics.get(CATEGORY_COMPLEXITY)).get("functions")) +
for family_name in (CATEGORY_COUPLING, CATEGORY_COHESION): +
_add_filepaths(_as_mapping(metrics.get(family_name)).get("classes")) +
dead_code = _as_mapping(metrics.get(FAMILY_DEAD_CODE)) +
_add_filepaths(dead_code.get("items")) +
_add_filepaths(dead_code.get("suppressed_items")) +
for family_name in ( +
_OVERLOADED_MODULES_FAMILY, +
_COVERAGE_ADOPTION_FAMILY, +
_API_SURFACE_FAMILY, +
_COVERAGE_JOIN_FAMILY, +
_SECURITY_SURFACES_FAMILY, +
): +
_add_filepaths(_as_mapping(metrics.get(family_name)).get("items")) +
return paths + + +
def _collect_report_file_list( +
*, +
inventory: Mapping[str, object] | None, +
func_groups: GroupMapLike, +
block_groups: GroupMapLike, +
segment_groups: GroupMapLike, +
suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None = None, +
metrics: Mapping[str, object] | None, +
structural_findings: Sequence[StructuralFindingGroup] | None, +
) -> list[str]: +
files: set[str] = set() +
inventory_map = _as_mapping(inventory) +
for filepath in _as_sequence(inventory_map.get("file_list")): +
file_text = _optional_str(filepath) +
if file_text is not None: +
files.add(file_text) +
for groups in (func_groups, block_groups, segment_groups): +
for items in groups.values(): +
for item in items: +
filepath = _optional_str(item.get("filepath")) +
if filepath is not None: +
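# A file can appear in several clone groups; the set keeps the registry deduplicated. +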
files.add(filepath) + for suppressed_group in suppressed_clone_groups or (): + for item in suppressed_group.items: + filepath = _optional_str(item.get("filepath")) + if filepath is not None: + files.add(filepath) + if metrics is not None: + files.update(_collect_paths_from_metrics(metrics)) + if structural_findings: + for structural_group in normalize_structural_findings(structural_findings): + for occurrence in structural_group.items: + filepath = _optional_str(occurrence.file_path) + if filepath is not None: + files.add(filepath) + return sorted(files) + + +def _count_file_lines(filepaths: Sequence[str]) -> int: + total = 0 + for filepath in filepaths: + total += _count_file_lines_for_path(filepath) + return total + + +def _count_file_lines_for_path(filepath: str) -> int: + try: + with open(filepath, encoding="utf-8", errors="surrogateescape") as handle: + return sum(1 for _ in handle) + except OSError: + return 0 + + +def _normalize_nested_string_rows(value: object) -> list[list[str]]: + rows: list[tuple[str, ...]] = [] + for row in _as_sequence(value): + modules = tuple( + str(module) for module in _as_sequence(row) if str(module).strip() + ) + if modules: + rows.append(modules) + rows.sort(key=lambda row: (len(row), row)) + return [list(row) for row in rows] + + +__all__ = [ + "_collect_report_file_list", + "normalize_structural_findings", +] diff --git a/codeclone/report/document/_design_groups.py b/codeclone/report/document/_design_groups.py new file mode 100644 index 0000000..01f0a77 --- /dev/null +++ b/codeclone/report/document/_design_groups.py @@ -0,0 +1,389 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Mapping + +from ...contracts import ( + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, +) +from ...domain.findings import ( + CATEGORY_COHESION, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_COVERAGE, + CATEGORY_DEPENDENCY, + FAMILY_DESIGN, + FINDING_KIND_COVERAGE_HOTSPOT, + FINDING_KIND_COVERAGE_SCOPE_GAP, +) +from ...domain.quality import ( + CONFIDENCE_HIGH, + EFFORT_HARD, + EFFORT_MODERATE, + RISK_LOW, + SEVERITY_CRITICAL, + SEVERITY_WARNING, +) +from ...findings.ids import design_group_id +from ...utils.coerce import as_float as _as_float +from ...utils.coerce import as_int as _as_int +from ...utils.coerce import as_mapping as _as_mapping +from ...utils.coerce import as_sequence as _as_sequence +from ..derived import ( + report_location_from_group_item, +) +from ._common import ( + _COVERAGE_JOIN_FAMILY, + _coerced_nonnegative_threshold, + _contract_report_location_path, + _priority, + _source_scope_from_filepaths, +) +from ._findings_groups import _single_location_source_scope + + +def _design_singleton_group( + *, + category: str, + kind: str, + severity: str, + qualname: str, + filepath: str, + start_line: int, + end_line: int, + scan_root: str, + item_data: Mapping[str, object], + facts: Mapping[str, object], +) -> dict[str, object]: + return { + "id": design_group_id(category, qualname), + "family": FAMILY_DESIGN, + "category": category, + "kind": kind, + "severity": severity, + "confidence": CONFIDENCE_HIGH, + "priority": _priority(severity, EFFORT_MODERATE), + "count": 1, + "source_scope": _single_location_source_scope( + filepath, + scan_root=scan_root, + ), + "spread": {"files": 1, "functions": 1}, + "items": [ + { + "relative_path": _contract_report_location_path( + filepath, + scan_root=scan_root, + ), + "qualname": qualname, + "start_line": start_line, + "end_line": end_line, + **item_data, + } + ], + "facts": dict(facts), + } + + +def _complexity_design_group( + item_map: Mapping[str, object], + *, + threshold: int, + scan_root: str, +) -> dict[str, object] | None: + cc = _as_int(item_map.get("cyclomatic_complexity"), 1) + if cc <= threshold: + return None + qualname = str(item_map.get("qualname", "")) + filepath = str(item_map.get("relative_path", "")) + nesting_depth = _as_int(item_map.get("nesting_depth")) + severity = SEVERITY_CRITICAL if cc > 40 else SEVERITY_WARNING + return _design_singleton_group( + category=CATEGORY_COMPLEXITY, + kind="function_hotspot", + severity=severity, + qualname=qualname, + filepath=filepath, + start_line=_as_int(item_map.get("start_line")), + end_line=_as_int(item_map.get("end_line")), + scan_root=scan_root, + item_data={ + "cyclomatic_complexity": cc, + "nesting_depth": nesting_depth, + "risk": str(item_map.get("risk", RISK_LOW)), + }, + facts={ + "cyclomatic_complexity": cc, + "nesting_depth": nesting_depth, + }, + ) + + +def _coupling_design_group( + item_map: Mapping[str, object], + *, + threshold: int, + scan_root: str, +) -> dict[str, object] | None: + cbo = _as_int(item_map.get("cbo")) + if cbo <= threshold: + return None + qualname = str(item_map.get("qualname", "")) + filepath = str(item_map.get("relative_path", "")) + coupled_classes = list(_as_sequence(item_map.get("coupled_classes"))) + return _design_singleton_group( + category=CATEGORY_COUPLING, + kind="class_hotspot", + 
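# Unlike complexity hotspots, coupling findings never escalate past WARNING. +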
severity=SEVERITY_WARNING, + qualname=qualname, + filepath=filepath, + start_line=_as_int(item_map.get("start_line")), + end_line=_as_int(item_map.get("end_line")), + scan_root=scan_root, + item_data={ + "cbo": cbo, + "risk": str(item_map.get("risk", RISK_LOW)), + "coupled_classes": coupled_classes, + }, + facts={ + "cbo": cbo, + "coupled_classes": coupled_classes, + }, + ) + + +def _cohesion_design_group( + item_map: Mapping[str, object], + *, + threshold: int, + scan_root: str, +) -> dict[str, object] | None: + lcom4 = _as_int(item_map.get("lcom4")) + if lcom4 < threshold: + return None + qualname = str(item_map.get("qualname", "")) + filepath = str(item_map.get("relative_path", "")) + method_count = _as_int(item_map.get("method_count")) + instance_var_count = _as_int(item_map.get("instance_var_count")) + return _design_singleton_group( + category=CATEGORY_COHESION, + kind="class_hotspot", + severity=SEVERITY_WARNING, + qualname=qualname, + filepath=filepath, + start_line=_as_int(item_map.get("start_line")), + end_line=_as_int(item_map.get("end_line")), + scan_root=scan_root, + item_data={ + "lcom4": lcom4, + "risk": str(item_map.get("risk", RISK_LOW)), + "method_count": method_count, + "instance_var_count": instance_var_count, + }, + facts={ + "lcom4": lcom4, + "method_count": method_count, + "instance_var_count": instance_var_count, + }, + ) + + +def _dependency_design_group( + cycle: object, + *, + scan_root: str, +) -> dict[str, object] | None: + modules = [str(module) for module in _as_sequence(cycle) if str(module).strip()] + if not modules: + return None + cycle_key = " -> ".join(modules) + return { + "id": design_group_id(CATEGORY_DEPENDENCY, cycle_key), + "family": FAMILY_DESIGN, + "category": CATEGORY_DEPENDENCY, + "kind": "cycle", + "severity": SEVERITY_CRITICAL, + "confidence": CONFIDENCE_HIGH, + "priority": _priority(SEVERITY_CRITICAL, EFFORT_HARD), + "count": len(modules), + "source_scope": _source_scope_from_filepaths( + (module.replace(".", "/") + ".py" for module in modules), + scan_root=scan_root, + ), + "spread": {"files": len(modules), "functions": 0}, + "items": [ + { + "module": module, + "relative_path": module.replace(".", "/") + ".py", + "source_kind": report_location_from_group_item( + { + "filepath": module.replace(".", "/") + ".py", + "qualname": "", + "start_line": 0, + "end_line": 0, + } + ).source_kind, + } + for module in modules + ], + "facts": { + "cycle_length": len(modules), + }, + } + + +def _coverage_design_group( + item_map: Mapping[str, object], + *, + threshold_percent: int, + scan_root: str, +) -> dict[str, object] | None: + coverage_hotspot = bool(item_map.get("coverage_hotspot")) + scope_gap_hotspot = bool(item_map.get("scope_gap_hotspot")) + if not coverage_hotspot and not scope_gap_hotspot: + return None + qualname = str(item_map.get("qualname", "")).strip() + filepath = str(item_map.get("relative_path", "")).strip() + if not filepath: + return None + start_line = _as_int(item_map.get("start_line")) + end_line = _as_int(item_map.get("end_line")) + subject_key = qualname or f"{filepath}:{start_line}:{end_line}" + risk = str(item_map.get("risk", RISK_LOW)).strip() or RISK_LOW + coverage_status = str(item_map.get("coverage_status", "")).strip() + coverage_permille = _as_int(item_map.get("coverage_permille")) + covered_lines = _as_int(item_map.get("covered_lines")) + executable_lines = _as_int(item_map.get("executable_lines")) + complexity = _as_int(item_map.get("cyclomatic_complexity"), 1) + severity = SEVERITY_CRITICAL if risk == "high" else 
SEVERITY_WARNING + if scope_gap_hotspot: + kind = FINDING_KIND_COVERAGE_SCOPE_GAP + detail = "The supplied coverage.xml did not map to this function's file." + else: + kind = FINDING_KIND_COVERAGE_HOTSPOT + detail = "Joined line coverage is below the configured hotspot threshold." + return { + "id": design_group_id(CATEGORY_COVERAGE, subject_key), + "family": FAMILY_DESIGN, + "category": CATEGORY_COVERAGE, + "kind": kind, + "severity": severity, + "confidence": CONFIDENCE_HIGH, + "priority": _priority(severity, EFFORT_MODERATE), + "count": 1, + "source_scope": _single_location_source_scope( + filepath, + scan_root=scan_root, + ), + "spread": {"files": 1, "functions": 1}, + "items": [ + { + "relative_path": filepath, + "qualname": qualname, + "start_line": start_line, + "end_line": end_line, + "risk": risk, + "cyclomatic_complexity": complexity, + "coverage_permille": coverage_permille, + "coverage_status": coverage_status, + "covered_lines": covered_lines, + "executable_lines": executable_lines, + "coverage_hotspot": coverage_hotspot, + "scope_gap_hotspot": scope_gap_hotspot, + } + ], + "facts": { + "coverage_permille": coverage_permille, + "hotspot_threshold_percent": threshold_percent, + "coverage_status": coverage_status, + "covered_lines": covered_lines, + "executable_lines": executable_lines, + "cyclomatic_complexity": complexity, + "coverage_hotspot": coverage_hotspot, + "scope_gap_hotspot": scope_gap_hotspot, + "detail": detail, + }, + } + + +def _build_design_groups( + metrics_payload: Mapping[str, object], + *, + design_thresholds: Mapping[str, object] | None = None, + scan_root: str, +) -> list[dict[str, object]]: + families = _as_mapping(metrics_payload.get("families")) + thresholds = _as_mapping(design_thresholds) + complexity_threshold = _coerced_nonnegative_threshold( + _as_mapping(thresholds.get(CATEGORY_COMPLEXITY)).get("value"), + default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ) + coupling_threshold = _coerced_nonnegative_threshold( + _as_mapping(thresholds.get(CATEGORY_COUPLING)).get("value"), + default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ) + cohesion_threshold = _coerced_nonnegative_threshold( + _as_mapping(thresholds.get(CATEGORY_COHESION)).get("value"), + default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ) + coverage_join = _as_mapping(families.get(_COVERAGE_JOIN_FAMILY)) + coverage_threshold = _as_int( + _as_mapping(coverage_join.get("summary")).get("hotspot_threshold_percent"), + 50, + ) + groups: list[dict[str, object]] = [] + + complexity = _as_mapping(families.get(CATEGORY_COMPLEXITY)) + for item in _as_sequence(complexity.get("items")): + group = _complexity_design_group( + _as_mapping(item), + threshold=complexity_threshold, + scan_root=scan_root, + ) + if group is not None: + groups.append(group) + + coupling = _as_mapping(families.get(CATEGORY_COUPLING)) + for item in _as_sequence(coupling.get("items")): + group = _coupling_design_group( + _as_mapping(item), + threshold=coupling_threshold, + scan_root=scan_root, + ) + if group is not None: + groups.append(group) + + cohesion = _as_mapping(families.get(CATEGORY_COHESION)) + for item in _as_sequence(cohesion.get("items")): + group = _cohesion_design_group( + _as_mapping(item), + threshold=cohesion_threshold, + scan_root=scan_root, + ) + if group is not None: + groups.append(group) + + dependencies = _as_mapping(families.get("dependencies")) + for cycle in _as_sequence(dependencies.get("cycles")): + group = _dependency_design_group(cycle, scan_root=scan_root) + if group is not None: + 
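# Import cycles always rank as critical design findings; coverage hotspots are appended next. +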
groups.append(group) + + for item in _as_sequence(coverage_join.get("items")): + group = _coverage_design_group( + _as_mapping(item), + threshold_percent=coverage_threshold, + scan_root=scan_root, + ) + if group is not None: + groups.append(group) + + groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"]))) + return groups diff --git a/codeclone/report/document/_findings_groups.py b/codeclone/report/document/_findings_groups.py new file mode 100644 index 0000000..8653708 --- /dev/null +++ b/codeclone/report/document/_findings_groups.py @@ -0,0 +1,606 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Collection, Mapping, Sequence +from typing import TYPE_CHECKING, Literal + +from ...domain.findings import ( + CLONE_KIND_BLOCK, + CLONE_KIND_FUNCTION, + CLONE_KIND_SEGMENT, + FAMILY_CLONE, + FAMILY_DEAD_CODE, + FAMILY_STRUCTURAL, +) +from ...domain.quality import ( + CONFIDENCE_HIGH, + CONFIDENCE_MEDIUM, + EFFORT_EASY, + RISK_LOW, + SEVERITY_CRITICAL, + SEVERITY_INFO, + SEVERITY_WARNING, +) +from ...findings.structural.detectors import normalize_structural_findings +from ...utils.coerce import as_float as _as_float +from ...utils.coerce import as_int as _as_int +from ...utils.coerce import as_mapping as _as_mapping +from ...utils.coerce import as_sequence as _as_sequence +from ..derived import ( + group_spread, + report_location_from_group_item, + report_location_from_structural_occurrence, +) +from ..suggestions import classify_clone_type + +if TYPE_CHECKING: + from ...models import ( + GroupItemLike, + GroupMapLike, + StructuralFindingGroup, + SuppressedCloneGroup, + ) + +from ...findings.ids import clone_group_id, dead_code_group_id, structural_group_id +from ._common import ( + _clone_novelty, + _contract_report_location_path, + _item_sort_key, + _normalize_block_machine_facts, + _priority, + _source_scope_from_locations, +) + + +def _clone_group_assessment( + *, + count: int, + clone_type: str, +) -> tuple[str, float]: + match (count >= 4, clone_type in {"Type-1", "Type-2"}): + case (True, _): + severity = SEVERITY_CRITICAL + case (False, True): + severity = SEVERITY_WARNING + case _: + severity = SEVERITY_INFO + effort = "easy" if clone_type in {"Type-1", "Type-2"} else "moderate" + return severity, _priority(severity, effort) + + +def _build_clone_group_facts( + *, + group_key: str, + kind: Literal["function", "block", "segment"], + items: Sequence[GroupItemLike], + block_facts: Mapping[str, Mapping[str, str]], +) -> tuple[dict[str, object], dict[str, str]]: + base: dict[str, object] = { + "group_key": group_key, + "group_arity": len(items), + } + display_facts: dict[str, str] = {} + match kind: + case "function": + loc_buckets = sorted( + { + str(item.get("loc_bucket", "")) + for item in items + if str(item.get("loc_bucket", "")).strip() + } + ) + base["loc_buckets"] = loc_buckets + case "block" if group_key in block_facts: + typed_facts, block_display_facts = _normalize_block_machine_facts( + group_key=group_key, + group_arity=len(items), + block_facts=block_facts[group_key], + ) + base.update(typed_facts) + display_facts.update(block_display_facts) + case _: + pass + return base, display_facts + + +def _clone_item_payload( + item: GroupItemLike, + *, + kind: 
Literal["function", "block", "segment"], + scan_root: str, +) -> dict[str, object]: + payload: dict[str, object] = { + "relative_path": _contract_report_location_path( + str(item.get("filepath", "")), + scan_root=scan_root, + ), + "qualname": str(item.get("qualname", "")), + "start_line": _as_int(item.get("start_line", 0)), + "end_line": _as_int(item.get("end_line", 0)), + } + match kind: + case "function": + payload.update( + { + "loc": _as_int(item.get("loc", 0)), + "stmt_count": _as_int(item.get("stmt_count", 0)), + "fingerprint": str(item.get("fingerprint", "")), + "loc_bucket": str(item.get("loc_bucket", "")), + "cyclomatic_complexity": _as_int( + item.get("cyclomatic_complexity", 1) + ), + "nesting_depth": _as_int(item.get("nesting_depth", 0)), + "risk": str(item.get("risk", RISK_LOW)), + "raw_hash": str(item.get("raw_hash", "")), + } + ) + case "block": + payload["size"] = _as_int(item.get("size", 0)) + case _: + payload.update( + { + "size": _as_int(item.get("size", 0)), + "segment_hash": str(item.get("segment_hash", "")), + "segment_sig": str(item.get("segment_sig", "")), + } + ) + return payload + + +def _build_clone_groups( + *, + groups: GroupMapLike, + kind: Literal["function", "block", "segment"], + baseline_trusted: bool, + new_keys: Collection[str] | None, + block_facts: Mapping[str, Mapping[str, str]], + scan_root: str, +) -> list[dict[str, object]]: + encoded_groups: list[dict[str, object]] = [] + new_key_set = set(new_keys) if new_keys is not None else None + for group_key in sorted(groups): + items = groups[group_key] + clone_type = classify_clone_type(items=items, kind=kind) + severity, priority = _clone_group_assessment( + count=len(items), + clone_type=clone_type, + ) + novelty = _clone_novelty( + group_key=group_key, + baseline_trusted=baseline_trusted, + new_keys=new_key_set, + ) + locations = tuple( + report_location_from_group_item(item, scan_root=scan_root) for item in items + ) + source_scope = _source_scope_from_locations( + [ + { + "source_kind": location.source_kind, + } + for location in locations + ] + ) + spread_files, spread_functions = group_spread(locations) + rows = sorted( + [ + _clone_item_payload( + item, + kind=kind, + scan_root=scan_root, + ) + for item in items + ], + key=_item_sort_key, + ) + facts, display_facts = _build_clone_group_facts( + group_key=group_key, + kind=kind, + items=items, + block_facts=block_facts, + ) + encoded_groups.append( + { + "id": clone_group_id(kind, group_key), + "family": FAMILY_CLONE, + "category": kind, + "kind": "clone_group", + "severity": severity, + "confidence": CONFIDENCE_HIGH, + "priority": priority, + "clone_kind": kind, + "clone_type": clone_type, + "novelty": novelty, + "count": len(items), + "source_scope": source_scope, + "spread": { + "files": spread_files, + "functions": spread_functions, + }, + "items": rows, + "facts": facts, + **({"display_facts": display_facts} if display_facts else {}), + } + ) + encoded_groups.sort( + key=lambda group: (-_as_int(group.get("count")), str(group["id"])) + ) + return encoded_groups + + +def _build_suppressed_clone_groups( + *, + groups: Sequence[SuppressedCloneGroup] | None, + block_facts: Mapping[str, Mapping[str, str]], + scan_root: str, +) -> dict[str, list[dict[str, object]]]: + buckets: dict[str, list[dict[str, object]]] = { + CLONE_KIND_FUNCTION: [], + CLONE_KIND_BLOCK: [], + CLONE_KIND_SEGMENT: [], + } + for group in groups or (): + items = group.items + clone_type = classify_clone_type(items=items, kind=group.kind) + severity, priority = 
_clone_group_assessment( + count=len(items), + clone_type=clone_type, + ) + locations = tuple( + report_location_from_group_item(item, scan_root=scan_root) for item in items + ) + source_scope = _source_scope_from_locations( + [ + { + "source_kind": location.source_kind, + } + for location in locations + ] + ) + spread_files, spread_functions = group_spread(locations) + rows = sorted( + [ + _clone_item_payload( + item, + kind=group.kind, + scan_root=scan_root, + ) + for item in items + ], + key=_item_sort_key, + ) + facts, display_facts = _build_clone_group_facts( + group_key=group.group_key, + kind=group.kind, + items=items, + block_facts=block_facts, + ) + encoded: dict[str, object] = { + "id": clone_group_id(group.kind, group.group_key), + "family": FAMILY_CLONE, + "category": group.kind, + "kind": "clone_group", + "severity": severity, + "confidence": CONFIDENCE_HIGH, + "priority": priority, + "clone_kind": group.kind, + "clone_type": clone_type, + "count": len(items), + "source_scope": source_scope, + "spread": { + "files": spread_files, + "functions": spread_functions, + }, + "items": rows, + "facts": facts, + "suppression_rule": group.suppression_rule, + "suppression_source": group.suppression_source, + "matched_patterns": list(group.matched_patterns), + } + if display_facts: + encoded["display_facts"] = display_facts + buckets[group.kind].append(encoded) + for bucket in buckets.values(): + bucket.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"]))) + return buckets + + +def _structural_group_assessment( + *, + finding_kind: str, + count: int, + spread_functions: int, +) -> tuple[str, float]: + match finding_kind: + case "clone_guard_exit_divergence" | "clone_cohort_drift": + severity = SEVERITY_WARNING + if count >= 3 or spread_functions > 1: + severity = SEVERITY_CRITICAL + return severity, _priority(severity, "moderate") + case _: + severity = ( + SEVERITY_WARNING + if count >= 4 or spread_functions > 1 + else SEVERITY_INFO + ) + return severity, _priority(severity, "moderate") + + +def _csv_values(value: object) -> list[str]: + raw = str(value).strip() + if not raw: + return [] + return sorted({part.strip() for part in raw.split(",") if part.strip()}) + + +def _build_structural_signature( + finding_kind: str, + signature: Mapping[str, str], +) -> dict[str, object]: + debug = {str(key): str(signature[key]) for key in sorted(signature)} + match finding_kind: + case "clone_guard_exit_divergence": + return { + "version": "1", + "stable": { + "family": "clone_guard_exit_divergence", + "cohort_id": str(signature.get("cohort_id", "")), + "majority_guard_count": _as_int( + signature.get("majority_guard_count") + ), + "majority_guard_terminal_profile": str( + signature.get("majority_guard_terminal_profile", "none") + ), + "majority_terminal_kind": str( + signature.get("majority_terminal_kind", "fallthrough") + ), + "majority_side_effect_before_guard": ( + str(signature.get("majority_side_effect_before_guard", "0")) + == "1" + ), + }, + "debug": debug, + } + case "clone_cohort_drift": + return { + "version": "1", + "stable": { + "family": "clone_cohort_drift", + "cohort_id": str(signature.get("cohort_id", "")), + "drift_fields": _csv_values(signature.get("drift_fields")), + "majority_profile": { + "terminal_kind": str( + signature.get("majority_terminal_kind", "") + ), + "guard_exit_profile": str( + signature.get("majority_guard_exit_profile", "") + ), + "try_finally_profile": str( + signature.get("majority_try_finally_profile", "") + ), + "side_effect_order_profile": 
str( + signature.get("majority_side_effect_order_profile", "") + ), + }, + }, + "debug": debug, + } + case _: + return { + "version": "1", + "stable": { + "family": "duplicated_branches", + "stmt_shape": str(signature.get("stmt_seq", "")), + "terminal_kind": str(signature.get("terminal", "")), + "control_flow": { + "has_loop": str(signature.get("has_loop", "0")) == "1", + "has_try": str(signature.get("has_try", "0")) == "1", + "nested_if": str(signature.get("nested_if", "0")) == "1", + }, + }, + "debug": debug, + } + + +def _build_structural_facts( + finding_kind: str, + signature: Mapping[str, str], + *, + count: int, +) -> dict[str, object]: + match finding_kind: + case "clone_guard_exit_divergence": + return { + "cohort_id": str(signature.get("cohort_id", "")), + "cohort_arity": _as_int(signature.get("cohort_arity")), + "divergent_members": _as_int(signature.get("divergent_members"), count), + "majority_entry_guard_count": _as_int( + signature.get("majority_guard_count"), + ), + "majority_guard_terminal_profile": str( + signature.get("majority_guard_terminal_profile", "none") + ), + "majority_terminal_kind": str( + signature.get("majority_terminal_kind", "fallthrough") + ), + "majority_side_effect_before_guard": ( + str(signature.get("majority_side_effect_before_guard", "0")) == "1" + ), + "guard_count_values": _csv_values(signature.get("guard_count_values")), + "guard_terminal_values": _csv_values( + signature.get("guard_terminal_values"), + ), + "terminal_values": _csv_values(signature.get("terminal_values")), + "side_effect_before_guard_values": _csv_values( + signature.get("side_effect_before_guard_values"), + ), + } + case "clone_cohort_drift": + return { + "cohort_id": str(signature.get("cohort_id", "")), + "cohort_arity": _as_int(signature.get("cohort_arity")), + "divergent_members": _as_int(signature.get("divergent_members"), count), + "drift_fields": _csv_values(signature.get("drift_fields")), + "stable_majority_profile": { + "terminal_kind": str(signature.get("majority_terminal_kind", "")), + "guard_exit_profile": str( + signature.get("majority_guard_exit_profile", "") + ), + "try_finally_profile": str( + signature.get("majority_try_finally_profile", "") + ), + "side_effect_order_profile": str( + signature.get("majority_side_effect_order_profile", "") + ), + }, + } + case _: + return { + "occurrence_count": count, + "non_overlapping": True, + "call_bucket": _as_int(signature.get("calls", "0")), + "raise_bucket": _as_int(signature.get("raises", "0")), + } + + +def _build_structural_groups( + groups: Sequence[StructuralFindingGroup] | None, + *, + scan_root: str, +) -> list[dict[str, object]]: + normalized_groups = normalize_structural_findings(groups or ()) + out: list[dict[str, object]] = [] + for group in normalized_groups: + locations = tuple( + report_location_from_structural_occurrence(item, scan_root=scan_root) + for item in group.items + ) + source_scope = _source_scope_from_locations( + [{"source_kind": location.source_kind} for location in locations] + ) + spread_files, spread_functions = group_spread(locations) + severity, priority = _structural_group_assessment( + finding_kind=group.finding_kind, + count=len(group.items), + spread_functions=spread_functions, + ) + out.append( + { + "id": structural_group_id(group.finding_kind, group.finding_key), + "family": FAMILY_STRUCTURAL, + "category": group.finding_kind, + "kind": group.finding_kind, + "severity": severity, + "confidence": ( + CONFIDENCE_HIGH + if group.finding_kind + in {"clone_guard_exit_divergence", 
"clone_cohort_drift"} + else CONFIDENCE_MEDIUM + ), + "priority": priority, + "count": len(group.items), + "source_scope": source_scope, + "spread": { + "files": spread_files, + "functions": spread_functions, + }, + "signature": _build_structural_signature( + group.finding_kind, + group.signature, + ), + "items": sorted( + [ + { + "relative_path": _contract_report_location_path( + item.file_path, + scan_root=scan_root, + ), + "qualname": item.qualname, + "start_line": item.start, + "end_line": item.end, + } + for item in group.items + ], + key=_item_sort_key, + ), + "facts": _build_structural_facts( + group.finding_kind, + group.signature, + count=len(group.items), + ), + } + ) + out.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"]))) + return out + + +def _single_location_source_scope( + filepath: str, + *, + scan_root: str, +) -> dict[str, object]: + location = report_location_from_group_item( + { + "filepath": filepath, + "qualname": "", + "start_line": 0, + "end_line": 0, + }, + scan_root=scan_root, + ) + return _source_scope_from_locations([{"source_kind": location.source_kind}]) + + +def _build_dead_code_groups( + metrics_payload: Mapping[str, object], + *, + scan_root: str, +) -> list[dict[str, object]]: + families = _as_mapping(metrics_payload.get("families")) + dead_code = _as_mapping(families.get(FAMILY_DEAD_CODE)) + groups: list[dict[str, object]] = [] + for item in _as_sequence(dead_code.get("items")): + item_map = _as_mapping(item) + qualname = str(item_map.get("qualname", "")) + filepath = str(item_map.get("relative_path", "")) + confidence = str(item_map.get("confidence", CONFIDENCE_MEDIUM)) + severity = SEVERITY_WARNING if confidence == CONFIDENCE_HIGH else SEVERITY_INFO + groups.append( + { + "id": dead_code_group_id(qualname), + "family": FAMILY_DEAD_CODE, + "category": str(item_map.get("kind", "unknown")), + "kind": "unused_symbol", + "severity": severity, + "confidence": confidence, + "priority": _priority(severity, EFFORT_EASY), + "count": 1, + "source_scope": _single_location_source_scope( + filepath, + scan_root=scan_root, + ), + "spread": {"files": 1, "functions": 1 if qualname else 0}, + "items": [ + { + "relative_path": _contract_report_location_path( + filepath, + scan_root=scan_root, + ), + "qualname": qualname, + "start_line": _as_int(item_map.get("start_line")), + "end_line": _as_int(item_map.get("end_line")), + } + ], + "facts": { + "kind": str(item_map.get("kind", "unknown")), + "confidence": confidence, + }, + } + ) + groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"]))) + return groups diff --git a/codeclone/report/document/builder.py b/codeclone/report/document/builder.py new file mode 100644 index 0000000..9d22dfa --- /dev/null +++ b/codeclone/report/document/builder.py @@ -0,0 +1,114 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Collection, Mapping, Sequence +from typing import TYPE_CHECKING + +from ...contracts import ( + REPORT_SCHEMA_VERSION, +) +from ...utils.coerce import as_mapping as _as_mapping + +if TYPE_CHECKING: + from ...models import ( + GroupMapLike, + StructuralFindingGroup, + Suggestion, + SuppressedCloneGroup, + ) + +from ._common import _collect_report_file_list +from .derived import _build_derived_overview, _build_derived_suggestions +from .findings import _build_findings_payload +from .integrity import _build_integrity_payload +from .inventory import ( + _baseline_is_trusted, + _build_inventory_payload, + _build_meta_payload, +) +from .metrics import _build_metrics_payload + + +def build_report_document( + *, + func_groups: GroupMapLike, + block_groups: GroupMapLike, + segment_groups: GroupMapLike, + meta: Mapping[str, object] | None = None, + inventory: Mapping[str, object] | None = None, + block_facts: Mapping[str, Mapping[str, str]] | None = None, + new_function_group_keys: Collection[str] | None = None, + new_block_group_keys: Collection[str] | None = None, + new_segment_group_keys: Collection[str] | None = None, + suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None = None, + metrics: Mapping[str, object] | None = None, + suggestions: Sequence[Suggestion] | None = None, + structural_findings: Sequence[StructuralFindingGroup] | None = None, +) -> dict[str, object]: + report_schema_version = REPORT_SCHEMA_VERSION + scan_root = str(_as_mapping(meta).get("scan_root", "")) + meta_payload = _build_meta_payload(meta, scan_root=scan_root) + design_thresholds = _as_mapping( + _as_mapping(meta_payload.get("analysis_thresholds")).get("design_findings") + ) + metrics_payload = _build_metrics_payload(metrics, scan_root=scan_root) + file_list = _collect_report_file_list( + inventory=inventory, + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + suppressed_clone_groups=suppressed_clone_groups, + metrics=metrics, + structural_findings=structural_findings, + ) + inventory_payload = _build_inventory_payload( + inventory=inventory, + file_list=file_list, + metrics_payload=metrics_payload, + scan_root=scan_root, + ) + findings_payload = _build_findings_payload( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + block_facts=block_facts or {}, + structural_findings=structural_findings, + metrics_payload=metrics_payload, + baseline_trusted=_baseline_is_trusted(meta_payload), + new_function_group_keys=new_function_group_keys, + new_block_group_keys=new_block_group_keys, + new_segment_group_keys=new_segment_group_keys, + suppressed_clone_groups=suppressed_clone_groups, + design_thresholds=design_thresholds, + scan_root=scan_root, + ) + overview_payload, hotlists_payload = _build_derived_overview( + findings=findings_payload, + metrics_payload=metrics_payload, + ) + derived_payload = { + "suggestions": _build_derived_suggestions(suggestions), + "overview": overview_payload, + "hotlists": hotlists_payload, + } + integrity_payload = _build_integrity_payload( + report_schema_version=report_schema_version, + meta=meta_payload, + inventory=inventory_payload, + findings=findings_payload, + metrics=metrics_payload, + ) + return { + "report_schema_version": report_schema_version, + "meta": meta_payload, + "inventory": inventory_payload, + "findings": findings_payload, + "metrics": 
metrics_payload, + "derived": derived_payload, + "integrity": integrity_payload, + } diff --git a/codeclone/report/document/derived.py b/codeclone/report/document/derived.py new file mode 100644 index 0000000..add2042 --- /dev/null +++ b/codeclone/report/document/derived.py @@ -0,0 +1,425 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections import Counter +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING + +from ...domain.findings import ( + CATEGORY_COHESION, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_DEAD_CODE, + CATEGORY_DEPENDENCY, + CLONE_KIND_BLOCK, + CLONE_KIND_FUNCTION, + CLONE_KIND_SEGMENT, + FAMILY_CLONE, + FAMILY_CLONES, + FAMILY_DEAD_CODE, + FAMILY_DESIGN, + FAMILY_STRUCTURAL, +) +from ...domain.quality import ( + SEVERITY_INFO, + SEVERITY_ORDER, +) +from ...domain.source_scope import ( + IMPACT_SCOPE_MIXED, + IMPACT_SCOPE_NON_RUNTIME, + IMPACT_SCOPE_RUNTIME, + SOURCE_KIND_FIXTURES, + SOURCE_KIND_MIXED, + SOURCE_KIND_OTHER, + SOURCE_KIND_PRODUCTION, + SOURCE_KIND_TESTS, +) +from ...findings.ids import ( + clone_group_id, + dead_code_group_id, + design_group_id, + structural_group_id, +) +from ...utils.coerce import as_float as _as_float +from ...utils.coerce import as_int as _as_int +from ...utils.coerce import as_mapping as _as_mapping +from ...utils.coerce import as_sequence as _as_sequence +from ..overview import build_directory_hotspots +from ._common import _contract_report_location_path, _is_absolute_path + +if TYPE_CHECKING: + from ...models import ( + Suggestion, + ) + + +def _sort_flat_finding_ids( + groups: Sequence[Mapping[str, object]], +) -> list[str]: + ordered = sorted( + groups, + key=lambda group: ( + -_as_float(group.get("priority")), + SEVERITY_ORDER.get(str(group.get("severity", SEVERITY_INFO)), 9), + -_as_int(_as_mapping(group.get("spread")).get("files")), + -_as_int(_as_mapping(group.get("spread")).get("functions")), + -_as_int(group.get("count")), + str(group.get("id", "")), + ), + ) + return [str(group["id"]) for group in ordered] + + +def _sort_highest_spread_ids( + groups: Sequence[Mapping[str, object]], +) -> list[str]: + ordered = sorted( + groups, + key=lambda group: ( + -_as_int(_as_mapping(group.get("spread")).get("files")), + -_as_int(_as_mapping(group.get("spread")).get("functions")), + -_as_int(group.get("count")), + -_as_float(group.get("priority")), + str(group.get("id", "")), + ), + ) + return [str(group["id"]) for group in ordered] + + +def _health_snapshot(metrics_payload: Mapping[str, object]) -> dict[str, object]: + health = _as_mapping(_as_mapping(metrics_payload.get("families")).get("health")) + summary = _as_mapping(health.get("summary")) + dimensions = { + str(key): _as_int(value) + for key, value in _as_mapping(summary.get("dimensions")).items() + } + strongest = None + weakest = None + if dimensions: + strongest = min( + sorted(dimensions), + key=lambda key: (-dimensions[key], key), + ) + weakest = min( + sorted(dimensions), + key=lambda key: (dimensions[key], key), + ) + return { + "score": _as_int(summary.get("score")), + "grade": str(summary.get("grade", "")), + "strongest_dimension": strongest, + "weakest_dimension": weakest, + } + + +def _combined_impact_scope(groups: Sequence[Mapping[str, object]]) -> 
str: + impact_scopes = { + str( + _as_mapping(group.get("source_scope")).get( + "impact_scope", + IMPACT_SCOPE_NON_RUNTIME, + ) + ) + for group in groups + } + if not impact_scopes: + return IMPACT_SCOPE_NON_RUNTIME + if len(impact_scopes) == 1: + return next(iter(impact_scopes)) + return IMPACT_SCOPE_MIXED + + +def _top_risks( + *, + dead_code_groups: Sequence[Mapping[str, object]], + design_groups: Sequence[Mapping[str, object]], + structural_groups: Sequence[Mapping[str, object]], + clone_groups: Sequence[Mapping[str, object]], +) -> list[dict[str, object]]: + risks: list[dict[str, object]] = [] + + if dead_code_groups: + label = ( + "1 dead code item" + if len(dead_code_groups) == 1 + else f"{len(dead_code_groups)} dead code items" + ) + risks.append( + { + "kind": "family_summary", + "family": FAMILY_DEAD_CODE, + "count": len(dead_code_groups), + "scope": IMPACT_SCOPE_MIXED + if len( + { + _as_mapping(group.get("source_scope")).get("impact_scope") + for group in dead_code_groups + } + ) + > 1 + else str( + _as_mapping(dead_code_groups[0].get("source_scope")).get( + "impact_scope", + IMPACT_SCOPE_NON_RUNTIME, + ) + ), + "label": label, + } + ) + + low_cohesion = [ + group + for group in design_groups + if str(group.get("category", "")) == CATEGORY_COHESION + ] + if low_cohesion: + label = ( + "1 low cohesion class" + if len(low_cohesion) == 1 + else f"{len(low_cohesion)} low cohesion classes" + ) + risks.append( + { + "kind": "family_summary", + "family": FAMILY_DESIGN, + "category": CATEGORY_COHESION, + "count": len(low_cohesion), + "scope": _combined_impact_scope(low_cohesion), + "label": label, + } + ) + + production_structural = [ + group + for group in structural_groups + if str(_as_mapping(group.get("source_scope")).get("impact_scope")) + in {IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_MIXED} + ] + if production_structural: + label = ( + "1 structural finding in production code" + if len(production_structural) == 1 + else ( + f"{len(production_structural)} structural findings in production code" + ) + ) + risks.append( + { + "kind": "family_summary", + "family": FAMILY_STRUCTURAL, + "count": len(production_structural), + "scope": SOURCE_KIND_PRODUCTION, + "label": label, + } + ) + + fixture_test_clones = [ + group + for group in clone_groups + if _as_mapping(group.get("source_scope")).get("impact_scope") + == IMPACT_SCOPE_NON_RUNTIME + and _as_mapping(group.get("source_scope")).get("dominant_kind") + in {SOURCE_KIND_TESTS, SOURCE_KIND_FIXTURES} + ] + if fixture_test_clones: + label = ( + "1 clone group in fixtures/tests" + if len(fixture_test_clones) == 1 + else f"{len(fixture_test_clones)} clone groups in fixtures/tests" + ) + risks.append( + { + "kind": "family_summary", + "family": FAMILY_CLONE, + "count": len(fixture_test_clones), + "scope": IMPACT_SCOPE_NON_RUNTIME, + "label": label, + } + ) + + return risks[:6] + + +def _build_derived_overview( + *, + findings: Mapping[str, object], + metrics_payload: Mapping[str, object], +) -> tuple[dict[str, object], dict[str, object]]: + groups = _as_mapping(findings.get("groups")) + clones = _as_mapping(groups.get(FAMILY_CLONES)) + clone_groups = [ + *_as_sequence(clones.get("functions")), + *_as_sequence(clones.get("blocks")), + *_as_sequence(clones.get("segments")), + ] + structural_groups = _as_sequence( + _as_mapping(groups.get(FAMILY_STRUCTURAL)).get("groups") + ) + dead_code_groups = _as_sequence( + _as_mapping(groups.get(FAMILY_DEAD_CODE)).get("groups") + ) + design_groups = _as_sequence(_as_mapping(groups.get("design")).get("groups")) + 
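# Flatten every family so hotlists and scope breakdowns rank across the whole report. +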
flat_groups = [ + *clone_groups, + *structural_groups, + *dead_code_groups, + *design_groups, + ] + dominant_kind_counts: Counter[str] = Counter( + str( + _as_mapping(_as_mapping(group).get("source_scope")).get( + "dominant_kind", + SOURCE_KIND_OTHER, + ) + ) + for group in flat_groups + ) + summary = _as_mapping(findings.get("summary")) + overview: dict[str, object] = { + "families": dict(_as_mapping(summary.get("families"))), + "top_risks": _top_risks( + dead_code_groups=[_as_mapping(group) for group in dead_code_groups], + design_groups=[_as_mapping(group) for group in design_groups], + structural_groups=[_as_mapping(group) for group in structural_groups], + clone_groups=[_as_mapping(group) for group in clone_groups], + ), + "source_scope_breakdown": { + key: dominant_kind_counts[key] + for key in ( + SOURCE_KIND_PRODUCTION, + SOURCE_KIND_TESTS, + SOURCE_KIND_FIXTURES, + SOURCE_KIND_MIXED, + SOURCE_KIND_OTHER, + ) + if dominant_kind_counts[key] > 0 + }, + "health_snapshot": _health_snapshot(metrics_payload), + "directory_hotspots": build_directory_hotspots(findings=findings), + } + hotlists: dict[str, object] = { + "most_actionable_ids": _sort_flat_finding_ids( + [ + group + for group in map(_as_mapping, flat_groups) + if str(group.get("severity")) != SEVERITY_INFO + ] + )[:5], + "highest_spread_ids": _sort_highest_spread_ids( + list(map(_as_mapping, flat_groups)) + )[:5], + "production_hotspot_ids": _sort_flat_finding_ids( + [ + group + for group in map(_as_mapping, flat_groups) + if str(_as_mapping(group.get("source_scope")).get("impact_scope")) + in {IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_MIXED} + ] + )[:5], + "test_fixture_hotspot_ids": _sort_flat_finding_ids( + [ + group + for group in map(_as_mapping, flat_groups) + if str(_as_mapping(group.get("source_scope")).get("impact_scope")) + == IMPACT_SCOPE_NON_RUNTIME + and str(_as_mapping(group.get("source_scope")).get("dominant_kind")) + in {SOURCE_KIND_TESTS, SOURCE_KIND_FIXTURES} + ] + )[:5], + } + return overview, hotlists + + +def _representative_location_rows( + suggestion: Suggestion, +) -> list[dict[str, object]]: + rows = [ + { + "relative_path": ( + location.relative_path + if ( + location.relative_path + and not _is_absolute_path(location.relative_path) + ) + else _contract_report_location_path( + location.filepath, + scan_root="", + ) + ), + "start_line": location.start_line, + "end_line": location.end_line, + "qualname": location.qualname, + "source_kind": location.source_kind, + } + for location in suggestion.representative_locations + ] + rows.sort( + key=lambda row: ( + str(row["relative_path"]), + _as_int(row["start_line"]), + _as_int(row["end_line"]), + str(row["qualname"]), + ) + ) + return rows[:3] + + +def _suggestion_finding_id(suggestion: Suggestion) -> str: + if suggestion.finding_family == FAMILY_CLONES: + if suggestion.fact_kind.startswith("Function"): + return clone_group_id(CLONE_KIND_FUNCTION, suggestion.subject_key) + if suggestion.fact_kind.startswith("Block"): + return clone_group_id(CLONE_KIND_BLOCK, suggestion.subject_key) + return clone_group_id(CLONE_KIND_SEGMENT, suggestion.subject_key) + if suggestion.finding_family == FAMILY_STRUCTURAL: + return structural_group_id( + suggestion.finding_kind or "duplicated_branches", + suggestion.subject_key, + ) + if suggestion.category == CATEGORY_DEAD_CODE: + return dead_code_group_id(suggestion.subject_key) + if suggestion.category in { + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_COHESION, + CATEGORY_DEPENDENCY, + }: + return 
design_group_id(suggestion.category, suggestion.subject_key) + return design_group_id( + suggestion.category, + suggestion.subject_key or suggestion.title, + ) + + +def _build_derived_suggestions( + suggestions: Sequence[Suggestion] | None, +) -> list[dict[str, object]]: + suggestion_rows = list(suggestions or ()) + suggestion_rows.sort( + key=lambda suggestion: ( + -suggestion.priority, + SEVERITY_ORDER.get(suggestion.severity, 9), + suggestion.title, + _suggestion_finding_id(suggestion), + ) + ) + return [ + { + "id": f"suggestion:{_suggestion_finding_id(suggestion)}", + "finding_id": _suggestion_finding_id(suggestion), + "title": suggestion.title, + "summary": suggestion.fact_summary, + "location_label": suggestion.location_label or suggestion.location, + "representative_locations": _representative_location_rows(suggestion), + "action": { + "effort": suggestion.effort, + "steps": list(suggestion.steps), + }, + } + for suggestion in suggestion_rows + ] diff --git a/codeclone/report/document/findings.py b/codeclone/report/document/findings.py new file mode 100644 index 0000000..43aaedd --- /dev/null +++ b/codeclone/report/document/findings.py @@ -0,0 +1,245 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Collection, Mapping, Sequence +from typing import TYPE_CHECKING + +from ...domain.findings import ( + CLONE_KIND_BLOCK, + CLONE_KIND_FUNCTION, + CLONE_KIND_SEGMENT, + CLONE_NOVELTY_KNOWN, + CLONE_NOVELTY_NEW, + FAMILY_CLONES, + FAMILY_DEAD_CODE, + FAMILY_STRUCTURAL, +) +from ...domain.quality import ( + SEVERITY_CRITICAL, + SEVERITY_INFO, + SEVERITY_WARNING, +) +from ...domain.source_scope import ( + IMPACT_SCOPE_MIXED, + IMPACT_SCOPE_NON_RUNTIME, + IMPACT_SCOPE_RUNTIME, +) +from ...utils.coerce import as_int as _as_int +from ...utils.coerce import as_mapping as _as_mapping +from ...utils.coerce import as_sequence as _as_sequence + +if TYPE_CHECKING: + from ...models import ( + GroupMapLike, + StructuralFindingGroup, + SuppressedCloneGroup, + ) + +from ._design_groups import _build_design_groups +from ._findings_groups import ( + _build_clone_groups, + _build_dead_code_groups, + _build_structural_groups, + _build_suppressed_clone_groups, +) + + +def _findings_summary( + *, + clone_functions: Sequence[Mapping[str, object]], + clone_blocks: Sequence[Mapping[str, object]], + clone_segments: Sequence[Mapping[str, object]], + structural_groups: Sequence[Mapping[str, object]], + dead_code_groups: Sequence[Mapping[str, object]], + design_groups: Sequence[Mapping[str, object]], + suppressed_clone_groups: Mapping[str, Sequence[Mapping[str, object]]] | None = None, + dead_code_suppressed: int = 0, +) -> dict[str, object]: + flat_groups = [ + *clone_functions, + *clone_blocks, + *clone_segments, + *structural_groups, + *dead_code_groups, + *design_groups, + ] + severity_counts = dict.fromkeys( + (SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO), + 0, + ) + source_scope_counts = dict.fromkeys( + (IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_NON_RUNTIME, IMPACT_SCOPE_MIXED), + 0, + ) + for group in flat_groups: + severity = str(group.get("severity", SEVERITY_INFO)) + if severity in severity_counts: + severity_counts[severity] += 1 + impact_scope = str( + _as_mapping(group.get("source_scope")).get( + 
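# Groups missing scope metadata fall back to non-runtime impact. +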
"impact_scope", + IMPACT_SCOPE_NON_RUNTIME, + ) + ) + if impact_scope in source_scope_counts: + source_scope_counts[impact_scope] += 1 + clone_groups = [*clone_functions, *clone_blocks, *clone_segments] + clone_suppressed_map = _as_mapping(suppressed_clone_groups) + suppressed_functions = len(_as_sequence(clone_suppressed_map.get("function"))) + suppressed_blocks = len(_as_sequence(clone_suppressed_map.get("block"))) + suppressed_segments = len(_as_sequence(clone_suppressed_map.get("segment"))) + suppressed_clone_total = ( + suppressed_functions + suppressed_blocks + suppressed_segments + ) + clones_summary: dict[str, object] = { + "functions": len(clone_functions), + "blocks": len(clone_blocks), + "segments": len(clone_segments), + CLONE_NOVELTY_NEW: sum( + 1 + for group in clone_groups + if str(group.get("novelty", "")) == CLONE_NOVELTY_NEW + ), + CLONE_NOVELTY_KNOWN: sum( + 1 + for group in clone_groups + if str(group.get("novelty", "")) == CLONE_NOVELTY_KNOWN + ), + } + if suppressed_clone_total > 0: + clones_summary.update( + { + "suppressed": suppressed_clone_total, + "suppressed_functions": suppressed_functions, + "suppressed_blocks": suppressed_blocks, + "suppressed_segments": suppressed_segments, + } + ) + suppressed_summary = { + FAMILY_DEAD_CODE: max(0, dead_code_suppressed), + } + if suppressed_clone_total > 0: + suppressed_summary[FAMILY_CLONES] = suppressed_clone_total + return { + "total": len(flat_groups), + "families": { + FAMILY_CLONES: len(clone_groups), + FAMILY_STRUCTURAL: len(structural_groups), + FAMILY_DEAD_CODE: len(dead_code_groups), + "design": len(design_groups), + }, + "severity": severity_counts, + "impact_scope": source_scope_counts, + "clones": clones_summary, + "suppressed": suppressed_summary, + } + + +def _build_findings_payload( + *, + func_groups: GroupMapLike, + block_groups: GroupMapLike, + segment_groups: GroupMapLike, + block_facts: Mapping[str, Mapping[str, str]], + structural_findings: Sequence[StructuralFindingGroup] | None, + metrics_payload: Mapping[str, object], + baseline_trusted: bool, + new_function_group_keys: Collection[str] | None, + new_block_group_keys: Collection[str] | None, + new_segment_group_keys: Collection[str] | None, + suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None, + design_thresholds: Mapping[str, object] | None, + scan_root: str, +) -> dict[str, object]: + clone_functions = _build_clone_groups( + groups=func_groups, + kind=CLONE_KIND_FUNCTION, + baseline_trusted=baseline_trusted, + new_keys=new_function_group_keys, + block_facts=block_facts, + scan_root=scan_root, + ) + clone_blocks = _build_clone_groups( + groups=block_groups, + kind=CLONE_KIND_BLOCK, + baseline_trusted=baseline_trusted, + new_keys=new_block_group_keys, + block_facts=block_facts, + scan_root=scan_root, + ) + clone_segments = _build_clone_groups( + groups=segment_groups, + kind=CLONE_KIND_SEGMENT, + baseline_trusted=baseline_trusted, + new_keys=new_segment_group_keys, + block_facts={}, + scan_root=scan_root, + ) + structural_groups = _build_structural_groups( + structural_findings, + scan_root=scan_root, + ) + dead_code_groups = _build_dead_code_groups( + metrics_payload, + scan_root=scan_root, + ) + dead_code_family = _as_mapping( + _as_mapping(metrics_payload.get("families")).get(FAMILY_DEAD_CODE) + ) + dead_code_summary = _as_mapping(dead_code_family.get("summary")) + dead_code_suppressed = _as_int( + dead_code_summary.get( + "suppressed", + len(_as_sequence(dead_code_family.get("suppressed_items"))), + ) + ) + design_groups = 
_build_design_groups( + metrics_payload, + design_thresholds=design_thresholds, + scan_root=scan_root, + ) + suppressed_clone_payload = _build_suppressed_clone_groups( + groups=suppressed_clone_groups, + block_facts=block_facts, + scan_root=scan_root, + ) + clone_groups_payload: dict[str, object] = { + "functions": clone_functions, + "blocks": clone_blocks, + "segments": clone_segments, + } + if any(suppressed_clone_payload.values()): + clone_groups_payload["suppressed"] = { + "functions": suppressed_clone_payload[CLONE_KIND_FUNCTION], + "blocks": suppressed_clone_payload[CLONE_KIND_BLOCK], + "segments": suppressed_clone_payload[CLONE_KIND_SEGMENT], + } + return { + "summary": _findings_summary( + clone_functions=clone_functions, + clone_blocks=clone_blocks, + clone_segments=clone_segments, + structural_groups=structural_groups, + dead_code_groups=dead_code_groups, + design_groups=design_groups, + suppressed_clone_groups=suppressed_clone_payload, + dead_code_suppressed=dead_code_suppressed, + ), + "groups": { + FAMILY_CLONES: clone_groups_payload, + FAMILY_STRUCTURAL: { + "groups": structural_groups, + }, + FAMILY_DEAD_CODE: { + "groups": dead_code_groups, + }, + "design": { + "groups": design_groups, + }, + }, + } diff --git a/codeclone/report/document/integrity.py b/codeclone/report/document/integrity.py new file mode 100644 index 0000000..5360ef8 --- /dev/null +++ b/codeclone/report/document/integrity.py @@ -0,0 +1,87 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from hashlib import sha256 + +import orjson + + +def _canonical_integrity_payload( + *, + report_schema_version: str, + meta: Mapping[str, object], + inventory: Mapping[str, object], + findings: Mapping[str, object], + metrics: Mapping[str, object], +) -> dict[str, object]: + canonical_meta = { + str(key): value for key, value in meta.items() if str(key) != "runtime" + } + + def _strip_noncanonical(value: object) -> object: + if isinstance(value, Mapping): + return { + str(key): _strip_noncanonical(item) + for key, item in value.items() + if str(key) != "display_facts" + } + if isinstance(value, Sequence) and not isinstance( + value, + (str, bytes, bytearray), + ): + return [_strip_noncanonical(item) for item in value] + return value + + return { + "report_schema_version": report_schema_version, + "meta": canonical_meta, + "inventory": inventory, + "findings": _strip_noncanonical(findings), + "metrics": metrics, + } + + +def _build_integrity_payload( + *, + report_schema_version: str, + meta: Mapping[str, object], + inventory: Mapping[str, object], + findings: Mapping[str, object], + metrics: Mapping[str, object], +) -> dict[str, object]: + canonical_payload = _canonical_integrity_payload( + report_schema_version=report_schema_version, + meta=meta, + inventory=inventory, + findings=findings, + metrics=metrics, + ) + canonical_json = orjson.dumps( + canonical_payload, + option=orjson.OPT_SORT_KEYS, + ) + payload_sha = sha256(canonical_json).hexdigest() + return { + "canonicalization": { + "version": "1", + "scope": "canonical_only", + "sections": [ + "report_schema_version", + "meta", + "inventory", + "findings", + "metrics", + ], + }, + "digest": { + "verified": True, + "algorithm": "sha256", + "value": 
diff --git a/codeclone/report/document/inventory.py b/codeclone/report/document/inventory.py
new file mode 100644
index 0000000..17ed577
--- /dev/null
+++ b/codeclone/report/document/inventory.py
@@ -0,0 +1,218 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+from ...domain.findings import (
+    CATEGORY_COHESION,
+    CATEGORY_COMPLEXITY,
+)
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ._common import (
+    _analysis_profile_payload,
+    _contract_path,
+    _count_file_lines,
+    _design_findings_thresholds_payload,
+    _optional_str,
+)
+
+
+def _derive_inventory_code_counts(
+    *,
+    metrics_payload: Mapping[str, object],
+    inventory_code: Mapping[str, object],
+    file_list: Sequence[str],
+    cached_files: int,
+) -> dict[str, object]:
+    complexity = _as_mapping(
+        _as_mapping(metrics_payload.get("families")).get(CATEGORY_COMPLEXITY)
+    )
+    cohesion = _as_mapping(
+        _as_mapping(metrics_payload.get("families")).get(CATEGORY_COHESION)
+    )
+    complexity_items = _as_sequence(complexity.get("items"))
+    cohesion_items = _as_sequence(cohesion.get("items"))
+
+    exact_entities = bool(complexity_items or cohesion_items)
+    method_count = sum(
+        _as_int(_as_mapping(item).get("method_count")) for item in cohesion_items
+    )
+    class_count = len(cohesion_items)
+    function_total = max(len(complexity_items) - method_count, 0)
+
+    if not exact_entities:
+        function_total = _as_int(inventory_code.get("functions"))
+        method_count = _as_int(inventory_code.get("methods"))
+        class_count = _as_int(inventory_code.get("classes"))
+
+    parsed_lines_raw = inventory_code.get("parsed_lines")
+    if isinstance(parsed_lines_raw, int) and parsed_lines_raw >= 0:
+        parsed_lines = parsed_lines_raw
+    elif cached_files > 0 and file_list:
+        parsed_lines = _count_file_lines(file_list)
+    else:
+        parsed_lines = _as_int(parsed_lines_raw)
+
+    if exact_entities and ((cached_files > 0 and file_list) or parsed_lines > 0):
+        scope = "analysis_root"
+    elif cached_files > 0 and file_list:
+        scope = "mixed"
+    else:
+        scope = "current_run"
+
+    return {
+        "scope": scope,
+        "parsed_lines": parsed_lines,
+        "functions": function_total,
+        "methods": method_count,
+        "classes": class_count,
+    }
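+
+
+# Worked example (hypothetical inputs): with exact complexity/cohesion items
+# and parsed_lines > 0 (or cached files plus a file list) the scope is
+# "analysis_root"; cached files without exact items degrade to "mixed"; a
+# cold run with neither yields "current_run".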
+
+
+def _build_inventory_payload(
+    *,
+    inventory: Mapping[str, object] | None,
+    file_list: Sequence[str],
+    metrics_payload: Mapping[str, object],
+    scan_root: str,
+) -> dict[str, object]:
+    inventory_map = _as_mapping(inventory)
+    files_map = _as_mapping(inventory_map.get("files"))
+    code_map = _as_mapping(inventory_map.get("code"))
+    cached_files = _as_int(files_map.get("cached"))
+    file_registry = [
+        path
+        for path in (
+            _contract_path(filepath, scan_root=scan_root)[0] for filepath in file_list
+        )
+        if path is not None
+    ]
+    return {
+        "files": {
+            "total_found": _as_int(files_map.get("total_found"), len(file_list)),
+            "analyzed": _as_int(files_map.get("analyzed")),
+            "cached": cached_files,
+            "skipped": _as_int(files_map.get("skipped")),
+            "source_io_skipped": _as_int(files_map.get("source_io_skipped")),
+        },
+        "code": _derive_inventory_code_counts(
+            metrics_payload=metrics_payload,
+            inventory_code=code_map,
+            file_list=file_list,
+            cached_files=cached_files,
+        ),
+        "file_registry": {
+            "encoding": "relative_path",
+            "items": file_registry,
+        },
+    }
+
+
+def _baseline_is_trusted(meta: Mapping[str, object]) -> bool:
+    baseline = _as_mapping(meta.get("baseline"))
+    return (
+        baseline.get("loaded") is True
+        and str(baseline.get("status", "")).strip().lower() == "ok"
+    )
+
+
+def _build_meta_payload(
+    raw_meta: Mapping[str, object] | None,
+    *,
+    scan_root: str,
+) -> dict[str, object]:
+    meta = dict(raw_meta or {})
+    metrics_computed = sorted(
+        {
+            str(item)
+            for item in _as_sequence(meta.get("metrics_computed"))
+            if str(item).strip()
+        }
+    )
+    baseline_path, baseline_path_scope, baseline_abs = _contract_path(
+        meta.get("baseline_path"),
+        scan_root=scan_root,
+    )
+    cache_path, cache_path_scope, cache_abs = _contract_path(
+        meta.get("cache_path"),
+        scan_root=scan_root,
+    )
+    metrics_baseline_path, metrics_baseline_path_scope, metrics_baseline_abs = (
+        _contract_path(
+            meta.get("metrics_baseline_path"),
+            scan_root=scan_root,
+        )
+    )
+    payload: dict[str, object] = {
+        "codeclone_version": str(meta.get("codeclone_version", "")),
+        "project_name": str(meta.get("project_name", "")),
+        "scan_root": ".",
+        "python_version": str(meta.get("python_version", "")),
+        "python_tag": str(meta.get("python_tag", "")),
+        "analysis_mode": str(meta.get("analysis_mode", "full") or "full"),
+        "report_mode": str(meta.get("report_mode", "full") or "full"),
+        "computed_metric_families": metrics_computed,
+        "analysis_thresholds": _design_findings_thresholds_payload(meta),
+        "baseline": {
+            "path": baseline_path,
+            "path_scope": baseline_path_scope,
+            "loaded": bool(meta.get("baseline_loaded")),
+            "status": _optional_str(meta.get("baseline_status")),
+            "fingerprint_version": _optional_str(
+                meta.get("baseline_fingerprint_version")
+            ),
+            "schema_version": _optional_str(meta.get("baseline_schema_version")),
+            "python_tag": _optional_str(meta.get("baseline_python_tag")),
+            "generator_name": _optional_str(meta.get("baseline_generator_name")),
+            "generator_version": _optional_str(meta.get("baseline_generator_version")),
+            "payload_sha256": _optional_str(meta.get("baseline_payload_sha256")),
+            "payload_sha256_verified": bool(
+                meta.get("baseline_payload_sha256_verified")
+            ),
+        },
+        "cache": {
+            "path": cache_path,
+            "path_scope": cache_path_scope,
+            "used": bool(meta.get("cache_used")),
+            "status": _optional_str(meta.get("cache_status")),
+            "schema_version": _optional_str(meta.get("cache_schema_version")),
+        },
+        "metrics_baseline": {
+            "path": metrics_baseline_path,
+            "path_scope": metrics_baseline_path_scope,
+            "loaded": bool(meta.get("metrics_baseline_loaded")),
+            "status": _optional_str(meta.get("metrics_baseline_status")),
+            "schema_version": _optional_str(
+                meta.get("metrics_baseline_schema_version")
+            ),
+            "payload_sha256": _optional_str(
+                meta.get("metrics_baseline_payload_sha256")
+            ),
+            "payload_sha256_verified": bool(
+                meta.get("metrics_baseline_payload_sha256_verified")
+            ),
+        },
+        "runtime": {
+            "analysis_started_at_utc": _optional_str(
+                meta.get("analysis_started_at_utc")
+            ),
+            "report_generated_at_utc": _optional_str(
+                meta.get("report_generated_at_utc")
+            ),
+            "scan_root_absolute": _optional_str(meta.get("scan_root")),
+            "baseline_path_absolute": baseline_abs,
+            "cache_path_absolute": cache_abs,
+            "metrics_baseline_path_absolute": metrics_baseline_abs,
+        },
+    }
+    analysis_profile = _analysis_profile_payload(meta)
+    if analysis_profile is not None:
+        payload["analysis_profile"] = analysis_profile
+    return payload
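+
+
+# Note (illustrative): _baseline_is_trusted({"baseline": {"loaded": True,
+# "status": "ok"}}) is True; any other status ("stale", "", absent) or a
+# non-True "loaded" flag is False, so only a cleanly loaded baseline counts
+# as trusted.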
diff --git a/codeclone/report/document/metrics.py b/codeclone/report/document/metrics.py
new file mode 100644
index 0000000..dbb165c
--- /dev/null
+++ b/codeclone/report/document/metrics.py
@@ -0,0 +1,781 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+from ...analysis.suppressions import INLINE_CODECLONE_SUPPRESSION_SOURCE
+from ...domain.findings import (
+    CATEGORY_COHESION,
+    CATEGORY_COMPLEXITY,
+    CATEGORY_COUPLING,
+    FAMILY_DEAD_CODE,
+)
+from ...domain.quality import (
+    CONFIDENCE_HIGH,
+    CONFIDENCE_MEDIUM,
+    RISK_LOW,
+)
+from ...domain.source_scope import (
+    SOURCE_KIND_FIXTURES,
+    SOURCE_KIND_OTHER,
+    SOURCE_KIND_PRODUCTION,
+    SOURCE_KIND_TESTS,
+)
+from ...metrics.registry import METRIC_FAMILIES
+from ...utils.coerce import as_float as _as_float
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+from ..derived import normalized_source_kind as _normalized_source_kind
+from ._common import (
+    _contract_path,
+    _normalize_nested_string_rows,
+    _optional_str,
+)
+
+_OVERLOADED_MODULES_FAMILY = "overloaded_modules"
+
+_COVERAGE_ADOPTION_FAMILY = "coverage_adoption"
+
+_API_SURFACE_FAMILY = "api_surface"
+
+_COVERAGE_JOIN_FAMILY = "coverage_join"
+
+_SECURITY_SURFACES_FAMILY = "security_surfaces"
+
+
+def _normalize_metrics_families(
+    metrics: Mapping[str, object] | None,
+    *,
+    scan_root: str,
+) -> dict[str, object]:
+    metrics_map = _as_mapping(metrics)
+    complexity = _as_mapping(metrics_map.get(CATEGORY_COMPLEXITY))
+    complexity_items = sorted(
+        (
+            {
+                "qualname": str(item_map.get("qualname", "")),
+                "relative_path": _contract_path(
+                    item_map.get("filepath", ""),
+                    scan_root=scan_root,
+                )[0]
+                or "",
+                "start_line": _as_int(item_map.get("start_line")),
+                "end_line": _as_int(item_map.get("end_line")),
+                "cyclomatic_complexity": _as_int(
+                    item_map.get("cyclomatic_complexity"),
+                    1,
+                ),
+                "nesting_depth": _as_int(item_map.get("nesting_depth")),
+                "risk": str(item_map.get("risk", RISK_LOW)),
+            }
+            for item in _as_sequence(complexity.get("functions"))
+            for item_map in (_as_mapping(item),)
+        ),
+        key=lambda item: (
+            item["relative_path"],
+            item["start_line"],
+            item["end_line"],
+            item["qualname"],
+        ),
+    )
+
+    coupling = _as_mapping(metrics_map.get(CATEGORY_COUPLING))
+    coupling_items = sorted(
+        (
+            {
+                "qualname": str(item_map.get("qualname", "")),
+                "relative_path": _contract_path(
+                    item_map.get("filepath", ""),
+                    scan_root=scan_root,
+                )[0]
+                or "",
+                "start_line": _as_int(item_map.get("start_line")),
+                "end_line": _as_int(item_map.get("end_line")),
+                "cbo": _as_int(item_map.get("cbo")),
+                "risk": str(item_map.get("risk", RISK_LOW)),
+                "coupled_classes": sorted(
+                    {
+                        str(name)
+                        for name in _as_sequence(item_map.get("coupled_classes"))
+                        if str(name).strip()
+                    }
+                ),
+            }
+            for item in _as_sequence(coupling.get("classes"))
+            for item_map in (_as_mapping(item),)
+        ),
+        key=lambda item: (
+            item["relative_path"],
+            item["start_line"],
+            item["end_line"],
+            item["qualname"],
+        ),
+    )
+
+    cohesion = _as_mapping(metrics_map.get(CATEGORY_COHESION))
+    cohesion_items = sorted(
+        (
+            {
+                "qualname": str(item_map.get("qualname", "")),
+                "relative_path": _contract_path(
+                    item_map.get("filepath", ""),
+                    scan_root=scan_root,
+                )[0]
+                or "",
+                "start_line": _as_int(item_map.get("start_line")),
+                "end_line": _as_int(item_map.get("end_line")),
+                "lcom4": _as_int(item_map.get("lcom4")),
+                "risk": str(item_map.get("risk", RISK_LOW)),
+                "method_count": _as_int(item_map.get("method_count")),
+                "instance_var_count": _as_int(item_map.get("instance_var_count")),
+            }
+            for item in _as_sequence(cohesion.get("classes"))
+            for item_map in (_as_mapping(item),)
+        ),
+        key=lambda item: (
+            item["relative_path"],
+            item["start_line"],
+            item["end_line"],
+            item["qualname"],
+        ),
+    )
+
+    dependencies = _as_mapping(metrics_map.get("dependencies"))
+    dependency_edges = sorted(
+        (
+            {
+                "source": str(item_map.get("source", "")),
+                "target": str(item_map.get("target", "")),
+                "import_type": str(item_map.get("import_type", "")),
+                "line": _as_int(item_map.get("line")),
+            }
+            for item in _as_sequence(dependencies.get("edge_list"))
+            for item_map in (_as_mapping(item),)
+        ),
+        key=lambda item: (
+            item["source"],
+            item["target"],
+            item["import_type"],
+            item["line"],
+        ),
+    )
+    dependency_cycles = _normalize_nested_string_rows(dependencies.get("cycles"))
+    longest_chains = _normalize_nested_string_rows(dependencies.get("longest_chains"))
+
+    dead_code = _as_mapping(metrics_map.get(FAMILY_DEAD_CODE))
+
+    def _normalize_suppressed_by(
+        raw_bindings: object,
+    ) -> list[dict[str, str]]:
+        normalized_bindings = sorted(
+            {
+                (
+                    str(binding_map.get("rule", "")).strip(),
+                    str(binding_map.get("source", "")).strip(),
+                )
+                for binding in _as_sequence(raw_bindings)
+                for binding_map in (_as_mapping(binding),)
+                if str(binding_map.get("rule", "")).strip()
+            },
+            key=lambda item: (item[0], item[1]),
+        )
+        if not normalized_bindings:
+            return []
+        return [
+            {
+                "rule": rule,
+                "source": source or INLINE_CODECLONE_SUPPRESSION_SOURCE,
+            }
+            for rule, source in normalized_bindings
+        ]
+
+    dead_items = sorted(
+        (
+            {
+                "qualname": str(item_map.get("qualname", "")),
+                "relative_path": _contract_path(
+                    item_map.get("filepath", ""),
+                    scan_root=scan_root,
+                )[0]
+                or "",
+                "start_line": _as_int(item_map.get("start_line")),
+                "end_line": _as_int(item_map.get("end_line")),
+                "kind": str(item_map.get("kind", "")),
+                "confidence": str(item_map.get("confidence", CONFIDENCE_MEDIUM)),
+            }
+            for item in _as_sequence(dead_code.get("items"))
+            for item_map in (_as_mapping(item),)
+        ),
+        key=lambda item: (
+            item["relative_path"],
+            item["start_line"],
+            item["end_line"],
+            item["qualname"],
+            item["kind"],
+        ),
+    )
+    dead_suppressed_items = sorted(
+        (
+            {
+                "qualname": str(item_map.get("qualname", "")),
+                "relative_path": _contract_path(
+                    item_map.get("filepath", ""),
+                    scan_root=scan_root,
+                )[0]
+                or "",
+                "start_line": _as_int(item_map.get("start_line")),
+                "end_line": _as_int(item_map.get("end_line")),
+                "kind": str(item_map.get("kind", "")),
+                "confidence": str(item_map.get("confidence", CONFIDENCE_MEDIUM)),
+                "suppressed_by": _normalize_suppressed_by(
+                    item_map.get("suppressed_by")
+                ),
+            }
+            for item in _as_sequence(dead_code.get("suppressed_items"))
+            for item_map in (_as_mapping(item),)
+        ),
+        key=lambda item: (
+            item["relative_path"],
+            item["start_line"],
+            item["end_line"],
+            item["qualname"],
+            item["kind"],
+            item["confidence"],
+            tuple(
+                (
+                    str(_as_mapping(binding).get("rule", "")),
+                    str(_as_mapping(binding).get("source", "")),
+                )
+                for binding in _as_sequence(item.get("suppressed_by"))
+            ),
+        ),
+    )
+    for item in dead_suppressed_items:
+        suppressed_by = _as_sequence(item.get("suppressed_by"))
+        first_binding = _as_mapping(suppressed_by[0]) if suppressed_by else {}
+        item["suppression_rule"] = str(first_binding.get("rule", ""))
+        item["suppression_source"] = str(first_binding.get("source", ""))
+
+    health = _as_mapping(metrics_map.get("health"))
+    health_dimensions = {
+        str(key): _as_int(value)
+        for key, value in sorted(_as_mapping(health.get("dimensions")).items())
+    }
+    overloaded_modules = _as_mapping(metrics_map.get(_OVERLOADED_MODULES_FAMILY))
+    overloaded_modules_detection = _as_mapping(overloaded_modules.get("detection"))
+    overloaded_module_items = sorted(
+        (
+            {
+                "module": str(item_map.get("module", "")).strip(),
+                "relative_path": _contract_path(
+                    item_map.get("filepath", ""),
+                    scan_root=scan_root,
+                )[0]
+                or "",
+                "source_kind": _normalized_source_kind(item_map.get("source_kind")),
+                "loc": _as_int(item_map.get("loc")),
+                "functions": _as_int(item_map.get("functions")),
+                "methods": _as_int(item_map.get("methods")),
+                "classes": _as_int(item_map.get("classes")),
+                "callable_count": _as_int(item_map.get("callable_count")),
+                "complexity_total": _as_int(item_map.get("complexity_total")),
+                "complexity_max": _as_int(item_map.get("complexity_max")),
+                "fan_in": _as_int(item_map.get("fan_in")),
+                "fan_out": _as_int(item_map.get("fan_out")),
+                "total_deps": _as_int(item_map.get("total_deps")),
+                "import_edges": _as_int(item_map.get("import_edges")),
+                "reimport_edges": _as_int(item_map.get("reimport_edges")),
+                "reimport_ratio": round(
+                    _as_float(item_map.get("reimport_ratio")),
+                    4,
+                ),
+                "instability": round(_as_float(item_map.get("instability")), 4),
+                "hub_balance": round(_as_float(item_map.get("hub_balance")), 4),
+                "size_score": round(_as_float(item_map.get("size_score")), 4),
+                "dependency_score": round(
+                    _as_float(item_map.get("dependency_score")),
+                    4,
+                ),
+                "shape_score": round(_as_float(item_map.get("shape_score")), 4),
+                "score": round(_as_float(item_map.get("score")), 4),
+                "candidate_status": str(
+                    item_map.get("candidate_status", "non_candidate")
+                ),
+                "candidate_reasons": [
+                    str(reason)
+                    for reason in _as_sequence(item_map.get("candidate_reasons"))
+                    if str(reason).strip()
+                ],
+            }
+            for item in _as_sequence(overloaded_modules.get("items"))
+            for item_map in (_as_mapping(item),)
+        ),
+        key=lambda item: (
+            {"candidate": 0, "ranked_only": 1, "non_candidate": 2}.get(
+                str(item["candidate_status"]),
+                3,
+            ),
+            -_as_float(item["score"]),
+            -_as_float(item["size_score"]),
+            -_as_float(item["dependency_score"]),
+            item["relative_path"],
+            item["module"],
+        ),
+    )
+
+    complexity_summary = _as_mapping(complexity.get("summary"))
+    coupling_summary = _as_mapping(coupling.get("summary"))
+    cohesion_summary = _as_mapping(cohesion.get("summary"))
+    dead_code_summary = _as_mapping(dead_code.get("summary"))
+    overloaded_modules_summary = _as_mapping(overloaded_modules.get("summary"))
+    coverage_adoption = _as_mapping(metrics_map.get(_COVERAGE_ADOPTION_FAMILY))
+    coverage_adoption_summary = _as_mapping(coverage_adoption.get("summary"))
+    coverage_adoption_items = sorted(
+        (
+            {
+                "module": str(item_map.get("module", "")).strip(),
+                "relative_path": _contract_path(
+                    item_map.get("filepath", ""),
+                    scan_root=scan_root,
+                )[0]
+                or "",
+                "callable_count": _as_int(item_map.get("callable_count")),
+                "params_total": _as_int(item_map.get("params_total")),
+                "params_annotated": _as_int(item_map.get("params_annotated")),
+                "param_permille": _as_int(item_map.get("param_permille")),
_as_int(item_map.get("returns_total")), + "returns_annotated": _as_int(item_map.get("returns_annotated")), + "return_permille": _as_int(item_map.get("return_permille")), + "any_annotation_count": _as_int(item_map.get("any_annotation_count")), + "public_symbol_total": _as_int(item_map.get("public_symbol_total")), + "public_symbol_documented": _as_int( + item_map.get("public_symbol_documented") + ), + "docstring_permille": _as_int(item_map.get("docstring_permille")), + } + for item in _as_sequence(coverage_adoption.get("items")) + for item_map in (_as_mapping(item),) + ), + key=lambda item: ( + item["relative_path"], + item["module"], + ), + ) + api_surface = _as_mapping(metrics_map.get(_API_SURFACE_FAMILY)) + api_surface_summary = _as_mapping(api_surface.get("summary")) + api_surface_items = sorted( + ( + { + "record_kind": str(item_map.get("record_kind", "symbol")), + "module": str(item_map.get("module", "")).strip(), + "relative_path": _contract_path( + item_map.get("filepath", ""), + scan_root=scan_root, + )[0] + or "", + "qualname": str(item_map.get("qualname", "")), + "start_line": _as_int(item_map.get("start_line")), + "end_line": _as_int(item_map.get("end_line")), + "symbol_kind": str(item_map.get("symbol_kind", "")), + "exported_via": _optional_str(item_map.get("exported_via")), + "params_total": _as_int(item_map.get("params_total")), + "params": [ + { + "name": str(param_map.get("name", "")), + "kind": str(param_map.get("kind", "")), + "has_default": bool(param_map.get("has_default")), + "annotated": bool(param_map.get("annotated")), + } + for param in _as_sequence(item_map.get("params")) + for param_map in (_as_mapping(param),) + ], + "returns_annotated": bool(item_map.get("returns_annotated")), + "change_kind": _optional_str(item_map.get("change_kind")), + "detail": _optional_str(item_map.get("detail")), + } + for item in _as_sequence(api_surface.get("items")) + for item_map in (_as_mapping(item),) + ), + key=lambda item: ( + item["relative_path"], + item["start_line"], + item["end_line"], + item["qualname"], + item["record_kind"], + ), + ) + coverage_join = _as_mapping(metrics_map.get(_COVERAGE_JOIN_FAMILY)) + coverage_join_summary = _as_mapping(coverage_join.get("summary")) + coverage_join_items = sorted( + ( + { + "relative_path": _contract_path( + item_map.get("filepath", ""), + scan_root=scan_root, + )[0] + or "", + "qualname": str(item_map.get("qualname", "")).strip(), + "start_line": _as_int(item_map.get("start_line")), + "end_line": _as_int(item_map.get("end_line")), + "cyclomatic_complexity": _as_int( + item_map.get("cyclomatic_complexity"), + 1, + ), + "risk": str(item_map.get("risk", RISK_LOW)).strip() or RISK_LOW, + "executable_lines": _as_int(item_map.get("executable_lines")), + "covered_lines": _as_int(item_map.get("covered_lines")), + "coverage_permille": _as_int(item_map.get("coverage_permille")), + "coverage_status": str(item_map.get("coverage_status", "")).strip(), + "coverage_hotspot": bool(item_map.get("coverage_hotspot")), + "scope_gap_hotspot": bool(item_map.get("scope_gap_hotspot")), + } + for item in _as_sequence(coverage_join.get("items")) + for item_map in (_as_mapping(item),) + ), + key=lambda item: ( + 0 if bool(item["coverage_hotspot"]) else 1, + 0 if bool(item["scope_gap_hotspot"]) else 1, + {"high": 0, "medium": 1, "low": 2}.get(str(item["risk"]), 3), + _as_int(item["coverage_permille"]), + -_as_int(item["cyclomatic_complexity"]), + item["relative_path"], + _as_int(item["start_line"]), + item["qualname"], + ), + ) + security_surfaces = 
_as_mapping(metrics_map.get(_SECURITY_SURFACES_FAMILY)) + security_surfaces_summary = _as_mapping(security_surfaces.get("summary")) + raw_category_counts = _as_mapping(security_surfaces_summary.get("categories")) + raw_source_kind_counts = _as_mapping( + security_surfaces_summary.get("by_source_kind") + ) + security_surface_items = sorted( + ( + { + "category": str(item_map.get("category", "")).strip(), + "capability": str(item_map.get("capability", "")).strip(), + "module": str(item_map.get("module", "")).strip(), + "qualname": str(item_map.get("qualname", "")).strip(), + "relative_path": _contract_path( + item_map.get("filepath", ""), + scan_root=scan_root, + )[0] + or "", + "source_kind": str(item_map.get("source_kind", SOURCE_KIND_OTHER)), + "start_line": _as_int(item_map.get("start_line")), + "end_line": _as_int(item_map.get("end_line")), + "location_scope": str(item_map.get("location_scope", "")).strip(), + "classification_mode": str( + item_map.get("classification_mode", "") + ).strip(), + "evidence_kind": str(item_map.get("evidence_kind", "")).strip(), + "evidence_symbol": str(item_map.get("evidence_symbol", "")).strip(), + } + for item in _as_sequence(security_surfaces.get("items")) + for item_map in (_as_mapping(item),) + ), + key=lambda item: ( + item["relative_path"], + item["start_line"], + item["end_line"], + item["qualname"], + item["category"], + item["capability"], + item["evidence_symbol"], + ), + ) + dead_high_confidence = sum( + 1 + for item in dead_items + if str(_as_mapping(item).get("confidence", "")).strip().lower() + == CONFIDENCE_HIGH + ) + + family_sections: dict[str, object] = { + CATEGORY_COMPLEXITY: { + "summary": { + "total": len(complexity_items), + "average": round(_as_float(complexity_summary.get("average")), 2), + "max": _as_int(complexity_summary.get("max")), + "high_risk": _as_int(complexity_summary.get("high_risk")), + }, + "items": complexity_items, + "items_truncated": False, + }, + CATEGORY_COUPLING: { + "summary": { + "total": len(coupling_items), + "average": round(_as_float(coupling_summary.get("average")), 2), + "max": _as_int(coupling_summary.get("max")), + "high_risk": _as_int(coupling_summary.get("high_risk")), + }, + "items": coupling_items, + "items_truncated": False, + }, + CATEGORY_COHESION: { + "summary": { + "total": len(cohesion_items), + "average": round(_as_float(cohesion_summary.get("average")), 2), + "max": _as_int(cohesion_summary.get("max")), + "low_cohesion": _as_int(cohesion_summary.get("low_cohesion")), + }, + "items": cohesion_items, + "items_truncated": False, + }, + "dependencies": { + "summary": { + "modules": _as_int(dependencies.get("modules")), + "edges": _as_int(dependencies.get("edges")), + "cycles": len(dependency_cycles), + "max_depth": _as_int(dependencies.get("max_depth")), + "avg_depth": round(_as_float(dependencies.get("avg_depth")), 2), + "p95_depth": _as_int(dependencies.get("p95_depth")), + }, + "items": dependency_edges, + "cycles": dependency_cycles, + "longest_chains": longest_chains, + "items_truncated": False, + }, + FAMILY_DEAD_CODE: { + "summary": { + "total": len(dead_items), + "high_confidence": dead_high_confidence + or _as_int( + dead_code_summary.get( + "high_confidence", dead_code_summary.get("critical") + ) + ), + "suppressed": len(dead_suppressed_items) + or _as_int(dead_code_summary.get("suppressed")), + }, + "items": dead_items, + "suppressed_items": dead_suppressed_items, + "items_truncated": False, + }, + "health": { + "summary": { + "score": _as_int(health.get("score")), + "grade": 
str(health.get("grade", "")), + "dimensions": health_dimensions, + }, + "items": [], + "items_truncated": False, + }, + _COVERAGE_ADOPTION_FAMILY: { + "summary": { + "modules": len(coverage_adoption_items), + "params_total": _as_int(coverage_adoption_summary.get("params_total")), + "params_annotated": _as_int( + coverage_adoption_summary.get("params_annotated") + ), + "param_permille": _as_int( + coverage_adoption_summary.get("param_permille") + ), + "baseline_diff_available": bool( + coverage_adoption_summary.get("baseline_diff_available") + ), + "param_delta": _as_int(coverage_adoption_summary.get("param_delta")), + "returns_total": _as_int( + coverage_adoption_summary.get("returns_total") + ), + "returns_annotated": _as_int( + coverage_adoption_summary.get("returns_annotated") + ), + "return_permille": _as_int( + coverage_adoption_summary.get("return_permille") + ), + "return_delta": _as_int(coverage_adoption_summary.get("return_delta")), + "public_symbol_total": _as_int( + coverage_adoption_summary.get("public_symbol_total") + ), + "public_symbol_documented": _as_int( + coverage_adoption_summary.get("public_symbol_documented") + ), + "docstring_permille": _as_int( + coverage_adoption_summary.get("docstring_permille") + ), + "docstring_delta": _as_int( + coverage_adoption_summary.get("docstring_delta") + ), + "typing_any_count": _as_int( + coverage_adoption_summary.get("typing_any_count") + ), + }, + "items": coverage_adoption_items, + "items_truncated": False, + }, + _API_SURFACE_FAMILY: { + "summary": { + "enabled": bool(api_surface_summary.get("enabled")), + "baseline_diff_available": bool( + api_surface_summary.get("baseline_diff_available") + ), + "modules": _as_int(api_surface_summary.get("modules")), + "public_symbols": _as_int(api_surface_summary.get("public_symbols")), + "added": _as_int(api_surface_summary.get("added")), + "breaking": _as_int(api_surface_summary.get("breaking")), + "strict_types": bool(api_surface_summary.get("strict_types")), + }, + "items": api_surface_items, + "items_truncated": False, + }, + _OVERLOADED_MODULES_FAMILY: { + "summary": { + "total": len(overloaded_module_items), + "candidates": _as_int(overloaded_modules_summary.get("candidates")), + "population_status": str( + overloaded_modules_summary.get("population_status", "limited") + ), + "top_score": round( + _as_float(overloaded_modules_summary.get("top_score")), + 4, + ), + "average_score": round( + _as_float(overloaded_modules_summary.get("average_score")), + 4, + ), + "candidate_score_cutoff": round( + _as_float(overloaded_modules_summary.get("candidate_score_cutoff")), + 4, + ), + }, + "detection": { + "version": str(overloaded_modules_detection.get("version", "1")), + "scope": str(overloaded_modules_detection.get("scope", "report_only")), + "strategy": str( + overloaded_modules_detection.get( + "strategy", + "project_relative_composite", + ) + ), + "minimum_population": _as_int( + overloaded_modules_detection.get("minimum_population"), + ), + "size_signals": [ + str(signal) + for signal in _as_sequence( + overloaded_modules_detection.get("size_signals") + ) + if str(signal).strip() + ], + "dependency_signals": [ + str(signal) + for signal in _as_sequence( + overloaded_modules_detection.get("dependency_signals") + ) + if str(signal).strip() + ], + "shape_signals": [ + str(signal) + for signal in _as_sequence( + overloaded_modules_detection.get("shape_signals") + ) + if str(signal).strip() + ], + }, + "items": overloaded_module_items, + "items_truncated": False, + }, + _SECURITY_SURFACES_FAMILY: 
{ + "summary": { + "items": _as_int(security_surfaces_summary.get("items")), + "modules": _as_int(security_surfaces_summary.get("modules")), + "exact_items": _as_int(security_surfaces_summary.get("exact_items")), + "category_count": _as_int( + security_surfaces_summary.get("category_count") + ), + "categories": { + str(key): _as_int(value) + for key, value in sorted(raw_category_counts.items()) + if str(key).strip() + }, + "by_source_kind": { + SOURCE_KIND_PRODUCTION: _as_int( + raw_source_kind_counts.get(SOURCE_KIND_PRODUCTION) + ), + SOURCE_KIND_TESTS: _as_int( + raw_source_kind_counts.get(SOURCE_KIND_TESTS) + ), + SOURCE_KIND_FIXTURES: _as_int( + raw_source_kind_counts.get(SOURCE_KIND_FIXTURES) + ), + SOURCE_KIND_OTHER: _as_int( + raw_source_kind_counts.get(SOURCE_KIND_OTHER) + ), + }, + "production": _as_int(security_surfaces_summary.get("production")), + "tests": _as_int(security_surfaces_summary.get("tests")), + "fixtures": _as_int(security_surfaces_summary.get("fixtures")), + "other": _as_int(security_surfaces_summary.get("other")), + "report_only": bool(security_surfaces_summary.get("report_only")), + }, + "items": security_surface_items, + "items_truncated": False, + }, + } + if coverage_join_summary or coverage_join_items or coverage_join: + family_sections[_COVERAGE_JOIN_FAMILY] = { + "summary": { + "status": str(coverage_join_summary.get("status", "")), + "source": _contract_path( + coverage_join_summary.get("source", ""), + scan_root=scan_root, + )[0], + "files": _as_int(coverage_join_summary.get("files")), + "units": _as_int(coverage_join_summary.get("units")), + "measured_units": _as_int(coverage_join_summary.get("measured_units")), + "overall_executable_lines": _as_int( + coverage_join_summary.get("overall_executable_lines") + ), + "overall_covered_lines": _as_int( + coverage_join_summary.get("overall_covered_lines") + ), + "overall_permille": _as_int( + coverage_join_summary.get("overall_permille") + ), + "missing_from_report_units": _as_int( + coverage_join_summary.get("missing_from_report_units") + ), + "coverage_hotspots": _as_int( + coverage_join_summary.get("coverage_hotspots") + ), + "scope_gap_hotspots": _as_int( + coverage_join_summary.get("scope_gap_hotspots") + ), + "hotspot_threshold_percent": _as_int( + coverage_join_summary.get("hotspot_threshold_percent") + ), + "invalid_reason": _optional_str( + coverage_join_summary.get("invalid_reason") + ), + }, + "items": coverage_join_items, + "items_truncated": False, + } + normalized: dict[str, object] = {} + for family in METRIC_FAMILIES.values(): + section = family.report_section + if section in family_sections: + normalized[section] = family_sections[section] + return normalized + + +def _build_metrics_payload( + metrics: Mapping[str, object] | None, + *, + scan_root: str, +) -> dict[str, object]: + families = _normalize_metrics_families(metrics, scan_root=scan_root) + return { + "summary": { + family_name: _as_mapping(_as_mapping(family_payload).get("summary")) + for family_name, family_payload in families.items() + }, + "families": families, + } diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py index 73605b0..2a85a1f 100644 --- a/codeclone/report/explain.py +++ b/codeclone/report/explain.py @@ -12,7 +12,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from .._coerce import as_int +from ..utils.coerce import as_int from .explain_contract import ( BLOCK_HINT_ASSERT_ONLY, BLOCK_HINT_ASSERT_ONLY_LABEL, diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py index 
diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py
index 73605b0..2a85a1f 100644
--- a/codeclone/report/explain.py
+++ b/codeclone/report/explain.py
@@ -12,7 +12,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from .._coerce import as_int
+from ..utils.coerce import as_int
 from .explain_contract import (
     BLOCK_HINT_ASSERT_ONLY,
     BLOCK_HINT_ASSERT_ONLY_LABEL,
diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py
index 350b836..7843967 100644
--- a/codeclone/report/findings.py
+++ b/codeclone/report/findings.py
@@ -6,7 +6,7 @@
 
 """Deterministic structural-finding helpers for the report layer.
 
-HTML rendering lives in ``codeclone._html_report._sections._structural``.
+HTML rendering lives in ``codeclone.report.html.sections._structural``.
 """
 
 from __future__ import annotations
diff --git a/codeclone/report/gates/__init__.py b/codeclone/report/gates/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/gates/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/report/gates/evaluator.py b/codeclone/report/gates/evaluator.py
new file mode 100644
index 0000000..62310f2
--- /dev/null
+++ b/codeclone/report/gates/evaluator.py
@@ -0,0 +1,681 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from collections.abc import Callable, Mapping
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+from ...contracts import DEFAULT_COVERAGE_MIN, ExitCode
+from ...metrics.registry import METRIC_FAMILIES
+from ...utils.coerce import as_int as _as_int
+from ...utils.coerce import as_mapping as _as_mapping
+from ...utils.coerce import as_sequence as _as_sequence
+
+if TYPE_CHECKING:
+    from ...models import CoverageJoinResult, ProjectMetrics
+
+
+@dataclass(frozen=True, slots=True)
+class MetricGateConfig:
+    fail_complexity: int
+    fail_coupling: int
+    fail_cohesion: int
+    fail_cycles: bool
+    fail_dead_code: bool
+    fail_health: int
+    fail_on_new_metrics: bool
+    fail_on_typing_regression: bool = False
+    fail_on_docstring_regression: bool = False
+    fail_on_api_break: bool = False
+    fail_on_untested_hotspots: bool = False
+    min_typing_coverage: int = -1
+    min_docstring_coverage: int = -1
+    coverage_min: int = DEFAULT_COVERAGE_MIN
+    fail_on_new: bool = False
+    fail_threshold: int = -1
+
+
+@dataclass(frozen=True, slots=True)
+class GateResult:
+    exit_code: int
+    reasons: tuple[str, ...]
+
+
+@dataclass(frozen=True, slots=True)
+class GateState:
+    clone_new_count: int = 0
+    clone_total: int = 0
+    complexity_max: int = 0
+    coupling_max: int = 0
+    cohesion_max: int = 0
+    dependency_cycles: int = 0
+    dead_high_confidence: int = 0
+    health_score: int = 0
+    typing_param_permille: int = 0
+    docstring_permille: int = 0
+    coverage_join_status: str = ""
+    coverage_hotspots: int = 0
+    api_breaking_changes: int = 0
+    diff_new_high_risk_functions: int = 0
+    diff_new_high_coupling_classes: int = 0
+    diff_new_cycles: int = 0
+    diff_new_dead_code: int = 0
+    diff_health_delta: int = 0
+    diff_typing_param_permille_delta: int = 0
+    diff_typing_return_permille_delta: int = 0
+    diff_docstring_permille_delta: int = 0
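+
+
+# Illustrative evaluation (hypothetical values; assumes the complexity family
+# registers the "complexity_threshold" gate key):
+#
+#     state = GateState(complexity_max=12, clone_total=3)
+#     config = MetricGateConfig(
+#         fail_complexity=10, fail_coupling=-1, fail_cohesion=-1,
+#         fail_cycles=False, fail_dead_code=False, fail_health=-1,
+#         fail_on_new_metrics=False,
+#     )
+#     evaluate_gate_state(state=state, config=config).reasons
+#     # ("metric:Complexity threshold exceeded: max CC=12, threshold=10.",)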
+
+
+def summarize_metrics_diff(metrics_diff: object | None) -> dict[str, object] | None:
+    if metrics_diff is None:
+        return None
+
+    if isinstance(metrics_diff, Mapping):
+        payload = metrics_diff
+        return {
+            "new_high_risk_functions": _as_int(
+                payload.get("new_high_risk_functions"),
+                0,
+            ),
+            "new_high_coupling_classes": _as_int(
+                payload.get("new_high_coupling_classes"),
+                0,
+            ),
+            "new_cycles": _as_int(payload.get("new_cycles"), 0),
+            "new_dead_code": _as_int(payload.get("new_dead_code"), 0),
+            "health_delta": _as_int(payload.get("health_delta"), 0),
+            "typing_param_permille_delta": _as_int(
+                payload.get("typing_param_permille_delta"),
+                0,
+            ),
+            "typing_return_permille_delta": _as_int(
+                payload.get("typing_return_permille_delta"),
+                0,
+            ),
+            "docstring_permille_delta": _as_int(
+                payload.get("docstring_permille_delta"),
+                0,
+            ),
+            "new_api_symbols": _as_int(payload.get("new_api_symbols"), 0),
+            "api_breaking_changes": _as_int(
+                payload.get("api_breaking_changes"),
+                _as_int(payload.get("new_api_breaking_changes"), 0),
+            ),
+        }
+
+    new_high_risk_functions = tuple(
+        str(item)
+        for item in _as_sequence(getattr(metrics_diff, "new_high_risk_functions", ()))
+        if str(item).strip()
+    )
+    new_high_coupling_classes = tuple(
+        str(item)
+        for item in _as_sequence(getattr(metrics_diff, "new_high_coupling_classes", ()))
+        if str(item).strip()
+    )
+    new_cycles = tuple(
+        tuple(str(part) for part in _as_sequence(item) if str(part).strip())
+        for item in _as_sequence(getattr(metrics_diff, "new_cycles", ()))
+    )
+    new_dead_code = tuple(
+        str(item)
+        for item in _as_sequence(getattr(metrics_diff, "new_dead_code", ()))
+        if str(item).strip()
+    )
+    api_breaking_changes = tuple(
+        _as_sequence(getattr(metrics_diff, "new_api_breaking_changes", ()))
+    )
+    new_api_symbols = tuple(_as_sequence(getattr(metrics_diff, "new_api_symbols", ())))
+    return {
+        "new_high_risk_functions": len(new_high_risk_functions),
+        "new_high_coupling_classes": len(new_high_coupling_classes),
+        "new_cycles": len(new_cycles),
+        "new_dead_code": len(new_dead_code),
+        "health_delta": _as_int(getattr(metrics_diff, "health_delta", 0), 0),
+        "typing_param_permille_delta": _as_int(
+            getattr(metrics_diff, "typing_param_permille_delta", 0),
+            0,
+        ),
+        "typing_return_permille_delta": _as_int(
+            getattr(metrics_diff, "typing_return_permille_delta", 0),
+            0,
+        ),
+        "docstring_permille_delta": _as_int(
+            getattr(metrics_diff, "docstring_permille_delta", 0),
+            0,
+        ),
+        "new_api_symbols": len(new_api_symbols),
+        "api_breaking_changes": len(api_breaking_changes),
+    }
+
+
+def gate_state_from_project_metrics(
+    *,
+    project_metrics: ProjectMetrics,
+    coverage_join: CoverageJoinResult | None,
+    metrics_diff: object | None,
+    clone_new_count: int = 0,
+    clone_total: int = 0,
+) -> GateState:
+    diff_summary = summarize_metrics_diff(metrics_diff) or {}
+    return GateState(
+        clone_new_count=max(clone_new_count, 0),
+        clone_total=max(clone_total, 0),
+        complexity_max=max(int(project_metrics.complexity_max), 0),
+        coupling_max=max(int(project_metrics.coupling_max), 0),
+        cohesion_max=max(int(project_metrics.cohesion_max), 0),
+        dependency_cycles=len(tuple(project_metrics.dependency_cycles)),
+        dead_high_confidence=sum(
+            1
+            for item in project_metrics.dead_code
+            if str(getattr(item, "confidence", "")).strip().lower() == "high"
+        ),
+        health_score=max(int(project_metrics.health.total), 0),
+        typing_param_permille=_permille(
+            int(project_metrics.typing_param_annotated),
+            int(project_metrics.typing_param_total),
+        ),
+        docstring_permille=_permille(
+            int(project_metrics.docstring_public_documented),
+            int(project_metrics.docstring_public_total),
+        ),
+        coverage_join_status=(
+            str(coverage_join.status) if coverage_join is not None else ""
+        ),
+        coverage_hotspots=(
+            int(coverage_join.coverage_hotspots) if coverage_join is not None else 0
+        ),
+        api_breaking_changes=_as_int(diff_summary.get("api_breaking_changes"), 0),
+        diff_new_high_risk_functions=_as_int(
+            diff_summary.get("new_high_risk_functions"),
+            0,
+        ),
+        diff_new_high_coupling_classes=_as_int(
+            diff_summary.get("new_high_coupling_classes"),
+            0,
+        ),
+        diff_new_cycles=_as_int(diff_summary.get("new_cycles"), 0),
+        diff_new_dead_code=_as_int(diff_summary.get("new_dead_code"), 0),
+        diff_health_delta=_as_int(diff_summary.get("health_delta"), 0),
+        diff_typing_param_permille_delta=_as_int(
+            diff_summary.get("typing_param_permille_delta"),
+            0,
+        ),
+        diff_typing_return_permille_delta=_as_int(
+            diff_summary.get("typing_return_permille_delta"),
+            0,
+        ),
+        diff_docstring_permille_delta=_as_int(
+            diff_summary.get("docstring_permille_delta"),
+            0,
+        ),
+    )
+
+
+def metric_gate_reasons_for_state(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    gate_keys = sorted(
+        {
+            gate_key
+            for family in METRIC_FAMILIES.values()
+            for gate_key in family.gate_keys
+        },
+        key=lambda gate_key: (_GATE_REASON_ORDER.get(gate_key, 999), gate_key),
+    )
+    reasons: list[str] = []
+    for gate_key in gate_keys:
+        builder = _GATE_REASON_BUILDERS.get(gate_key)
+        if builder is None:
+            continue
+        reasons.extend(builder(state=state, config=config))
+    return tuple(reasons)
+
+
+_GATE_REASON_ORDER = {
+    "complexity_threshold": 10,
+    "coupling_threshold": 20,
+    "cohesion_threshold": 30,
+    "health_threshold": 40,
+    "dependency_cycles": 50,
+    "dead_code_high_confidence": 60,
+    "new_high_risk_functions": 70,
+    "new_high_coupling_classes": 80,
+    "new_dependency_cycles": 90,
+    "new_dead_code": 100,
+    "health_regression": 110,
+    "typing_coverage_threshold": 120,
+    "docstring_coverage_threshold": 130,
+    "typing_regression": 140,
+    "docstring_regression": 150,
+    "api_breaking_changes": 160,
+    "coverage_hotspots": 170,
+}
+
+
+def _reason_if(triggered: bool, message: str) -> tuple[str, ...]:
+    return (message,) if triggered else ()
+
+
+def _complexity_threshold_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        0 <= config.fail_complexity < state.complexity_max,
+        "Complexity threshold exceeded: "
+        f"max CC={state.complexity_max}, "
+        f"threshold={config.fail_complexity}.",
+    )
+
+
+def _coupling_threshold_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        0 <= config.fail_coupling < state.coupling_max,
+        "Coupling threshold exceeded: "
+        f"max CBO={state.coupling_max}, "
+        f"threshold={config.fail_coupling}.",
+    )
+
+
+def _cohesion_threshold_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        0 <= config.fail_cohesion < state.cohesion_max,
+        "Cohesion threshold exceeded: "
+        f"max LCOM4={state.cohesion_max}, "
+        f"threshold={config.fail_cohesion}.",
+    )
+
+
+def _health_threshold_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_health >= 0 and state.health_score < config.fail_health,
+        "Health score below threshold: "
+        f"score={state.health_score}, threshold={config.fail_health}.",
+    )
+
+
+def _dependency_cycles_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_cycles and state.dependency_cycles > 0,
+        f"Dependency cycles detected: {state.dependency_cycles} cycle(s).",
+    )
+
+
+def _dead_code_high_confidence_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_dead_code and state.dead_high_confidence > 0,
+        f"Dead code detected (high confidence): {state.dead_high_confidence} item(s).",
+    )
+
+
+def _new_high_risk_functions_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_new_metrics and state.diff_new_high_risk_functions > 0,
+        "New high-risk functions vs metrics baseline: "
+        f"{state.diff_new_high_risk_functions}.",
+    )
+
+
+def _new_high_coupling_classes_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_new_metrics and state.diff_new_high_coupling_classes > 0,
+        "New high-coupling classes vs metrics baseline: "
+        f"{state.diff_new_high_coupling_classes}.",
+    )
+
+
+def _new_dependency_cycles_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_new_metrics and state.diff_new_cycles > 0,
+        f"New dependency cycles vs metrics baseline: {state.diff_new_cycles}.",
+    )
+
+
+def _new_dead_code_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_new_metrics and state.diff_new_dead_code > 0,
+        f"New dead code items vs metrics baseline: {state.diff_new_dead_code}.",
+    )
+
+
+def _health_regression_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_new_metrics and state.diff_health_delta < 0,
+        f"Health score regressed vs metrics baseline: delta={state.diff_health_delta}.",
+    )
+
+
+def _typing_coverage_threshold_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    typing_percent = state.typing_param_permille / 10.0
+    return _reason_if(
+        config.min_typing_coverage >= 0
+        and typing_percent < float(config.min_typing_coverage),
+        "Typing coverage below threshold: "
+        f"coverage={typing_percent:.1f}%, threshold={config.min_typing_coverage}%.",
+    )
+
+
+def _docstring_coverage_threshold_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    docstring_percent = state.docstring_permille / 10.0
+    return _reason_if(
+        config.min_docstring_coverage >= 0
+        and docstring_percent < float(config.min_docstring_coverage),
+        "Docstring coverage below threshold: "
+        f"coverage={docstring_percent:.1f}%, "
+        f"threshold={config.min_docstring_coverage}%.",
+    )
+
+
+def _typing_regression_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_typing_regression
+        and (
+            state.diff_typing_param_permille_delta < 0
+            or state.diff_typing_return_permille_delta < 0
+        ),
+        "Typing coverage regressed vs metrics baseline: "
+        f"params_delta={state.diff_typing_param_permille_delta}, "
+        f"returns_delta={state.diff_typing_return_permille_delta}.",
+    )
+
+
+def _docstring_regression_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_docstring_regression and state.diff_docstring_permille_delta < 0,
+        "Docstring coverage regressed vs metrics baseline: "
+        f"delta={state.diff_docstring_permille_delta}.",
+    )
+
+
+def _api_breaking_changes_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_api_break and state.api_breaking_changes > 0,
+        "Public API breaking changes vs metrics baseline: "
+        f"{state.api_breaking_changes}.",
+    )
+
+
+def _coverage_hotspots_reason(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> tuple[str, ...]:
+    return _reason_if(
+        config.fail_on_untested_hotspots
+        and state.coverage_join_status == "ok"
+        and state.coverage_hotspots > 0,
+        "Coverage hotspots detected: "
+        f"hotspots={state.coverage_hotspots}, "
+        f"threshold={config.coverage_min}%.",
+    )
+
+
+_GATE_REASON_BUILDERS: dict[str, Callable[..., tuple[str, ...]]] = {
+    "complexity_threshold": _complexity_threshold_reason,
+    "coupling_threshold": _coupling_threshold_reason,
+    "cohesion_threshold": _cohesion_threshold_reason,
+    "health_threshold": _health_threshold_reason,
+    "dependency_cycles": _dependency_cycles_reason,
+    "dead_code_high_confidence": _dead_code_high_confidence_reason,
+    "new_high_risk_functions": _new_high_risk_functions_reason,
+    "new_high_coupling_classes": _new_high_coupling_classes_reason,
+    "new_dependency_cycles": _new_dependency_cycles_reason,
+    "new_dead_code": _new_dead_code_reason,
+    "health_regression": _health_regression_reason,
+    "typing_coverage_threshold": _typing_coverage_threshold_reason,
+    "docstring_coverage_threshold": _docstring_coverage_threshold_reason,
+    "typing_regression": _typing_regression_reason,
+    "docstring_regression": _docstring_regression_reason,
+    "api_breaking_changes": _api_breaking_changes_reason,
+    "coverage_hotspots": _coverage_hotspots_reason,
+}
+
+
+def evaluate_gate_state(
+    *,
+    state: GateState,
+    config: MetricGateConfig,
+) -> GateResult:
+    reasons = [
+        f"metric:{reason}"
+        for reason in metric_gate_reasons_for_state(state=state, config=config)
+    ]
+
+    if config.fail_on_new and state.clone_new_count > 0:
+        reasons.append("clone:new")
+
+    if 0 <= config.fail_threshold < state.clone_total:
+        reasons.append(f"clone:threshold:{state.clone_total}:{config.fail_threshold}")
+
+    if reasons:
+        return GateResult(
+            exit_code=int(ExitCode.GATING_FAILURE),
+            reasons=tuple(reasons),
+        )
+    return GateResult(exit_code=int(ExitCode.SUCCESS), reasons=())
+
+
+# codeclone: ignore[dead-code]
+def metric_gate_reasons(
+    *,
+    report_document: Mapping[str, object],
+    config: MetricGateConfig,
+    metrics_diff: object | None = None,
+) -> tuple[str, ...]:
+    state = _gate_state_from_report_document(
+        report_document=report_document,
+        metrics_diff=metrics_diff,
+    )
+    return metric_gate_reasons_for_state(state=state, config=config)
+
+
+def evaluate_gates(
+    *,
+    report_document: Mapping[str, object],
+    config: MetricGateConfig,
+    baseline_status: str | None = None,
+    metrics_diff: object | None = None,
+    clone_new_count: int | None = None,
+    clone_total: int | None = None,
+) -> GateResult:
+    _ = baseline_status
+    state = _gate_state_from_report_document(
+        report_document=report_document,
+        metrics_diff=metrics_diff,
+        clone_new_count=clone_new_count,
+        clone_total=clone_total,
+    )
+    return evaluate_gate_state(state=state, config=config)
+
+
+def _gate_state_from_report_document(
+    *,
+    report_document: Mapping[str, object],
+    metrics_diff: object | None,
+    clone_new_count: int | None = None,
+    clone_total: int | None = None,
+) -> GateState:
+    findings = _as_mapping(report_document.get("findings"))
+    groups = _as_mapping(findings.get("groups"))
+    clone_groups = _as_mapping(groups.get("clones"))
+    function_groups = _as_sequence(clone_groups.get("functions"))
+    block_groups = _as_sequence(clone_groups.get("blocks"))
+    derived_clone_new_count = sum(
+        1
+        for group in (*function_groups, *block_groups)
+        if str(_as_mapping(group).get("novelty", "")).strip() == "new"
+    )
+    metrics = _as_mapping(report_document.get("metrics"))
+    families = _as_mapping(metrics.get("families"))
+    complexity_summary = _as_mapping(
+        _as_mapping(families.get("complexity")).get("summary")
+    )
+    coupling_summary = _as_mapping(_as_mapping(families.get("coupling")).get("summary"))
+    cohesion_summary = _as_mapping(_as_mapping(families.get("cohesion")).get("summary"))
+    dependencies_summary = _as_mapping(
+        _as_mapping(families.get("dependencies")).get("summary")
+    )
+    dead_code_summary = _as_mapping(
+        _as_mapping(families.get("dead_code")).get("summary")
+    )
+    health_summary = _as_mapping(_as_mapping(families.get("health")).get("summary"))
+    coverage_adoption_summary = _as_mapping(
+        _as_mapping(families.get("coverage_adoption")).get("summary")
+    )
+    api_surface_summary = _as_mapping(
+        _as_mapping(families.get("api_surface")).get("summary")
+    )
+    coverage_join_summary = _as_mapping(
+        _as_mapping(families.get("coverage_join")).get("summary")
+    )
+    diff_summary = summarize_metrics_diff(metrics_diff) or {}
+    prefer_diff_summary = metrics_diff is not None
+    return GateState(
+        clone_new_count=max(
+            clone_new_count if clone_new_count is not None else derived_clone_new_count,
+            0,
+        ),
+        clone_total=max(
+            clone_total
+            if clone_total is not None
+            else len(function_groups) + len(block_groups),
+            0,
+        ),
+        complexity_max=_as_int(complexity_summary.get("max"), 0),
+        coupling_max=_as_int(coupling_summary.get("max"), 0),
+        cohesion_max=_as_int(cohesion_summary.get("max"), 0),
+        dependency_cycles=_as_int(dependencies_summary.get("cycles"), 0),
+        dead_high_confidence=_as_int(dead_code_summary.get("high_confidence"), 0),
+        health_score=_as_int(health_summary.get("score"), 0),
+        typing_param_permille=_as_int(
+            coverage_adoption_summary.get("param_permille"), 0
+        ),
+        docstring_permille=_as_int(
+            coverage_adoption_summary.get("docstring_permille"),
+            0,
+        ),
+        coverage_join_status=str(coverage_join_summary.get("status", "")),
+        coverage_hotspots=_as_int(
+            coverage_join_summary.get("coverage_hotspots"),
+            0,
+        ),
+        api_breaking_changes=(
+            _as_int(diff_summary.get("api_breaking_changes"), 0)
+            if prefer_diff_summary
+            else _as_int(api_surface_summary.get("breaking"), 0)
+        ),
+        diff_new_high_risk_functions=_as_int(
+            diff_summary.get("new_high_risk_functions"),
+            0,
+        ),
+        diff_new_high_coupling_classes=_as_int(
+            diff_summary.get("new_high_coupling_classes"),
+            0,
+        ),
+        diff_new_cycles=_as_int(diff_summary.get("new_cycles"), 0),
+        diff_new_dead_code=_as_int(diff_summary.get("new_dead_code"), 0),
+        diff_health_delta=_as_int(diff_summary.get("health_delta"), 0),
+        diff_typing_param_permille_delta=(
+            _as_int(diff_summary.get("typing_param_permille_delta"), 0)
+            if prefer_diff_summary
+            else _as_int(coverage_adoption_summary.get("param_delta"), 0)
+        ),
+        diff_typing_return_permille_delta=(
+            _as_int(diff_summary.get("typing_return_permille_delta"), 0)
+            if prefer_diff_summary
+            else _as_int(coverage_adoption_summary.get("return_delta"), 0)
+        ),
+        diff_docstring_permille_delta=(
+            _as_int(diff_summary.get("docstring_permille_delta"), 0)
+            if prefer_diff_summary
+            else _as_int(coverage_adoption_summary.get("docstring_delta"), 0)
+        ),
+    )
+
+
+def _permille(numerator: int, denominator: int) -> int:
+    if denominator <= 0:
+        return 0
+    return round(numerator * 1000 / denominator)
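+
+
+# Quick check (illustrative): _permille(1, 3) == 333, _permille(997, 1000)
+# == 997, and _permille(5, 0) == 0 because non-positive denominators are
+# guarded.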
+
+
+__all__ = [
+    "GateResult",
+    "GateState",
+    "MetricGateConfig",
+    "evaluate_gate_state",
+    "evaluate_gates",
+    "gate_state_from_project_metrics",
+    "metric_gate_reasons",
+    "metric_gate_reasons_for_state",
+    "summarize_metrics_diff",
+]
diff --git a/codeclone/_cli_gating.py b/codeclone/report/gates/reasons.py
similarity index 100%
rename from codeclone/_cli_gating.py
rename to codeclone/report/gates/reasons.py
diff --git a/codeclone/_html_report/__init__.py b/codeclone/report/html/__init__.py
similarity index 76%
rename from codeclone/_html_report/__init__.py
rename to codeclone/report/html/__init__.py
index 69b89c1..cdde57d 100644
--- a/codeclone/_html_report/__init__.py
+++ b/codeclone/report/html/__init__.py
@@ -4,10 +4,10 @@
 # SPDX-License-Identifier: MPL-2.0
 # Copyright (c) 2026 Den Rozhnovskiy
 
-"""New HTML report package — component-based architecture."""
+"""Canonical HTML report package."""
 
 from __future__ import annotations
 
-from ._assemble import build_html_report
+from .assemble import build_html_report
 
 __all__ = ["build_html_report"]
diff --git a/codeclone/_html_report/_context.py b/codeclone/report/html/_context.py
similarity index 93%
rename from codeclone/_html_report/_context.py
rename to codeclone/report/html/_context.py
index efac981..02865cb 100644
--- a/codeclone/_html_report/_context.py
+++ b/codeclone/report/html/_context.py
@@ -12,19 +12,19 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-from .._coerce import as_mapping as _as_mapping
-from ..contracts import REPORT_SCHEMA_VERSION
-from ..report.overview import build_report_overview, materialize_report_overview
+from ...contracts import REPORT_SCHEMA_VERSION
+from ...utils.coerce import as_mapping as _as_mapping
+from ..overview import build_report_overview, materialize_report_overview
 
 if TYPE_CHECKING:
-    from .._html_snippets import _FileCache
-    from ..models import (
+    from ...models import (
         GroupItemLike,
         GroupMapLike,
         MetricsDiff,
         StructuralFindingGroup,
         Suggestion,
     )
+    from .widgets.snippets import _FileCache
 
 
 @dataclass(frozen=True, slots=True)
@@ -63,6 +63,7 @@ class ReportContext:
     dependencies_map: Mapping[str, object]
     dead_code_map: Mapping[str, object]
    overloaded_modules_map: Mapping[str, object]
+    security_surfaces_map: Mapping[str, object]
     health_map: Mapping[str, object]
 
     # -- suggestions + structural --
@@ -166,7 +167,7 @@ def build_context(
     max_snippet_lines: int = 220,
 ) -> ReportContext:
     """Build a ReportContext from raw build_html_report parameters."""
-    from .._html_escape import _escape_html
+    from .primitives.escape import _escape_html
 
     meta = dict(report_meta or {})
     baseline_meta = _as_mapping(meta.get("baseline"))
@@ -177,6 +178,8 @@ def build_context(
     inventory_map = _as_mapping(report_document_map.get("inventory"))
     derived_map = _as_mapping(report_document_map.get("derived"))
     integrity_map = _as_mapping(report_document_map.get("integrity"))
+    report_metrics_map = _as_mapping(report_document_map.get("metrics"))
+    report_metric_families = _as_mapping(report_metrics_map.get("families"))
 
     report_schema_version = str(
         meta.get("report_schema_version") or REPORT_SCHEMA_VERSION
@@ -237,6 +240,9 @@ def build_context(
     overloaded_modules_map = _as_mapping(metrics_map.get("overloaded_modules"))
     if not overloaded_modules_map:
         overloaded_modules_map = _as_mapping(metrics_map.get("god_modules"))
+    security_surfaces_map = _as_mapping(report_metric_families.get("security_surfaces"))
+    if not security_surfaces_map:
+        security_surfaces_map = _as_mapping(metrics_map.get("security_surfaces"))
     health_map = _as_mapping(metrics_map.get("health"))
 
     suggestions_tuple = tuple(suggestions or ())
@@ -282,6 +288,7 @@ def build_context(
         dependencies_map=dependencies_map,
         dead_code_map=dead_code_map,
         overloaded_modules_map=overloaded_modules_map,
+        security_surfaces_map=security_surfaces_map,
         health_map=health_map,
         suggestions=suggestions_tuple,
         structural_findings=tuple(structural_findings or ()),
diff --git a/codeclone/_html_report/_assemble.py b/codeclone/report/html/assemble.py
similarity index 93%
rename from codeclone/_html_report/_assemble.py
rename to codeclone/report/html/assemble.py
index 13f4964..ed40620 100644
--- a/codeclone/_html_report/_assemble.py
+++ b/codeclone/report/html/assemble.py
@@ -11,28 +11,29 @@
 from collections.abc import Collection, Mapping, Sequence
 from typing import TYPE_CHECKING
 
-from .. import __version__, _coerce
-from .._html_css import build_css
-from .._html_escape import _escape_html
-from .._html_js import build_js
-from .._html_snippets import _FileCache, _pygments_css
-from ..contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL
-from ..domain.quality import CONFIDENCE_HIGH
-from ..structural_findings import normalize_structural_findings
-from ..templates import FONT_CSS_URL, REPORT_TEMPLATE
+from ... import __version__
+from ...contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL
+from ...domain.quality import CONFIDENCE_HIGH
+from ...findings.structural.detectors import normalize_structural_findings
+from ...utils import coerce as _coerce
 from ._context import _meta_pick, build_context
-from ._icons import BRAND_LOGO, ICONS, section_icon_html
-from ._sections._clones import render_clones_panel
-from ._sections._coupling import render_quality_panel
-from ._sections._dead_code import render_dead_code_panel
-from ._sections._dependencies import render_dependencies_panel
-from ._sections._meta import build_topbar_provenance_summary, render_meta_panel
-from ._sections._overview import render_overview_panel
-from ._sections._structural import render_structural_panel
-from ._sections._suggestions import render_suggestions_panel
+from .assets.css import build_css
+from .assets.js import build_js
+from .primitives.escape import _escape_html
+from .sections._clones import render_clones_panel
+from .sections._coupling import render_quality_panel
+from .sections._dead_code import render_dead_code_panel
+from .sections._dependencies import render_dependencies_panel
+from .sections._meta import build_topbar_provenance_summary, render_meta_panel
+from .sections._overview import render_overview_panel
+from .sections._structural import render_structural_panel
+from .sections._suggestions import render_suggestions_panel
+from .template import FONT_CSS_URL, REPORT_TEMPLATE
+from .widgets.icons import BRAND_LOGO, ICONS, section_icon_html
+from .widgets.snippets import _FileCache, _pygments_css
 
 if TYPE_CHECKING:
-    from ..models import GroupMapLike, MetricsDiff, StructuralFindingGroup, Suggestion
+    from ...models import GroupMapLike, MetricsDiff, StructuralFindingGroup, Suggestion
 
 
 def build_html_report(
@@ -124,6 +125,7 @@ def build_html_report(
             _as_mapping(ctx.overloaded_modules_map.get("summary")).get("candidates")
         )
         + coverage_review_items
+        + _as_int(_as_mapping(ctx.security_surfaces_map.get("summary")).get("items"))
     )
 
     def _tab_badge(count: int) -> str:
diff --git a/codeclone/report/html/assets/__init__.py b/codeclone/report/html/assets/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/html/assets/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
diff --git a/codeclone/_html_css.py b/codeclone/report/html/assets/css.py
similarity index 99%
rename from codeclone/_html_css.py
rename to codeclone/report/html/assets/css.py
index 66a4609..af4776d 100644
--- a/codeclone/_html_css.py
+++ b/codeclone/report/html/assets/css.py
@@ -360,6 +360,11 @@
 .insight-warn{border-left-color:var(--warning);background:var(--warning-muted)}
 .insight-risk{border-left-color:var(--error);background:var(--error-muted)}
 .insight-info{border-left-color:var(--info);background:var(--info-muted)}
+.insight-banner .overview-summary-grid{margin:0}
+.insight-banner .overview-summary-item{background:none;border:none;border-radius:0;padding:0}
+.insight-banner .overview-summary-label{font-size:.76rem;margin-bottom:var(--sp-2);
+padding-bottom:var(--sp-1);border-bottom:1px solid color-mix(in srgb,var(--border) 55%,transparent)}
+.insight-banner .overview-fact-row{font-size:.78rem}
 """
 
 # ---------------------------------------------------------------------------
@@ -844,7 +849,7 @@
 .stat-cards .kpi-detail,.dep-stats .kpi-detail{margin-top:0;align-self:end}
 .dep-graph-wrap{overflow:hidden;margin-bottom:var(--sp-4);border:1px solid var(--border);
 border-radius:var(--radius-lg);background:var(--bg-surface);padding:var(--sp-4)}
-.dep-graph-svg{width:100%;height:auto;max-height:520px}
+.dep-graph-svg{display:block;width:100%;height:auto;max-height:680px;margin:0 auto}
 .dep-graph-svg text{fill:var(--text-secondary);font-family:var(--font-mono)}
 .dep-node{transition:fill-opacity var(--dur-fast) var(--ease)}
 .dep-edge{transition:stroke-opacity var(--dur-fast) var(--ease)}
@@ -1129,7 +1134,6 @@
 .prov-copy-btn svg{width:12px;height:12px}
 """
 
-
 # ---------------------------------------------------------------------------
 # Shared micro-interactions
 # ---------------------------------------------------------------------------
@@ -1414,7 +1418,6 @@
 font-variant-numeric:tabular-nums;opacity:.85}
 """
 
-
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
diff --git a/codeclone/_html_js.py b/codeclone/report/html/assets/js.py
similarity index 100%
rename from codeclone/_html_js.py
rename to codeclone/report/html/assets/js.py
diff --git a/codeclone/report/html/primitives/__init__.py b/codeclone/report/html/primitives/__init__.py
new file mode 100644
index 0000000..9135843
--- /dev/null
+++ b/codeclone/report/html/primitives/__init__.py
@@ -0,0 +1,5 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/_html_data_attrs.py b/codeclone/report/html/primitives/data_attrs.py similarity index 96% rename from codeclone/_html_data_attrs.py rename to codeclone/report/html/primitives/data_attrs.py index d4e94f3..3c942a1 100644 --- a/codeclone/_html_data_attrs.py +++ b/codeclone/report/html/primitives/data_attrs.py @@ -8,7 +8,7 @@ from __future__ import annotations -from ._html_escape import _escape_html +from .escape import _escape_html __all__ = ["_build_data_attrs"] diff --git a/codeclone/_html_escape.py b/codeclone/report/html/primitives/escape.py similarity index 100% rename from codeclone/_html_escape.py rename to codeclone/report/html/primitives/escape.py diff --git a/codeclone/_html_filters.py b/codeclone/report/html/primitives/filters.py similarity index 97% rename from codeclone/_html_filters.py rename to codeclone/report/html/primitives/filters.py index e700fad..f578b16 100644 --- a/codeclone/_html_filters.py +++ b/codeclone/report/html/primitives/filters.py @@ -10,7 +10,7 @@ from collections.abc import Sequence -from ._html_escape import _escape_html +from .escape import _escape_html __all__ = [ "CLONE_TYPE_OPTIONS", diff --git a/codeclone/report/html/primitives/location.py b/codeclone/report/html/primitives/location.py new file mode 100644 index 0000000..859d0d8 --- /dev/null +++ b/codeclone/report/html/primitives/location.py @@ -0,0 +1,48 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Shared location/path helpers for HTML section renderers.""" + +from __future__ import annotations + +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .._context import ReportContext + + +def relative_location_path(ctx: ReportContext, item: Mapping[str, object]) -> str: + relative_path = str(item.get("relative_path", "")).strip() + if relative_path: + return relative_path + filepath = str(item.get("filepath", "")).strip() + if not filepath: + return "" + return ctx.relative_path(filepath).strip() + + +def location_file_target( + ctx: ReportContext, + item: Mapping[str, object], + *, + relative_path: str, +) -> str: + filepath = str(item.get("filepath", "")).strip() + if filepath: + path_obj = Path(filepath) + if path_obj.is_absolute(): + return filepath + if ctx.scan_root: + return str((Path(ctx.scan_root) / path_obj).resolve()) + return filepath + if ctx.scan_root and relative_path: + return str((Path(ctx.scan_root) / relative_path).resolve()) + return relative_path + + +__all__ = ["location_file_target", "relative_location_path"] diff --git a/codeclone/report/html/sections/__init__.py b/codeclone/report/html/sections/__init__.py new file mode 100644 index 0000000..9135843 --- /dev/null +++ b/codeclone/report/html/sections/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/_html_report/_sections/_clones.py b/codeclone/report/html/sections/_clones.py similarity index 97% rename from codeclone/_html_report/_sections/_clones.py rename to codeclone/report/html/sections/_clones.py index 65ab657..b18624f 100644 --- a/codeclone/_html_report/_sections/_clones.py +++ b/codeclone/report/html/sections/_clones.py @@ -11,29 +11,31 @@ from collections.abc import Mapping, Sequence from typing import TYPE_CHECKING, Literal -from ... import _coerce -from ..._html_badges import _micro_badges, _source_kind_badge_html, _stat_card -from ..._html_data_attrs import _build_data_attrs -from ..._html_escape import _escape_html -from ..._html_filters import CLONE_TYPE_OPTIONS, SPREAD_OPTIONS, _render_select -from ..._html_snippets import _render_code_block -from ...report._source_kinds import SOURCE_KIND_FILTER_VALUES -from ...report.derived import ( +from codeclone.findings.ids import clone_group_id +from codeclone.utils import coerce as _coerce + +from ..._source_kinds import SOURCE_KIND_FILTER_VALUES +from ...derived import ( combine_source_kinds, group_spread, report_location_from_group_item, ) -from ...report.explain_contract import format_group_instance_compare_meta -from ...report.json_contract import clone_group_id -from ...report.suggestions import classify_clone_type -from .._components import Tone, insight_block -from .._glossary import glossary_tip -from .._icons import ICONS -from .._tables import render_rows_table -from .._tabs import render_split_tabs +from ...explain_contract import format_group_instance_compare_meta +from ...suggestions import classify_clone_type +from ..primitives.data_attrs import _build_data_attrs +from ..primitives.escape import _escape_html +from ..primitives.filters import CLONE_TYPE_OPTIONS, SPREAD_OPTIONS, _render_select +from ..widgets.badges import _micro_badges, _source_kind_badge_html, _stat_card +from ..widgets.components import Tone, insight_block +from ..widgets.glossary import glossary_tip +from ..widgets.icons import ICONS +from ..widgets.snippets import _render_code_block +from ..widgets.tables import render_rows_table +from ..widgets.tabs import render_split_tabs if TYPE_CHECKING: - from ...models import GroupItemLike + from codeclone.models import GroupItemLike + from .._context import ReportContext _as_int = _coerce.as_int diff --git a/codeclone/_html_report/_sections/_coupling.py b/codeclone/report/html/sections/_coupling.py similarity index 94% rename from codeclone/_html_report/_sections/_coupling.py rename to codeclone/report/html/sections/_coupling.py index 08fdf22..23a860d 100644 --- a/codeclone/_html_report/_sections/_coupling.py +++ b/codeclone/report/html/sections/_coupling.py @@ -10,17 +10,22 @@ from typing import TYPE_CHECKING -from ... 
import _coerce -from ..._html_badges import _micro_badges, _render_chain_flow, _stat_card -from .._components import Tone, insight_block -from .._glossary import glossary_tip -from .._tables import render_rows_table -from .._tabs import render_split_tabs +from codeclone.utils import coerce as _coerce + +from ..widgets.badges import _micro_badges, _render_chain_flow, _stat_card +from ..widgets.components import Tone, insight_block +from ..widgets.glossary import glossary_tip +from ..widgets.tables import render_rows_table +from ..widgets.tabs import render_split_tabs from ._coverage_join import ( coverage_join_quality_count, coverage_join_quality_summary, render_coverage_join_panel, ) +from ._security_surfaces import ( + render_security_surfaces_panel, + security_surfaces_quality_count, +) if TYPE_CHECKING: from collections.abc import Mapping, Sequence @@ -269,12 +274,12 @@ def render_quality_panel(ctx: ReportContext) -> str: complexity_summary = _as_mapping(ctx.complexity_map.get("summary")) overloaded_modules_summary = _as_mapping(ctx.overloaded_modules_map.get("summary")) coverage_join_summary = coverage_join_quality_summary(ctx) - coupling_high_risk = _as_int(coupling_summary.get("high_risk")) cohesion_low = _as_int(cohesion_summary.get("low_cohesion")) complexity_high_risk = _as_int(complexity_summary.get("high_risk")) overloaded_module_candidates = _as_int(overloaded_modules_summary.get("candidates")) coverage_review_items = coverage_join_quality_count(ctx) + security_surface_items = security_surfaces_quality_count(ctx) coverage_hotspots = _as_int(coverage_join_summary.get("coverage_hotspots")) coverage_scope_gaps = _as_int(coverage_join_summary.get("scope_gap_hotspots")) coverage_join_status = str(coverage_join_summary.get("status", "")).strip() @@ -292,6 +297,7 @@ def render_quality_panel(ctx: ReportContext) -> str: f"high-coupling: {coupling_high_risk}; " f"low-cohesion: {cohesion_low}; " f"overloaded modules: {overloaded_module_candidates}; " + f"security surfaces: {security_surface_items}; " f"max CC {cc_max}; " f"max CBO {coupling_summary.get('max', 'n/a')}; " f"max LCOM4 {cohesion_summary.get('max', 'n/a')}." @@ -441,6 +447,16 @@ def render_quality_panel(ctx: ReportContext) -> str: coverage_join_panel, ) ) + security_surfaces_panel = render_security_surfaces_panel(ctx) + if security_surfaces_panel: + sub_tabs.append( + ( + "security-surfaces", + "Security Surfaces", + security_surface_items, + security_surfaces_panel, + ) + ) return insight_block( question="Are there quality hotspots in the codebase?", diff --git a/codeclone/_html_report/_sections/_coverage_join.py b/codeclone/report/html/sections/_coverage_join.py similarity index 94% rename from codeclone/_html_report/_sections/_coverage_join.py rename to codeclone/report/html/sections/_coverage_join.py index 5268d50..4821850 100644 --- a/codeclone/_html_report/_sections/_coverage_join.py +++ b/codeclone/report/html/sections/_coverage_join.py @@ -11,11 +11,13 @@ from pathlib import Path from typing import TYPE_CHECKING -from ... 
import _coerce -from ..._html_badges import _micro_badges, _stat_card, _tab_empty_info -from ..._html_escape import _escape_html -from .._glossary import glossary_tip -from .._tables import render_rows_table +from codeclone.utils import coerce as _coerce + +from ..primitives.escape import _escape_html +from ..primitives.location import location_file_target, relative_location_path +from ..widgets.badges import _micro_badges, _stat_card, _tab_empty_info +from ..widgets.glossary import glossary_tip +from ..widgets.tables import render_rows_table if TYPE_CHECKING: from collections.abc import Mapping @@ -199,7 +201,7 @@ def _coverage_join_empty_description() -> str: def _location_cell_html(ctx: ReportContext, item: Mapping[str, object]) -> str: - relative_path = str(item.get("relative_path", "")).strip() + relative_path = relative_location_path(ctx, item) start_line = _as_int(item.get("start_line")) end_line = _as_int(item.get("end_line")) line_label = ( @@ -209,11 +211,7 @@ def _location_cell_html(ctx: ReportContext, item: Mapping[str, object]) -> str: ) if end_line > start_line > 0: line_label = f"{relative_path}:{start_line}-{end_line}" - file_target = ( - f"{ctx.scan_root.rstrip('/')}/{relative_path}" - if ctx.scan_root and relative_path - else relative_path - ) + file_target = location_file_target(ctx, item, relative_path=relative_path) return ( f'' diff --git a/codeclone/_html_report/_sections/_dead_code.py b/codeclone/report/html/sections/_dead_code.py similarity index 94% rename from codeclone/_html_report/_sections/_dead_code.py rename to codeclone/report/html/sections/_dead_code.py index eaa5bd2..ffdad1d 100644 --- a/codeclone/_html_report/_sections/_dead_code.py +++ b/codeclone/report/html/sections/_dead_code.py @@ -10,12 +10,13 @@ from typing import TYPE_CHECKING -from ... import _coerce -from ..._html_badges import _micro_badges, _stat_card -from .._components import Tone, insight_block -from .._glossary import glossary_tip -from .._tables import render_rows_table -from .._tabs import render_split_tabs +from codeclone.utils import coerce as _coerce + +from ..widgets.badges import _micro_badges, _stat_card +from ..widgets.components import Tone, insight_block +from ..widgets.glossary import glossary_tip +from ..widgets.tables import render_rows_table +from ..widgets.tabs import render_split_tabs if TYPE_CHECKING: from collections.abc import Mapping diff --git a/codeclone/_html_report/_sections/_dependencies.py b/codeclone/report/html/sections/_dependencies.py similarity index 73% rename from codeclone/_html_report/_sections/_dependencies.py rename to codeclone/report/html/sections/_dependencies.py index b0df4af..c3fbbfb 100644 --- a/codeclone/_html_report/_sections/_dependencies.py +++ b/codeclone/report/html/sections/_dependencies.py @@ -12,29 +12,34 @@ from collections.abc import Mapping, Sequence from typing import TYPE_CHECKING -from ... 
import _coerce -from ..._html_badges import ( +from codeclone.utils import coerce as _coerce + +from ..primitives.escape import _escape_html +from ..widgets.badges import ( _micro_badges, _render_chain_flow, _short_label, _stat_card, _tab_empty, ) -from ..._html_escape import _escape_html -from .._components import Tone, insight_block -from .._glossary import glossary_tip -from .._tables import render_rows_table +from ..widgets.components import Tone, insight_block +from ..widgets.glossary import glossary_tip +from ..widgets.tables import render_rows_table if TYPE_CHECKING: from .._context import ReportContext _as_int = _coerce.as_int +_as_float = _coerce.as_float _as_mapping = _coerce.as_mapping _as_sequence = _coerce.as_sequence def _select_dep_nodes( edges: Sequence[tuple[str, str]], + *, + dep_cycles: Sequence[object], + longest_chains: Sequence[object], ) -> tuple[list[str], list[tuple[str, str]]]: all_nodes = sorted({part for edge in edges for part in edge}) if len(all_nodes) > 20: @@ -42,7 +47,38 @@ def _select_dep_nodes( for source, target in edges: degree_count[source] = degree_count.get(source, 0) + 1 degree_count[target] = degree_count.get(target, 0) + 1 - nodes = sorted(all_nodes, key=lambda node: -degree_count.get(node, 0))[:20] + all_node_set = set(all_nodes) + nodes: list[str] = [] + node_set: set[str] = set() + + def _seed_node(node: object) -> None: + node_name = str(node).strip() + if ( + not node_name + or node_name not in all_node_set + or node_name in node_set + or len(nodes) >= 20 + ): + return + nodes.append(node_name) + node_set.add(node_name) + + # Keep the visual graph aligned with the dependency tables. When we + # downsample a large graph, cycle members and longest-chain nodes must + # remain visible instead of being dropped behind high-degree hubs. 
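+    # Illustrative example (hypothetical module names): if pkg.api -> pkg.db
+    # -> pkg.api forms a cycle in a 40-module graph dominated by a
+    # high-degree pkg.utils hub, seeding admits pkg.api and pkg.db first,
+    # and the degree-sorted pass below only fills the remaining slots of
+    # the 20-node cap.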
+ for cycle in dep_cycles: + for node in _as_sequence(cycle): + _seed_node(node) + for chain in longest_chains: + for node in _as_sequence(chain): + _seed_node(node) + + for node in sorted( + all_nodes, key=lambda item: (-degree_count.get(item, 0), item) + ): + _seed_node(node) + if len(nodes) >= 20: + break nodes.sort() else: nodes = all_nodes @@ -107,17 +143,58 @@ def _build_layer_groups( def _layout_dep_graph( layer_groups: Mapping[int, Sequence[str]], + *, + in_degree: Mapping[str, int], + out_degree: Mapping[str, int], ) -> tuple[int, int, int, dict[str, tuple[float, float]]]: num_layers = max(layer_groups.keys(), default=0) + 1 max_per_layer = max((len(members) for members in layer_groups.values()), default=1) - width = max(600, min(1200, max_per_layer * 70 + 140)) - height = max(260, num_layers * 80 + 80) - pad_x, pad_y = 60.0, 40.0 + pad_x, pad_y = 56.0, 36.0 + prefer_horizontal = num_layers >= 6 and num_layers > max_per_layer + 2 + + def _ordered_members(members: Sequence[str]) -> list[str]: + if not prefer_horizontal or len(members) < 3: + return list(members) + ranked = sorted( + members, + key=lambda node: ( + -(in_degree.get(node, 0) + out_degree.get(node, 0)), + node, + ), + ) + center = (len(ranked) - 1) / 2 + slot_order = sorted( + range(len(ranked)), + key=lambda index: (abs(index - center), index), + ) + ordered = [""] * len(ranked) + for node, slot in zip(ranked, slot_order, strict=False): + ordered[slot] = node + return ordered + + if prefer_horizontal: + width = max(920, min(1600, num_layers * 118 + max_per_layer * 28 + 180)) + height = max(300, max_per_layer * 84 + 104) + else: + width = max(600, min(1200, max_per_layer * 70 + 140)) + height = max(260, num_layers * 80 + 80) positions: dict[str, tuple[float, float]] = {} for layer_index in range(num_layers): members = layer_groups.get(layer_index, []) count = len(members) + if prefer_horizontal: + members = _ordered_members(members) + layer_step = (width - 2 * pad_x) / max(1, num_layers - 1) + x = pad_x + layer_index * layer_step + fan = min(14.0, layer_step * 0.12) + offset_unit = fan / max(1, count - 1) + center = (count - 1) / 2 + for index, node in enumerate(members): + y = pad_y + (index + 0.5) * ((height - 2 * pad_y) / max(1, count)) + positions[node] = (x + (index - center) * offset_unit, y) + continue + y = pad_y + layer_index * ((height - 2 * pad_y) / max(1, num_layers - 1)) for index, node in enumerate(members): x = pad_x + (index + 0.5) * ((width - 2 * pad_x) / max(1, count)) @@ -222,10 +299,11 @@ def _render_dep_nodes_and_labels( cycle_node_set: set[str], hub_threshold: int, max_per_layer: int, + prefer_horizontal: bool, ) -> tuple[list[str], list[str]]: nodes_svg: list[str] = [] labels_svg: list[str] = [] - rotate_labels = max_per_layer > 6 + rotate_labels = prefer_horizontal or max_per_layer > 6 for node in nodes: x, y = positions[node] @@ -234,6 +312,7 @@ def _render_dep_nodes_and_labels( label = _short_label(node) is_cycle = node in cycle_node_set is_hub = degree >= hub_threshold and degree > 2 + is_secondary = not is_hub and not is_cycle if is_cycle: fill, fill_opacity, extra = ( @@ -258,19 +337,25 @@ def _render_dep_nodes_and_labels( f'fill="{fill}" fill-opacity="{fill_opacity}" {extra}/>' ) - font_size = "10" if is_hub else "9" + font_size = "10" if is_hub else ("8" if is_secondary else "9") if rotate_labels: + label_x = ( + x + radius + (4 if is_secondary else 6 if prefer_horizontal else 0) + ) + label_y = ( + y - radius - (1 if is_secondary else 2 if prefer_horizontal else 6) + ) labels_svg.append( 
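+                # Rotated branch: tilt the label -45 degrees and anchor it
+                # just outside the node radius; secondary (non-hub,
+                # non-cycle) nodes get tighter offsets and font-size 8 so
+                # dense horizontal layouts stay legible.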
f'' + f'transform="translate({label_x:.1f},{label_y:.1f}) rotate(-45)">' f"{_escape_html(node)}{_escape_html(label)}" ) continue labels_svg.append( f'' + f'x="{x:.1f}" y="{y - radius - (4 if is_secondary else 5):.1f}" font-size="{font_size}" text-anchor="middle">' f"{_escape_html(node)}{_escape_html(label)}" ) @@ -281,14 +366,24 @@ def _render_dep_svg( edges: Sequence[tuple[str, str]], cycle_node_set: set[str], dep_cycles: Sequence[object], + longest_chains: Sequence[object], ) -> str: if not edges: return _tab_empty("Dependency graph is not available.") - nodes, filtered_edges = _select_dep_nodes(edges) + nodes, filtered_edges = _select_dep_nodes( + edges, + dep_cycles=dep_cycles, + longest_chains=longest_chains, + ) in_degree, out_degree = _build_degree_maps(nodes, filtered_edges) layer_groups = _build_layer_groups(nodes, filtered_edges, in_degree, out_degree) - width, height, max_per_layer, positions = _layout_dep_graph(layer_groups) + width, height, max_per_layer, positions = _layout_dep_graph( + layer_groups, + in_degree=in_degree, + out_degree=out_degree, + ) + prefer_horizontal = width > height hub_threshold = _hub_threshold(nodes, in_degree, out_degree) node_radii = _build_node_radii( nodes, @@ -309,15 +404,19 @@ def _render_dep_svg( cycle_node_set=cycle_node_set, hub_threshold=hub_threshold, max_per_layer=max_per_layer, + prefer_horizontal=prefer_horizontal, ) - label_pad = 50 if max_per_layer > 6 else 0 + label_pad = 44 if prefer_horizontal else (50 if max_per_layer > 6 else 0) + label_pad_x = 52 if prefer_horizontal else (28 if max_per_layer > 6 else 0) + vb_x = -label_pad_x vb_y = -label_pad + vb_w = width + label_pad_x * 2 vb_h = height + label_pad return ( '
' - f'' f"{defs}{''.join(edge_svg)}{''.join(node_svg)}{''.join(label_svg)}" @@ -327,6 +426,7 @@ def _render_dep_svg( def render_dependencies_panel(ctx: ReportContext) -> str: dep_cycles = _as_sequence(ctx.dependencies_map.get("cycles")) + dep_longest = _as_sequence(ctx.dependencies_map.get("longest_chains")) dep_edge_data = _as_sequence(ctx.dependencies_map.get("edge_list")) dep_edges = [ (str(_as_mapping(r).get("source", "")), str(_as_mapping(r).get("target", ""))) @@ -342,11 +442,25 @@ def render_dependencies_panel(ctx: ReportContext) -> str: dep_module_count = _as_int(ctx.dependencies_map.get("modules")) dep_edge_count = _as_int(ctx.dependencies_map.get("edges")) dep_max_depth = _as_int(ctx.dependencies_map.get("max_depth")) + dep_avg_depth = _as_float(ctx.dependencies_map.get("avg_depth")) + dep_p95_depth = _as_int(ctx.dependencies_map.get("p95_depth")) cycle_count = len(dep_cycles) + dependency_health = _as_int( + _as_mapping(ctx.health_map.get("dimensions")).get("dependencies"), + ) dep_avg = ( f"{dep_edge_count / dep_module_count:.1f}" if dep_module_count > 0 else "n/a" ) + dep_avg_depth_label = f"{dep_avg_depth:.1f}" if dep_module_count > 0 else "n/a" + + dependency_tone: Tone + if cycle_count > 0: + dependency_tone = "risk" + elif dependency_health < 100: + dependency_tone = "warn" + else: + dependency_tone = "ok" cards = [ _stat_card( @@ -366,8 +480,13 @@ def render_dependencies_panel(ctx: ReportContext) -> str: _stat_card( "Max depth", dep_max_depth, - detail=_micro_badges(("target", "< 8")), - value_tone="warn" if dep_max_depth > 8 else "good", + detail=_micro_badges( + ("avg", dep_avg_depth_label), + ("p95", dep_p95_depth), + ), + value_tone="bad" + if cycle_count > 0 + else ("warn" if dependency_health < 100 else "good"), css_class="meta-item", glossary_tip_fn=glossary_tip, ), @@ -386,7 +505,12 @@ def render_dependencies_panel(ctx: ReportContext) -> str: ] # SVG graph - graph_svg = _render_dep_svg(dep_edges, cycle_node_set, dep_cycles) + graph_svg = _render_dep_svg( + dep_edges, + cycle_node_set, + dep_cycles, + dep_longest, + ) # Hub bar deg_map = dict.fromkeys(sorted({p for e in dep_edges for p in e}), 0) @@ -423,7 +547,6 @@ def render_dependencies_panel(ctx: ReportContext) -> str: (_render_chain_flow([str(p) for p in _as_sequence(c)], arrows=True),) for c in dep_cycles ] - dep_longest = _as_sequence(ctx.dependencies_map.get("longest_chains")) dep_chain_rows = [ ( _render_chain_flow([str(p) for p in _as_sequence(ch)], arrows=True), @@ -438,13 +561,11 @@ def render_dependencies_panel(ctx: ReportContext) -> str: if not ctx.metrics_available: answer, tone = "Metrics are skipped for this run.", "info" else: - answer = f"Cycles: {cycle_count}; max dependency depth: {dep_max_depth}." - if cycle_count > 0: - tone = "risk" - elif dep_max_depth > 8: - tone = "warn" - else: - tone = "ok" + answer = ( + f"Cycles: {cycle_count}; avg depth: {dep_avg_depth_label}; " + f"p95 depth: {dep_p95_depth}; max dependency depth: {dep_max_depth}." + ) + tone = dependency_tone return ( insight_block( diff --git a/codeclone/_html_report/_sections/_meta.py b/codeclone/report/html/sections/_meta.py similarity index 98% rename from codeclone/_html_report/_sections/_meta.py rename to codeclone/report/html/sections/_meta.py index 2704446..e66cb4e 100644 --- a/codeclone/_html_report/_sections/_meta.py +++ b/codeclone/report/html/sections/_meta.py @@ -10,11 +10,13 @@ from typing import TYPE_CHECKING -from ... 
import __version__, _coerce -from ..._html_data_attrs import _build_data_attrs -from ..._html_escape import _escape_html, _meta_display +from codeclone import __version__ +from codeclone.utils import coerce as _coerce + from .._context import _meta_pick -from .._glossary import glossary_tip +from ..primitives.data_attrs import _build_data_attrs +from ..primitives.escape import _escape_html, _meta_display +from ..widgets.glossary import glossary_tip if TYPE_CHECKING: from .._context import ReportContext diff --git a/codeclone/_html_report/_sections/_overview.py b/codeclone/report/html/sections/_overview.py similarity index 99% rename from codeclone/_html_report/_sections/_overview.py rename to codeclone/report/html/sections/_overview.py index 7afdf6c..c9ac47b 100644 --- a/codeclone/_html_report/_sections/_overview.py +++ b/codeclone/report/html/sections/_overview.py @@ -12,22 +12,23 @@ from collections.abc import Mapping from typing import TYPE_CHECKING -from ... import _coerce -from ..._html_badges import ( +from codeclone.utils import coerce as _coerce + +from ..primitives.escape import _escape_html +from ..widgets.badges import ( _inline_empty, _micro_badges, _source_kind_badge_html, _stat_card, ) -from ..._html_escape import _escape_html -from .._components import ( +from ..widgets.components import ( Tone, insight_block, overview_cluster_header, overview_source_breakdown_html, overview_summary_item_html, ) -from .._glossary import glossary_tip +from ..widgets.glossary import glossary_tip if TYPE_CHECKING: from .._context import ReportContext diff --git a/codeclone/report/html/sections/_security_surfaces.py b/codeclone/report/html/sections/_security_surfaces.py new file mode 100644 index 0000000..0d853ec --- /dev/null +++ b/codeclone/report/html/sections/_security_surfaces.py @@ -0,0 +1,390 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Security Surfaces HTML helpers for Quality tab rendering.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from codeclone.utils import coerce as _coerce + +from ..primitives.escape import _escape_html +from ..primitives.location import location_file_target, relative_location_path +from ..widgets.badges import _micro_badges, _stat_card, _tab_empty_info +from ..widgets.components import overview_summary_item_html +from ..widgets.glossary import glossary_tip +from ..widgets.tables import render_rows_table + +if TYPE_CHECKING: + from collections.abc import Mapping + + from .._context import ReportContext + +_as_int = _coerce.as_int +_as_mapping = _coerce.as_mapping +_as_sequence = _coerce.as_sequence + + +def security_surfaces_quality_count(ctx: ReportContext) -> int: + return _as_int(_security_surfaces_summary(ctx).get("items")) + + +def render_security_surfaces_panel(ctx: ReportContext) -> str: + summary = _security_surfaces_summary(ctx) + if not summary: + return "" + items = tuple( + map(_as_mapping, _as_sequence(ctx.security_surfaces_map.get("items"))) + ) + if not items: + return _tab_empty_info( + "No security-relevant capability surfaces matched the exact registry.", + detail_html=( + "This inventory is report-only and focuses on exact boundary " + "capabilities rather than vulnerability claims." 
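+            # The empty state is the expected outcome when the
+            # exact-evidence registry (imports/calls/builtins) matches
+            # nothing; the panel is report-only in both the empty and
+            # populated cases.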
+ ), + ) + cards = [ + _stat_card( + "Surfaces", + _as_int(summary.get("items")), + detail=_micro_badges(("report", "only"), ("evidence", "exact")), + value_tone="warn" if _as_int(summary.get("items")) > 0 else "muted", + css_class="meta-item", + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Categories", + _as_int(summary.get("category_count")), + detail=_micro_badges(("modules", _as_int(summary.get("modules")))), + css_class="meta-item", + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Production", + _as_int(summary.get("production")), + detail=_micro_badges(("tests", _as_int(summary.get("tests")))), + css_class="meta-item", + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Exact items", + _as_int(summary.get("exact_items")), + detail=_micro_badges(("fixtures", _as_int(summary.get("fixtures")))), + css_class="meta-item", + glossary_tip_fn=glossary_tip, + ), + ] + return ( + f'
{"".join(cards)}
' + + _security_surfaces_context_html(ctx, items) + + '

Security-relevant capability inventory

' + + render_rows_table( + headers=( + "Category", + "Capability", + "Evidence", + "Source", + "Location", + "Review", + ), + rows=_security_surface_rows(ctx, items), + empty_message="No exact security surfaces are available.", + empty_description=( + "CodeClone inventories trust-boundary capabilities but does not " + "claim vulnerabilities or exploitability." + ), + raw_html_headers=("Location",), + ctx=ctx, + ) + ) + + +def _security_surfaces_summary(ctx: ReportContext) -> Mapping[str, object]: + return _as_mapping(ctx.security_surfaces_map.get("summary")) + + +def _security_surface_rows( + ctx: ReportContext, + items: tuple[Mapping[str, object], ...], +) -> list[tuple[str, str, str, str, str, str]]: + coverage_index = _coverage_review_index(ctx) + return [ + ( + _humanize(str(item.get("category", ""))), + _humanize(str(item.get("capability", ""))), + str(item.get("evidence_symbol", "")).strip() or "(unknown)", + _humanize(str(item.get("source_kind", ""))), + _location_cell_html(ctx, item), + _review_cell_text(ctx, item, coverage_index=coverage_index), + ) + for item in items[:50] + ] + + +def _location_cell_html(ctx: ReportContext, item: Mapping[str, object]) -> str: + relative_path = relative_location_path(ctx, item) + qualname = str(item.get("qualname", "")).strip() + start_line = _as_int(item.get("start_line")) + end_line = _as_int(item.get("end_line")) + file_target = location_file_target(ctx, item, relative_path=relative_path) + line_label = ( + f"{relative_path}:{start_line}" + if start_line > 0 + else (relative_path or "(unknown)") + ) + if end_line > start_line > 0: + line_label = f"{relative_path}:{start_line}-{end_line}" + title = qualname or line_label or "(unknown)" + return ( + f'
' + f"{_escape_html(line_label)}" + ) + + +def _security_surfaces_context_html( + ctx: ReportContext, + items: tuple[Mapping[str, object], ...], +) -> str: + review_order_rows = _security_review_order_rows(ctx, items) + return ( + '
' + '
How should I review this inventory?
' + '
' + '
' + + overview_summary_item_html( + label="How to read", + body_html=_fact_list_html( + ( + ("Signal", "boundary inventory", None), + ("Evidence", "exact imports/calls/builtins", None), + ("Meaning", "inventory, not vulnerability proof", None), + ) + ), + ) + + overview_summary_item_html( + label="Review order", + body_html=_fact_list_html(review_order_rows), + ) + + "
" + ) + + +def _security_review_order_rows( + ctx: ReportContext, + items: tuple[Mapping[str, object], ...], +) -> tuple[tuple[str, str, str | None], ...]: + production_callable_count = sum( + 1 for item in items if _is_production_callable(item) + ) + non_callable_count = sum( + 1 for item in items if str(item.get("location_scope", "")).strip() != "callable" + ) + coverage_index = _coverage_review_index(ctx) + coverage_overlap_total = 0 + coverage_scope_gaps = 0 + coverage_hotspots = 0 + for item in items: + if not _is_production_callable(item): + continue + cues = _coverage_review_cues(ctx, item, coverage_index=coverage_index) + if cues["overlap"]: + coverage_overlap_total += 1 + coverage_scope_gaps += 1 if cues["scope_gap_hotspot"] else 0 + coverage_hotspots += 1 if cues["coverage_hotspot"] else 0 + + return ( + ( + "Start with", + ( + f"{production_callable_count} " + f"{_pluralize(production_callable_count, 'production callable')}" + if production_callable_count > 0 + else "production module rows only" + ), + "warn" if production_callable_count > 0 else None, + ), + ( + "Coverage join", + _coverage_join_review_text( + ctx, + overlap_total=coverage_overlap_total, + scope_gaps=coverage_scope_gaps, + hotspots=coverage_hotspots, + ), + "warn" if coverage_overlap_total > 0 else None, + ), + ( + "Then review", + ( + f"{non_callable_count} " + f"{_pluralize(non_callable_count, 'module/class inventory row')}" + if non_callable_count > 0 + else "no inventory-only rows" + ), + None, + ), + ) + + +def _coverage_join_review_text( + ctx: ReportContext, + *, + overlap_total: int, + scope_gaps: int, + hotspots: int, +) -> str: + coverage_join = _as_mapping(_as_mapping(ctx.metrics_map).get("coverage_join")) + coverage_summary = _as_mapping(coverage_join.get("summary")) + if str(coverage_summary.get("status", "")).strip() != "ok": + return "unavailable for this run" + if overlap_total <= 0: + return "no overlap in current review set" + parts = [f"{overlap_total} {_pluralize(overlap_total, 'overlap')}"] + if scope_gaps > 0: + parts.append(f"{scope_gaps} {_pluralize(scope_gaps, 'scope gap')}") + if hotspots > 0: + parts.append(f"{hotspots} {_pluralize(hotspots, 'low-coverage overlap')}") + return " · ".join(parts) + + +def _review_cell_text( + ctx: ReportContext, + item: Mapping[str, object], + *, + coverage_index: Mapping[tuple[str, str], Mapping[str, bool]], +) -> str: + location_scope = str(item.get("location_scope", "")).strip() + scope_text = _humanize(location_scope) + if location_scope == "module": + return f"{scope_text} · capability present" + cues = _coverage_review_cues(ctx, item, coverage_index=coverage_index) + if cues["scope_gap_hotspot"]: + return f"{scope_text} · scope gap" + if cues["coverage_hotspot"]: + return f"{scope_text} · low coverage" + return f"{scope_text} · exact evidence" + + +def _coverage_review_cues( + ctx: ReportContext, + item: Mapping[str, object], + *, + coverage_index: Mapping[tuple[str, str], Mapping[str, bool]], +) -> Mapping[str, bool]: + relative_path = relative_location_path(ctx, item) + qualname = str(item.get("qualname", "")).strip() + if not relative_path or not qualname: + return { + "overlap": False, + "coverage_hotspot": False, + "scope_gap_hotspot": False, + } + return coverage_index.get( + (relative_path, qualname), + { + "overlap": False, + "coverage_hotspot": False, + "scope_gap_hotspot": False, + }, + ) + + +def _coverage_review_index( + ctx: ReportContext, +) -> dict[tuple[str, str], dict[str, bool]]: + coverage_join = 
_as_mapping(_as_mapping(ctx.metrics_map).get("coverage_join")) + coverage_summary = _as_mapping(coverage_join.get("summary")) + if str(coverage_summary.get("status", "")).strip() != "ok": + return {} + index: dict[tuple[str, str], dict[str, bool]] = {} + for item in map(_as_mapping, _as_sequence(coverage_join.get("items"))): + item_key = _coverage_review_item_key(ctx, item) + if item_key is None: + continue + entry = index.setdefault( + item_key, + { + "overlap": True, + "coverage_hotspot": False, + "scope_gap_hotspot": False, + }, + ) + entry["coverage_hotspot"] = entry["coverage_hotspot"] or bool( + item.get("coverage_hotspot") + ) + entry["scope_gap_hotspot"] = entry["scope_gap_hotspot"] or bool( + item.get("scope_gap_hotspot") + ) + return index + + +def _is_production_callable(item: Mapping[str, object]) -> bool: + return ( + str(item.get("source_kind", "")).strip() == "production" + and str(item.get("location_scope", "")).strip() == "callable" + ) + + +def _coverage_review_key( + ctx: ReportContext, + item: Mapping[str, object], +) -> tuple[str, str] | None: + relative_path = relative_location_path(ctx, item) + qualname = str(item.get("qualname", "")).strip() + if not relative_path or not qualname: + return None + return (relative_path, qualname) + + +def _coverage_review_item_key( + ctx: ReportContext, + item: Mapping[str, object], +) -> tuple[str, str] | None: + if not ( + bool(item.get("coverage_review_item")) + or bool(item.get("coverage_hotspot")) + or bool(item.get("scope_gap_hotspot")) + ): + return None + return _coverage_review_key(ctx, item) + + +def _fact_list_html( + rows: tuple[tuple[str, str, str | None], ...], +) -> str: + return ( + '
' + + "".join( + '
' + f'{_escape_html(label)}' + f'' + f"{_escape_html(value)}
" + for label, value, tone in rows + ) + + "
" + ) + + +def _pluralize(count: int, singular: str, plural: str | None = None) -> str: + if count == 1: + return singular + return plural or f"{singular}s" + + +def _humanize(value: str) -> str: + text = value.strip().replace("_", " ") + return text if not text else text[0].upper() + text[1:] + + +__all__ = [ + "render_security_surfaces_panel", + "security_surfaces_quality_count", +] diff --git a/codeclone/_html_report/_sections/_structural.py b/codeclone/report/html/sections/_structural.py similarity index 96% rename from codeclone/_html_report/_sections/_structural.py rename to codeclone/report/html/sections/_structural.py index d86428d..e2ba87d 100644 --- a/codeclone/_html_report/_sections/_structural.py +++ b/codeclone/report/html/sections/_structural.py @@ -10,35 +10,37 @@ from typing import TYPE_CHECKING -from ..._html_badges import _source_kind_badge_html, _tab_empty -from ..._html_escape import _escape_html -from ..._html_snippets import _FileCache, _render_code_block -from ...domain.findings import ( +from codeclone.domain.findings import ( STRUCTURAL_KIND_CLONE_COHORT_DRIFT, STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE, STRUCTURAL_KIND_DUPLICATED_BRANCHES, ) -from ...domain.quality import RISK_HIGH, RISK_LOW -from ...report._source_kinds import SOURCE_KIND_FILTER_VALUES, source_kind_label -from ...report.derived import ( +from codeclone.domain.quality import RISK_HIGH, RISK_LOW +from codeclone.findings.ids import structural_group_id +from codeclone.findings.structural.detectors import normalize_structural_findings + +from ..._source_kinds import SOURCE_KIND_FILTER_VALUES, source_kind_label +from ...derived import ( combine_source_kinds, group_spread, relative_report_path, report_location_from_structural_occurrence, ) -from ...report.findings import _dedupe_items, _finding_scope_text, _spread -from ...report.json_contract import structural_group_id -from ...report.suggestions import ( +from ...findings import _dedupe_items, _finding_scope_text, _spread +from ...suggestions import ( structural_action_steps, structural_has_separate_suggestion, ) -from ...structural_findings import normalize_structural_findings -from .._tabs import render_split_tabs +from ..primitives.escape import _escape_html +from ..widgets.badges import _source_kind_badge_html, _tab_empty +from ..widgets.snippets import _FileCache, _render_code_block +from ..widgets.tabs import render_split_tabs if TYPE_CHECKING: from collections.abc import Sequence - from ...models import StructuralFindingGroup, StructuralFindingOccurrence + from codeclone.models import StructuralFindingGroup, StructuralFindingOccurrence + from .._context import ReportContext __all__ = [ @@ -46,7 +48,6 @@ "render_structural_panel", ] - _KIND_LABEL: dict[str, str] = { STRUCTURAL_KIND_DUPLICATED_BRANCHES: "Duplicated branches", STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: "Clone guard/exit divergence", diff --git a/codeclone/_html_report/_sections/_suggestions.py b/codeclone/report/html/sections/_suggestions.py similarity index 95% rename from codeclone/_html_report/_sections/_suggestions.py rename to codeclone/report/html/sections/_suggestions.py index b0d8d04..f5ad11b 100644 --- a/codeclone/_html_report/_sections/_suggestions.py +++ b/codeclone/report/html/sections/_suggestions.py @@ -11,12 +11,7 @@ from collections.abc import Mapping, Sequence from typing import TYPE_CHECKING -from ... 
import _coerce -from ..._html_badges import _micro_badges, _stat_card, _tab_empty -from ..._html_data_attrs import _build_data_attrs -from ..._html_escape import _escape_html -from ..._html_filters import SPREAD_OPTIONS, _render_select -from ...domain.findings import ( +from codeclone.domain.findings import ( CATEGORY_CLONE, CATEGORY_COHESION, CATEGORY_COMPLEXITY, @@ -28,13 +23,20 @@ FAMILY_METRICS, FAMILY_STRUCTURAL, ) -from ...domain.quality import SEVERITY_CRITICAL, SEVERITY_INFO, SEVERITY_WARNING -from ...report._source_kinds import SOURCE_KIND_FILTER_VALUES, source_kind_label -from .._components import insight_block -from .._glossary import glossary_tip +from codeclone.domain.quality import SEVERITY_CRITICAL, SEVERITY_INFO, SEVERITY_WARNING +from codeclone.utils import coerce as _coerce + +from ..._source_kinds import SOURCE_KIND_FILTER_VALUES, source_kind_label +from ..primitives.data_attrs import _build_data_attrs +from ..primitives.escape import _escape_html +from ..primitives.filters import SPREAD_OPTIONS, _render_select +from ..widgets.badges import _micro_badges, _stat_card, _tab_empty +from ..widgets.components import insight_block +from ..widgets.glossary import glossary_tip if TYPE_CHECKING: - from ...models import Suggestion + from codeclone.models import Suggestion + from .._context import ReportContext _as_int = _coerce.as_int diff --git a/codeclone/templates.py b/codeclone/report/html/template.py similarity index 96% rename from codeclone/templates.py rename to codeclone/report/html/template.py index a13cb31..b083d7d 100644 --- a/codeclone/templates.py +++ b/codeclone/report/html/template.py @@ -45,3 +45,8 @@ """ ) + +__all__ = [ + "FONT_CSS_URL", + "REPORT_TEMPLATE", +] diff --git a/codeclone/report/html/widgets/__init__.py b/codeclone/report/html/widgets/__init__.py new file mode 100644 index 0000000..9135843 --- /dev/null +++ b/codeclone/report/html/widgets/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/_html_badges.py b/codeclone/report/html/widgets/badges.py similarity index 98% rename from codeclone/_html_badges.py rename to codeclone/report/html/widgets/badges.py index 716d1ad..ba77e0a 100644 --- a/codeclone/_html_badges.py +++ b/codeclone/report/html/widgets/badges.py @@ -18,8 +18,7 @@ from collections.abc import Callable, Sequence -from ._html_escape import _escape_html -from .domain.quality import ( +from codeclone.domain.quality import ( EFFORT_EASY, EFFORT_HARD, EFFORT_MODERATE, @@ -30,7 +29,9 @@ SEVERITY_INFO, SEVERITY_WARNING, ) -from .report._source_kinds import normalize_source_kind, source_kind_label + +from ..._source_kinds import normalize_source_kind, source_kind_label +from ..primitives.escape import _escape_html __all__ = [ "CHECK_CIRCLE_SVG", diff --git a/codeclone/_html_report/_components.py b/codeclone/report/html/widgets/components.py similarity index 95% rename from codeclone/_html_report/_components.py rename to codeclone/report/html/widgets/components.py index 7a9fcae..e970332 100644 --- a/codeclone/_html_report/_components.py +++ b/codeclone/report/html/widgets/components.py @@ -11,10 +11,11 @@ from collections.abc import Mapping from typing import Literal -from .._coerce import as_int as _as_int -from .._html_badges import _inline_empty, _source_kind_badge_html -from .._html_escape import _escape_html -from ._icons import section_icon_html +from codeclone.utils.coerce import as_int as _as_int + +from ..primitives.escape import _escape_html +from .badges import _inline_empty, _source_kind_badge_html +from .icons import section_icon_html Tone = Literal["ok", "warn", "risk", "info"] diff --git a/codeclone/_html_report/_glossary.py b/codeclone/report/html/widgets/glossary.py similarity index 96% rename from codeclone/_html_report/_glossary.py rename to codeclone/report/html/widgets/glossary.py index e48d4f0..19ab172 100644 --- a/codeclone/_html_report/_glossary.py +++ b/codeclone/report/html/widgets/glossary.py @@ -8,7 +8,7 @@ from __future__ import annotations -from .._html_escape import _escape_html +from ..primitives.escape import _escape_html GLOSSARY: dict[str, str] = { # Complexity @@ -46,7 +46,9 @@ # Dependency stat cards "modules": "Total number of Python modules analyzed", "edges": "Total number of import relationships between modules", - "max depth": "Longest chain of transitive imports", + "max depth": ( + "Longest internal transitive import chain; compare with avg and p95 depth" + ), "cycles": "Number of circular import dependencies detected", # Complexity stat cards "high-risk functions": ( diff --git a/codeclone/_html_report/_icons.py b/codeclone/report/html/widgets/icons.py similarity index 100% rename from codeclone/_html_report/_icons.py rename to codeclone/report/html/widgets/icons.py diff --git a/codeclone/_html_snippets.py b/codeclone/report/html/widgets/snippets.py similarity index 94% rename from codeclone/_html_snippets.py rename to codeclone/report/html/widgets/snippets.py index dac7eec..eeac1d7 100644 --- a/codeclone/_html_snippets.py +++ b/codeclone/report/html/widgets/snippets.py @@ -10,9 +10,9 @@ import importlib from dataclasses import dataclass from functools import lru_cache -from typing import TYPE_CHECKING, NamedTuple, cast +from typing import TYPE_CHECKING, NamedTuple -from .errors import FileProcessingError +from ....contracts.errors import FileProcessingError if TYPE_CHECKING: from types import ModuleType @@ -69,7 +69,13 @@ class 
_CacheInfo(NamedTuple): currsize: int def cache_info(self) -> _CacheInfo: - return cast("_FileCache._CacheInfo", self._get_file_lines_impl.cache_info()) + info = self._get_file_lines_impl.cache_info() + return self._CacheInfo( + hits=info.hits, + misses=info.misses, + maxsize=info.maxsize, + currsize=info.currsize, + ) _PYGMENTS_IMPORTER_ID: int | None = None diff --git a/codeclone/_html_report/_tables.py b/codeclone/report/html/widgets/tables.py similarity index 95% rename from codeclone/_html_report/_tables.py rename to codeclone/report/html/widgets/tables.py index 7f633f2..59cbf0e 100644 --- a/codeclone/_html_report/_tables.py +++ b/codeclone/report/html/widgets/tables.py @@ -11,12 +11,12 @@ from collections.abc import Collection, Sequence from typing import TYPE_CHECKING -from .._html_badges import _quality_badge_html, _tab_empty -from .._html_escape import _escape_html -from ._glossary import glossary_tip +from ..primitives.escape import _escape_html +from .badges import _quality_badge_html, _tab_empty +from .glossary import glossary_tip if TYPE_CHECKING: - from ._context import ReportContext + from .._context import ReportContext _RISK_HEADERS = {"risk", "confidence", "severity", "effort"} _PATH_HEADERS = {"file", "location"} diff --git a/codeclone/_html_report/_tabs.py b/codeclone/report/html/widgets/tabs.py similarity index 97% rename from codeclone/_html_report/_tabs.py rename to codeclone/report/html/widgets/tabs.py index 8ce1e43..5b708e3 100644 --- a/codeclone/_html_report/_tabs.py +++ b/codeclone/report/html/widgets/tabs.py @@ -10,7 +10,7 @@ from collections.abc import Sequence -from .._html_escape import _escape_html +from ..primitives.escape import _escape_html def render_split_tabs( diff --git a/codeclone/report/json_contract.py b/codeclone/report/json_contract.py deleted file mode 100644 index decfeb6..0000000 --- a/codeclone/report/json_contract.py +++ /dev/null @@ -1,2918 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-# SPDX-License-Identifier: MPL-2.0 -# Copyright (c) 2026 Den Rozhnovskiy - -from __future__ import annotations - -from collections import Counter -from collections.abc import Collection, Iterable, Mapping, Sequence -from hashlib import sha256 -from typing import TYPE_CHECKING, Literal - -import orjson - -from .._coerce import as_float as _as_float -from .._coerce import as_int as _as_int -from .._coerce import as_mapping as _as_mapping -from .._coerce import as_sequence as _as_sequence -from ..contracts import ( - DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - REPORT_SCHEMA_VERSION, -) -from ..domain.findings import ( - CATEGORY_COHESION, - CATEGORY_COMPLEXITY, - CATEGORY_COUPLING, - CATEGORY_COVERAGE, - CATEGORY_DEAD_CODE, - CATEGORY_DEPENDENCY, - CLONE_KIND_BLOCK, - CLONE_KIND_FUNCTION, - CLONE_KIND_SEGMENT, - CLONE_NOVELTY_KNOWN, - CLONE_NOVELTY_NEW, - FAMILY_CLONE, - FAMILY_CLONES, - FAMILY_DEAD_CODE, - FAMILY_DESIGN, - FAMILY_STRUCTURAL, - FINDING_KIND_COVERAGE_HOTSPOT, - FINDING_KIND_COVERAGE_SCOPE_GAP, -) -from ..domain.quality import ( - CONFIDENCE_HIGH, - CONFIDENCE_MEDIUM, - EFFORT_EASY, - EFFORT_HARD, - EFFORT_MODERATE, - EFFORT_WEIGHT, - RISK_LOW, - SEVERITY_CRITICAL, - SEVERITY_INFO, - SEVERITY_ORDER, - SEVERITY_RANK, - SEVERITY_WARNING, -) -from ..domain.source_scope import ( - IMPACT_SCOPE_MIXED, - IMPACT_SCOPE_NON_RUNTIME, - IMPACT_SCOPE_RUNTIME, - SOURCE_KIND_FIXTURES, - SOURCE_KIND_MIXED, - SOURCE_KIND_OTHER, - SOURCE_KIND_PRODUCTION, - SOURCE_KIND_TESTS, -) -from ..structural_findings import normalize_structural_findings -from ..suppressions import INLINE_CODECLONE_SUPPRESSION_SOURCE -from .derived import ( - group_spread, - relative_report_path, - report_location_from_group_item, - report_location_from_structural_occurrence, -) -from .derived import ( - normalized_source_kind as _normalized_source_kind, -) -from .derived import ( - source_scope_from_counts as _report_source_scope_from_counts, -) -from .derived import ( - source_scope_from_locations as _report_source_scope_from_locations, -) -from .overview import build_directory_hotspots -from .suggestions import classify_clone_type - -if TYPE_CHECKING: - from ..models import ( - GroupItemLike, - GroupMapLike, - SourceKind, - StructuralFindingGroup, - Suggestion, - SuppressedCloneGroup, - ) - -__all__ = [ - "build_report_document", - "clone_group_id", - "dead_code_group_id", - "design_group_id", - "structural_group_id", -] - -_OVERLOADED_MODULES_FAMILY = "overloaded_modules" -_COVERAGE_ADOPTION_FAMILY = "coverage_adoption" -_API_SURFACE_FAMILY = "api_surface" -_COVERAGE_JOIN_FAMILY = "coverage_join" - - -def _optional_str(value: object) -> str | None: - if value is None: - return None - text = str(value).strip() - return text or None - - -def _coerced_nonnegative_threshold(value: object, *, default: int) -> int: - threshold = _as_int(value, default) - return threshold if threshold >= 0 else default - - -def _design_findings_thresholds_payload( - raw_meta: Mapping[str, object] | None, -) -> dict[str, object]: - meta = dict(raw_meta or {}) - return { - "design_findings": { - CATEGORY_COMPLEXITY: { - "metric": "cyclomatic_complexity", - "operator": ">", - "value": _coerced_nonnegative_threshold( - meta.get("design_complexity_threshold"), - default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - ), - }, - CATEGORY_COUPLING: { - "metric": "cbo", - "operator": ">", - "value": _coerced_nonnegative_threshold( - 
meta.get("design_coupling_threshold"), - default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - ), - }, - CATEGORY_COHESION: { - "metric": "lcom4", - "operator": ">=", - "value": _coerced_nonnegative_threshold( - meta.get("design_cohesion_threshold"), - default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - ), - }, - } - } - - -def _analysis_profile_payload( - raw_meta: Mapping[str, object] | None, -) -> dict[str, int] | None: - meta = dict(raw_meta or {}) - nested = _as_mapping(meta.get("analysis_profile")) - if nested: - meta = dict(nested) - keys = ( - "min_loc", - "min_stmt", - "block_min_loc", - "block_min_stmt", - "segment_min_loc", - "segment_min_stmt", - ) - if any(key not in meta for key in keys): - return None - payload = {key: _as_int(meta.get(key), -1) for key in keys} - if any(value < 0 for value in payload.values()): - return None - return payload - - -def _normalize_path(value: str) -> str: - return value.replace("\\", "/").strip() - - -def _is_absolute_path(value: str) -> bool: - normalized = _normalize_path(value) - if not normalized: - return False - if normalized.startswith("/"): - return True - return len(normalized) > 2 and normalized[1] == ":" and normalized[2] == "/" - - -def _contract_path( - value: object, - *, - scan_root: str, -) -> tuple[str | None, str | None, str | None]: - path_text = _optional_str(value) - if path_text is None: - return None, None, None - normalized_path = _normalize_path(path_text) - relative_path = relative_report_path(normalized_path, scan_root=scan_root) - if relative_path and relative_path != normalized_path: - return relative_path, "in_root", normalized_path - if _is_absolute_path(normalized_path): - return normalized_path.rsplit("/", maxsplit=1)[-1], "external", normalized_path - return normalized_path, "relative", None - - -def _contract_report_location_path(location_path: str, *, scan_root: str) -> str: - contract_path, _scope, _absolute = _contract_path( - location_path, - scan_root=scan_root, - ) - return contract_path or "" - - -def _priority( - severity: str, - effort: str, -) -> float: - severity_rank = SEVERITY_RANK.get(severity, 1) - effort_rank = EFFORT_WEIGHT.get(effort, 1) - return float(severity_rank) / float(effort_rank) - - -def clone_group_id(kind: str, group_key: str) -> str: - return f"clone:{kind}:{group_key}" - - -def structural_group_id(finding_kind: str, finding_key: str) -> str: - return f"structural:{finding_kind}:{finding_key}" - - -def dead_code_group_id(subject_key: str) -> str: - return f"dead_code:{subject_key}" - - -def design_group_id(category: str, subject_key: str) -> str: - return f"design:{category}:{subject_key}" - - -def _clone_novelty( - *, - group_key: str, - baseline_trusted: bool, - new_keys: Collection[str] | None, -) -> str: - if not baseline_trusted: - return CLONE_NOVELTY_NEW - if new_keys is None: - return CLONE_NOVELTY_NEW - return CLONE_NOVELTY_NEW if group_key in new_keys else CLONE_NOVELTY_KNOWN - - -def _item_sort_key(item: Mapping[str, object]) -> tuple[str, int, int, str]: - return ( - str(item.get("relative_path", "")), - _as_int(item.get("start_line")), - _as_int(item.get("end_line")), - str(item.get("qualname", "")), - ) - - -def _parse_bool_text(value: object) -> bool: - text = str(value).strip().lower() - return text in {"1", "true", "yes"} - - -def _parse_ratio_percent(value: object) -> float | None: - text = str(value).strip() - if not text: - return None - if text.endswith("%"): - try: - return float(text[:-1]) / 100.0 - except ValueError: - return None - try: - numeric = 
-def _normalize_block_machine_facts(
-    *,
-    group_key: str,
-    group_arity: int,
-    block_facts: Mapping[str, str],
-) -> tuple[dict[str, object], dict[str, str]]:
-    facts: dict[str, object] = {
-        "group_key": group_key,
-        "group_arity": group_arity,
-    }
-    display_facts: dict[str, str] = {}
-    for key in sorted(block_facts):
-        value = str(block_facts[key])
-        match key:
-            case "group_arity":
-                facts[key] = _as_int(value)
-            case "block_size" | "consecutive_asserts" | "instance_peer_count":
-                facts[key] = _as_int(value)
-            case "merged_regions":
-                facts[key] = _parse_bool_text(value)
-            case "assert_ratio":
-                ratio = _parse_ratio_percent(value)
-                if ratio is not None:
-                    facts[key] = ratio
-                display_facts[key] = value
-            case (
-                "match_rule" | "pattern" | "signature_kind" | "hint" | "hint_confidence"
-            ):
-                facts[key] = value
-            case _:
-                display_facts[key] = value
-    return facts, display_facts
-
-
-def _source_scope_from_filepaths(
-    filepaths: Iterable[str],
-    *,
-    scan_root: str,
-) -> dict[str, object]:
-    counts: Counter[SourceKind] = Counter()
-    for filepath in filepaths:
-        location = report_location_from_group_item(
-            {"filepath": filepath, "start_line": 0, "end_line": 0, "qualname": ""},
-            scan_root=scan_root,
-        )
-        counts[location.source_kind] += 1
-    return _source_scope_from_counts(counts)
-
-
-def _source_scope_from_counts(
-    counts: Mapping[SourceKind, int],
-) -> dict[str, object]:
-    return _report_source_scope_from_counts(counts)
-
-
-def _source_scope_from_locations(
-    locations: Sequence[Mapping[str, object]],
-) -> dict[str, object]:
-    normalized_locations = [
-        {"source_kind": _normalized_source_kind(location.get("source_kind"))}
-        for location in locations
-    ]
-    return _report_source_scope_from_locations(normalized_locations)
-
-
-def _collect_paths_from_metrics(metrics: Mapping[str, object]) -> set[str]:
-    paths: set[str] = set()
-    complexity = _as_mapping(metrics.get(CATEGORY_COMPLEXITY))
-    for item in _as_sequence(complexity.get("functions")):
-        item_map = _as_mapping(item)
-        filepath = _optional_str(item_map.get("filepath"))
-        if filepath is not None:
-            paths.add(filepath)
-    for family_name in (CATEGORY_COUPLING, CATEGORY_COHESION):
-        family = _as_mapping(metrics.get(family_name))
-        for item in _as_sequence(family.get("classes")):
-            item_map = _as_mapping(item)
-            filepath = _optional_str(item_map.get("filepath"))
-            if filepath is not None:
-                paths.add(filepath)
-    dead_code = _as_mapping(metrics.get(FAMILY_DEAD_CODE))
-    for item in _as_sequence(dead_code.get("items")):
-        item_map = _as_mapping(item)
-        filepath = _optional_str(item_map.get("filepath"))
-        if filepath is not None:
-            paths.add(filepath)
-    for item in _as_sequence(dead_code.get("suppressed_items")):
-        item_map = _as_mapping(item)
-        filepath = _optional_str(item_map.get("filepath"))
-        if filepath is not None:
-            paths.add(filepath)
-    overloaded_modules = _as_mapping(metrics.get(_OVERLOADED_MODULES_FAMILY))
-    for item in _as_sequence(overloaded_modules.get("items")):
-        item_map = _as_mapping(item)
-        filepath = _optional_str(item_map.get("filepath"))
-        if filepath is not None:
-            paths.add(filepath)
-    coverage_adoption = _as_mapping(metrics.get(_COVERAGE_ADOPTION_FAMILY))
-    for item in _as_sequence(coverage_adoption.get("items")):
-        item_map = _as_mapping(item)
-        filepath = _optional_str(item_map.get("filepath"))
-        if filepath is not None:
-            paths.add(filepath)
-    api_surface = _as_mapping(metrics.get(_API_SURFACE_FAMILY))
-    for item in _as_sequence(api_surface.get("items")):
-        item_map = _as_mapping(item)
-        filepath = _optional_str(item_map.get("filepath"))
-        if filepath is not None:
-            paths.add(filepath)
-    coverage_join = _as_mapping(metrics.get(_COVERAGE_JOIN_FAMILY))
-    for item in _as_sequence(coverage_join.get("items")):
-        item_map = _as_mapping(item)
-        filepath = _optional_str(item_map.get("filepath"))
-        if filepath is not None:
-            paths.add(filepath)
-    return paths
-
-
-def _collect_report_file_list(
-    *,
-    inventory: Mapping[str, object] | None,
-    func_groups: GroupMapLike,
-    block_groups: GroupMapLike,
-    segment_groups: GroupMapLike,
-    suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None = None,
-    metrics: Mapping[str, object] | None,
-    structural_findings: Sequence[StructuralFindingGroup] | None,
-) -> list[str]:
-    files: set[str] = set()
-    inventory_map = _as_mapping(inventory)
-    for filepath in _as_sequence(inventory_map.get("file_list")):
-        file_text = _optional_str(filepath)
-        if file_text is not None:
-            files.add(file_text)
-    for groups in (func_groups, block_groups, segment_groups):
-        for items in groups.values():
-            for item in items:
-                filepath = _optional_str(item.get("filepath"))
-                if filepath is not None:
-                    files.add(filepath)
-    for suppressed_group in suppressed_clone_groups or ():
-        for item in suppressed_group.items:
-            filepath = _optional_str(item.get("filepath"))
-            if filepath is not None:
-                files.add(filepath)
-    if metrics is not None:
-        files.update(_collect_paths_from_metrics(metrics))
-    if structural_findings:
-        for structural_group in normalize_structural_findings(structural_findings):
-            for occurrence in structural_group.items:
-                filepath = _optional_str(occurrence.file_path)
-                if filepath is not None:
-                    files.add(filepath)
-    return sorted(files)
-
-
-def _count_file_lines(filepaths: Sequence[str]) -> int:
-    total = 0
-    for filepath in filepaths:
-        total += _count_file_lines_for_path(filepath)
-    return total
-
-
-def _count_file_lines_for_path(filepath: str) -> int:
-    try:
-        with open(filepath, encoding="utf-8", errors="surrogateescape") as handle:
-            return sum(1 for _ in handle)
-    except OSError:
-        return 0
-
-
-def _normalize_nested_string_rows(value: object) -> list[list[str]]:
-    rows: list[tuple[str, ...]] = []
-    for row in _as_sequence(value):
-        modules = tuple(
-            str(module) for module in _as_sequence(row) if str(module).strip()
-        )
-        if modules:
-            rows.append(modules)
-    rows.sort(key=lambda row: (len(row), row))
-    return [list(row) for row in rows]
-
-
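-# The normalizer below rebuilds every raw metrics family into a contract
-# payload: filepaths are contracted to scan-root-relative form, numeric
-# fields are coerced through the _as_* helpers, and each item list is sorted
-# on a stable key (typically path, start line, end line, qualname), which
-# keeps the emitted report deterministic for a given input tree.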
-def _normalize_metrics_families(
-    metrics: Mapping[str, object] | None,
-    *,
-    scan_root: str,
-) -> dict[str, object]:
-    metrics_map = _as_mapping(metrics)
-    complexity = _as_mapping(metrics_map.get(CATEGORY_COMPLEXITY))
-    complexity_items = sorted(
-        (
-            {
-                "qualname": str(item_map.get("qualname", "")),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "start_line": _as_int(item_map.get("start_line")),
-                "end_line": _as_int(item_map.get("end_line")),
-                "cyclomatic_complexity": _as_int(
-                    item_map.get("cyclomatic_complexity"),
-                    1,
-                ),
-                "nesting_depth": _as_int(item_map.get("nesting_depth")),
-                "risk": str(item_map.get("risk", RISK_LOW)),
-            }
-            for item in _as_sequence(complexity.get("functions"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["relative_path"],
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-        ),
-    )
-
-    coupling = _as_mapping(metrics_map.get(CATEGORY_COUPLING))
-    coupling_items = sorted(
-        (
-            {
-                "qualname": str(item_map.get("qualname", "")),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "start_line": _as_int(item_map.get("start_line")),
-                "end_line": _as_int(item_map.get("end_line")),
-                "cbo": _as_int(item_map.get("cbo")),
-                "risk": str(item_map.get("risk", RISK_LOW)),
-                "coupled_classes": sorted(
-                    {
-                        str(name)
-                        for name in _as_sequence(item_map.get("coupled_classes"))
-                        if str(name).strip()
-                    }
-                ),
-            }
-            for item in _as_sequence(coupling.get("classes"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["relative_path"],
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-        ),
-    )
-
-    cohesion = _as_mapping(metrics_map.get(CATEGORY_COHESION))
-    cohesion_items = sorted(
-        (
-            {
-                "qualname": str(item_map.get("qualname", "")),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "start_line": _as_int(item_map.get("start_line")),
-                "end_line": _as_int(item_map.get("end_line")),
-                "lcom4": _as_int(item_map.get("lcom4")),
-                "risk": str(item_map.get("risk", RISK_LOW)),
-                "method_count": _as_int(item_map.get("method_count")),
-                "instance_var_count": _as_int(item_map.get("instance_var_count")),
-            }
-            for item in _as_sequence(cohesion.get("classes"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["relative_path"],
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-        ),
-    )
-
-    dependencies = _as_mapping(metrics_map.get("dependencies"))
-    dependency_edges = sorted(
-        (
-            {
-                "source": str(item_map.get("source", "")),
-                "target": str(item_map.get("target", "")),
-                "import_type": str(item_map.get("import_type", "")),
-                "line": _as_int(item_map.get("line")),
-            }
-            for item in _as_sequence(dependencies.get("edge_list"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["source"],
-            item["target"],
-            item["import_type"],
-            item["line"],
-        ),
-    )
-    dependency_cycles = _normalize_nested_string_rows(dependencies.get("cycles"))
-    longest_chains = _normalize_nested_string_rows(dependencies.get("longest_chains"))
-
-    dead_code = _as_mapping(metrics_map.get(FAMILY_DEAD_CODE))
-
-    def _normalize_suppressed_by(
-        raw_bindings: object,
-    ) -> list[dict[str, str]]:
-        normalized_bindings = sorted(
-            {
-                (
-                    str(binding_map.get("rule", "")).strip(),
-                    str(binding_map.get("source", "")).strip(),
-                )
-                for binding in _as_sequence(raw_bindings)
-                for binding_map in (_as_mapping(binding),)
-                if str(binding_map.get("rule", "")).strip()
-            },
-            key=lambda item: (item[0], item[1]),
-        )
-        if not normalized_bindings:
-            return []
-        return [
-            {
-                "rule": rule,
-                "source": source or INLINE_CODECLONE_SUPPRESSION_SOURCE,
-            }
-            for rule, source in normalized_bindings
-        ]
-
-    dead_items = sorted(
-        (
-            {
-                "qualname": str(item_map.get("qualname", "")),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "start_line": _as_int(item_map.get("start_line")),
-                "end_line": _as_int(item_map.get("end_line")),
-                "kind": str(item_map.get("kind", "")),
-                "confidence": str(item_map.get("confidence", CONFIDENCE_MEDIUM)),
-            }
-            for item in _as_sequence(dead_code.get("items"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["relative_path"],
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-            item["kind"],
-        ),
-    )
-    dead_suppressed_items = sorted(
-        (
-            {
-                "qualname": str(item_map.get("qualname", "")),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "start_line": _as_int(item_map.get("start_line")),
-                "end_line": _as_int(item_map.get("end_line")),
-                "kind": str(item_map.get("kind", "")),
-                "confidence": str(item_map.get("confidence", CONFIDENCE_MEDIUM)),
-                "suppressed_by": _normalize_suppressed_by(
-                    item_map.get("suppressed_by")
-                ),
-            }
-            for item in _as_sequence(dead_code.get("suppressed_items"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["relative_path"],
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-            item["kind"],
-            item["confidence"],
-            tuple(
-                (
-                    str(_as_mapping(binding).get("rule", "")),
-                    str(_as_mapping(binding).get("source", "")),
-                )
-                for binding in _as_sequence(item.get("suppressed_by"))
-            ),
-        ),
-    )
-    for item in dead_suppressed_items:
-        suppressed_by = _as_sequence(item.get("suppressed_by"))
-        first_binding = _as_mapping(suppressed_by[0]) if suppressed_by else {}
-        item["suppression_rule"] = str(first_binding.get("rule", ""))
-        item["suppression_source"] = str(first_binding.get("source", ""))
-
-    health = _as_mapping(metrics_map.get("health"))
-    health_dimensions = {
-        str(key): _as_int(value)
-        for key, value in sorted(_as_mapping(health.get("dimensions")).items())
-    }
-    overloaded_modules = _as_mapping(metrics_map.get(_OVERLOADED_MODULES_FAMILY))
-    overloaded_modules_detection = _as_mapping(overloaded_modules.get("detection"))
-    overloaded_module_items = sorted(
-        (
-            {
-                "module": str(item_map.get("module", "")).strip(),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "source_kind": str(item_map.get("source_kind", SOURCE_KIND_OTHER)),
-                "loc": _as_int(item_map.get("loc")),
-                "functions": _as_int(item_map.get("functions")),
-                "methods": _as_int(item_map.get("methods")),
-                "classes": _as_int(item_map.get("classes")),
-                "callable_count": _as_int(item_map.get("callable_count")),
-                "complexity_total": _as_int(item_map.get("complexity_total")),
-                "complexity_max": _as_int(item_map.get("complexity_max")),
-                "fan_in": _as_int(item_map.get("fan_in")),
-                "fan_out": _as_int(item_map.get("fan_out")),
-                "total_deps": _as_int(item_map.get("total_deps")),
-                "import_edges": _as_int(item_map.get("import_edges")),
-                "reimport_edges": _as_int(item_map.get("reimport_edges")),
-                "reimport_ratio": round(
-                    _as_float(item_map.get("reimport_ratio")),
-                    4,
-                ),
-                "instability": round(_as_float(item_map.get("instability")), 4),
-                "hub_balance": round(_as_float(item_map.get("hub_balance")), 4),
-                "size_score": round(_as_float(item_map.get("size_score")), 4),
-                "dependency_score": round(
-                    _as_float(item_map.get("dependency_score")),
-                    4,
-                ),
-                "shape_score": round(_as_float(item_map.get("shape_score")), 4),
-                "score": round(_as_float(item_map.get("score")), 4),
-                "candidate_status": str(
-                    item_map.get("candidate_status", "non_candidate")
-                ),
-                "candidate_reasons": [
-                    str(reason)
-                    for reason in _as_sequence(item_map.get("candidate_reasons"))
-                    if str(reason).strip()
-                ],
-            }
-            for item in _as_sequence(overloaded_modules.get("items"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            {"candidate": 0, "ranked_only": 1, "non_candidate": 2}.get(
-                str(item["candidate_status"]),
-                3,
-            ),
-            -_as_float(item["score"]),
-            -_as_float(item["size_score"]),
-            -_as_float(item["dependency_score"]),
-            item["relative_path"],
-            item["module"],
-        ),
-    )
-
-    complexity_summary = _as_mapping(complexity.get("summary"))
-    coupling_summary = _as_mapping(coupling.get("summary"))
-    cohesion_summary = _as_mapping(cohesion.get("summary"))
-    dead_code_summary = _as_mapping(dead_code.get("summary"))
-    overloaded_modules_summary = _as_mapping(overloaded_modules.get("summary"))
-    coverage_adoption = _as_mapping(metrics_map.get(_COVERAGE_ADOPTION_FAMILY))
-    coverage_adoption_summary = _as_mapping(coverage_adoption.get("summary"))
-    coverage_adoption_items = sorted(
-        (
-            {
-                "module": str(item_map.get("module", "")).strip(),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "callable_count": _as_int(item_map.get("callable_count")),
-                "params_total": _as_int(item_map.get("params_total")),
-                "params_annotated": _as_int(item_map.get("params_annotated")),
-                "param_permille": _as_int(item_map.get("param_permille")),
-                "returns_total": _as_int(item_map.get("returns_total")),
-                "returns_annotated": _as_int(item_map.get("returns_annotated")),
-                "return_permille": _as_int(item_map.get("return_permille")),
-                "any_annotation_count": _as_int(item_map.get("any_annotation_count")),
-                "public_symbol_total": _as_int(item_map.get("public_symbol_total")),
-                "public_symbol_documented": _as_int(
-                    item_map.get("public_symbol_documented")
-                ),
-                "docstring_permille": _as_int(item_map.get("docstring_permille")),
-            }
-            for item in _as_sequence(coverage_adoption.get("items"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["relative_path"],
-            item["module"],
-        ),
-    )
-    api_surface = _as_mapping(metrics_map.get(_API_SURFACE_FAMILY))
-    api_surface_summary = _as_mapping(api_surface.get("summary"))
-    api_surface_items = sorted(
-        (
-            {
-                "record_kind": str(item_map.get("record_kind", "symbol")),
-                "module": str(item_map.get("module", "")).strip(),
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "qualname": str(item_map.get("qualname", "")),
-                "start_line": _as_int(item_map.get("start_line")),
-                "end_line": _as_int(item_map.get("end_line")),
-                "symbol_kind": str(item_map.get("symbol_kind", "")),
-                "exported_via": _optional_str(item_map.get("exported_via")),
-                "params_total": _as_int(item_map.get("params_total")),
-                "params": [
-                    {
-                        "name": str(param_map.get("name", "")),
-                        "kind": str(param_map.get("kind", "")),
-                        "has_default": bool(param_map.get("has_default")),
-                        "annotated": bool(param_map.get("annotated")),
-                    }
-                    for param in _as_sequence(item_map.get("params"))
-                    for param_map in (_as_mapping(param),)
-                ],
-                "returns_annotated": bool(item_map.get("returns_annotated")),
-                "change_kind": _optional_str(item_map.get("change_kind")),
-                "detail": _optional_str(item_map.get("detail")),
-            }
-            for item in _as_sequence(api_surface.get("items"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            item["relative_path"],
-            item["start_line"],
-            item["end_line"],
-            item["qualname"],
-            item["record_kind"],
-        ),
-    )
-    coverage_join = _as_mapping(metrics_map.get(_COVERAGE_JOIN_FAMILY))
-    coverage_join_summary = _as_mapping(coverage_join.get("summary"))
-    coverage_join_items = sorted(
-        (
-            {
-                "relative_path": _contract_path(
-                    item_map.get("filepath", ""),
-                    scan_root=scan_root,
-                )[0]
-                or "",
-                "qualname": str(item_map.get("qualname", "")).strip(),
-                "start_line": _as_int(item_map.get("start_line")),
-                "end_line": _as_int(item_map.get("end_line")),
-                "cyclomatic_complexity": _as_int(
-                    item_map.get("cyclomatic_complexity"),
-                    1,
-                ),
-                "risk": str(item_map.get("risk", RISK_LOW)).strip() or RISK_LOW,
-                "executable_lines": _as_int(item_map.get("executable_lines")),
-                "covered_lines": _as_int(item_map.get("covered_lines")),
-                "coverage_permille": _as_int(item_map.get("coverage_permille")),
-                "coverage_status": str(item_map.get("coverage_status", "")).strip(),
-                "coverage_hotspot": bool(item_map.get("coverage_hotspot")),
-                "scope_gap_hotspot": bool(item_map.get("scope_gap_hotspot")),
-            }
-            for item in _as_sequence(coverage_join.get("items"))
-            for item_map in (_as_mapping(item),)
-        ),
-        key=lambda item: (
-            0 if bool(item["coverage_hotspot"]) else 1,
-            0 if bool(item["scope_gap_hotspot"]) else 1,
-            {"high": 0, "medium": 1, "low": 2}.get(str(item["risk"]), 3),
-            _as_int(item["coverage_permille"]),
-            -_as_int(item["cyclomatic_complexity"]),
-            item["relative_path"],
-            _as_int(item["start_line"]),
-            item["qualname"],
-        ),
-    )
-    dead_high_confidence = sum(
-        1
-        for item in dead_items
-        if str(_as_mapping(item).get("confidence", "")).strip().lower()
-        == CONFIDENCE_HIGH
-    )
-
-    normalized: dict[str, object] = {
-        CATEGORY_COMPLEXITY: {
-            "summary": {
-                "total": len(complexity_items),
-                "average": round(_as_float(complexity_summary.get("average")), 2),
-                "max": _as_int(complexity_summary.get("max")),
-                "high_risk": _as_int(complexity_summary.get("high_risk")),
-            },
-            "items": complexity_items,
-            "items_truncated": False,
-        },
-        CATEGORY_COUPLING: {
-            "summary": {
-                "total": len(coupling_items),
-                "average": round(_as_float(coupling_summary.get("average")), 2),
-                "max": _as_int(coupling_summary.get("max")),
-                "high_risk": _as_int(coupling_summary.get("high_risk")),
-            },
-            "items": coupling_items,
-            "items_truncated": False,
-        },
-        CATEGORY_COHESION: {
-            "summary": {
-                "total": len(cohesion_items),
-                "average": round(_as_float(cohesion_summary.get("average")), 2),
-                "max": _as_int(cohesion_summary.get("max")),
-                "low_cohesion": _as_int(cohesion_summary.get("low_cohesion")),
-            },
-            "items": cohesion_items,
-            "items_truncated": False,
-        },
-        "dependencies": {
-            "summary": {
-                "modules": _as_int(dependencies.get("modules")),
-                "edges": _as_int(dependencies.get("edges")),
-                "cycles": len(dependency_cycles),
-                "max_depth": _as_int(dependencies.get("max_depth")),
-            },
-            "items": dependency_edges,
-            "cycles": dependency_cycles,
-            "longest_chains": longest_chains,
-            "items_truncated": False,
-        },
-        FAMILY_DEAD_CODE: {
-            "summary": {
-                "total": len(dead_items),
-                "high_confidence": dead_high_confidence
-                or _as_int(
-                    dead_code_summary.get(
-                        "high_confidence", dead_code_summary.get("critical")
-                    )
-                ),
-                "suppressed": len(dead_suppressed_items)
-                or _as_int(dead_code_summary.get("suppressed")),
-            },
-            "items": dead_items,
-            "suppressed_items": dead_suppressed_items,
-            "items_truncated": False,
-        },
-        "health": {
-            "summary": {
-                "score": _as_int(health.get("score")),
-                "grade": str(health.get("grade", "")),
-                "dimensions": health_dimensions,
-            },
-            "items": [],
-            "items_truncated": False,
-        },
-        _COVERAGE_ADOPTION_FAMILY: {
-            "summary": {
-                "modules": len(coverage_adoption_items),
-                "params_total": _as_int(coverage_adoption_summary.get("params_total")),
-                "params_annotated": _as_int(
-                    coverage_adoption_summary.get("params_annotated")
-                ),
-                "param_permille": _as_int(
-                    coverage_adoption_summary.get("param_permille")
-                ),
-                "baseline_diff_available": bool(
-                    coverage_adoption_summary.get("baseline_diff_available")
-                ),
-                "param_delta": _as_int(coverage_adoption_summary.get("param_delta")),
-                "returns_total": _as_int(
-                    coverage_adoption_summary.get("returns_total")
-                ),
-                "returns_annotated": _as_int(
-                    coverage_adoption_summary.get("returns_annotated")
-                ),
-                "return_permille": _as_int(
-                    coverage_adoption_summary.get("return_permille")
-                ),
-                "return_delta": _as_int(coverage_adoption_summary.get("return_delta")),
-                "public_symbol_total": _as_int(
-                    coverage_adoption_summary.get("public_symbol_total")
-                ),
-                "public_symbol_documented": _as_int(
-                    coverage_adoption_summary.get("public_symbol_documented")
-                ),
-                "docstring_permille": _as_int(
-                    coverage_adoption_summary.get("docstring_permille")
-                ),
-                "docstring_delta": _as_int(
-                    coverage_adoption_summary.get("docstring_delta")
-                ),
-                "typing_any_count": _as_int(
-                    coverage_adoption_summary.get("typing_any_count")
-                ),
-            },
-            "items": coverage_adoption_items,
-            "items_truncated": False,
-        },
-        _API_SURFACE_FAMILY: {
-            "summary": {
-                "enabled": bool(api_surface_summary.get("enabled")),
-                "baseline_diff_available": bool(
-                    api_surface_summary.get("baseline_diff_available")
-                ),
-                "modules": _as_int(api_surface_summary.get("modules")),
-                "public_symbols": _as_int(api_surface_summary.get("public_symbols")),
-                "added": _as_int(api_surface_summary.get("added")),
-                "breaking": _as_int(api_surface_summary.get("breaking")),
-                "strict_types": bool(api_surface_summary.get("strict_types")),
-            },
-            "items": api_surface_items,
-            "items_truncated": False,
-        },
-        _OVERLOADED_MODULES_FAMILY: {
-            "summary": {
-                "total": len(overloaded_module_items),
-                "candidates": _as_int(overloaded_modules_summary.get("candidates")),
-                "population_status": str(
-                    overloaded_modules_summary.get("population_status", "limited")
-                ),
-                "top_score": round(
-                    _as_float(overloaded_modules_summary.get("top_score")),
-                    4,
-                ),
-                "average_score": round(
-                    _as_float(overloaded_modules_summary.get("average_score")),
-                    4,
-                ),
-                "candidate_score_cutoff": round(
-                    _as_float(overloaded_modules_summary.get("candidate_score_cutoff")),
-                    4,
-                ),
-            },
-            "detection": {
-                "version": str(overloaded_modules_detection.get("version", "1")),
-                "scope": str(overloaded_modules_detection.get("scope", "report_only")),
-                "strategy": str(
-                    overloaded_modules_detection.get(
-                        "strategy",
-                        "project_relative_composite",
-                    )
-                ),
-                "minimum_population": _as_int(
-                    overloaded_modules_detection.get("minimum_population"),
-                ),
-                "size_signals": [
-                    str(signal)
-                    for signal in _as_sequence(
-                        overloaded_modules_detection.get("size_signals")
-                    )
-                    if str(signal).strip()
-                ],
-                "dependency_signals": [
-                    str(signal)
-                    for signal in _as_sequence(
-                        overloaded_modules_detection.get("dependency_signals")
-                    )
-                    if str(signal).strip()
-                ],
-                "shape_signals": [
-                    str(signal)
-                    for signal in _as_sequence(
-                        overloaded_modules_detection.get("shape_signals")
-                    )
-                    if str(signal).strip()
-                ],
-            },
-            "items": overloaded_module_items,
-            "items_truncated": False,
-        },
-    }
-    if coverage_join_summary or coverage_join_items or coverage_join:
-        normalized[_COVERAGE_JOIN_FAMILY] = {
-            "summary": {
-                "status": str(coverage_join_summary.get("status", "")),
-                "source": _contract_path(
-                    coverage_join_summary.get("source", ""),
-                    scan_root=scan_root,
-                )[0],
-                "files": _as_int(coverage_join_summary.get("files")),
-                "units": _as_int(coverage_join_summary.get("units")),
-                "measured_units": _as_int(coverage_join_summary.get("measured_units")),
-                "overall_executable_lines": _as_int(
-                    coverage_join_summary.get("overall_executable_lines")
-                ),
-                "overall_covered_lines": _as_int(
-                    coverage_join_summary.get("overall_covered_lines")
-                ),
-                "overall_permille": _as_int(
-                    coverage_join_summary.get("overall_permille")
-                ),
-                "missing_from_report_units": _as_int(
-                    coverage_join_summary.get("missing_from_report_units")
-                ),
-                "coverage_hotspots": _as_int(
-                    coverage_join_summary.get("coverage_hotspots")
-                ),
-                "scope_gap_hotspots": _as_int(
-                    coverage_join_summary.get("scope_gap_hotspots")
-                ),
-                "hotspot_threshold_percent": _as_int(
-                    coverage_join_summary.get("hotspot_threshold_percent")
-                ),
-                "invalid_reason": _optional_str(
-                    coverage_join_summary.get("invalid_reason")
-                ),
-            },
-            "items": coverage_join_items,
-            "items_truncated": False,
-        }
-    return normalized
-
-
"overall_permille": _as_int( - coverage_join_summary.get("overall_permille") - ), - "missing_from_report_units": _as_int( - coverage_join_summary.get("missing_from_report_units") - ), - "coverage_hotspots": _as_int( - coverage_join_summary.get("coverage_hotspots") - ), - "scope_gap_hotspots": _as_int( - coverage_join_summary.get("scope_gap_hotspots") - ), - "hotspot_threshold_percent": _as_int( - coverage_join_summary.get("hotspot_threshold_percent") - ), - "invalid_reason": _optional_str( - coverage_join_summary.get("invalid_reason") - ), - }, - "items": coverage_join_items, - "items_truncated": False, - } - return normalized - - -def _build_metrics_payload( - metrics: Mapping[str, object] | None, - *, - scan_root: str, -) -> dict[str, object]: - families = _normalize_metrics_families(metrics, scan_root=scan_root) - return { - "summary": { - family_name: _as_mapping(_as_mapping(family_payload).get("summary")) - for family_name, family_payload in families.items() - }, - "families": families, - } - - -def _derive_inventory_code_counts( - *, - metrics_payload: Mapping[str, object], - inventory_code: Mapping[str, object], - file_list: Sequence[str], - cached_files: int, -) -> dict[str, object]: - complexity = _as_mapping( - _as_mapping(metrics_payload.get("families")).get(CATEGORY_COMPLEXITY) - ) - cohesion = _as_mapping( - _as_mapping(metrics_payload.get("families")).get(CATEGORY_COHESION) - ) - complexity_items = _as_sequence(complexity.get("items")) - cohesion_items = _as_sequence(cohesion.get("items")) - - exact_entities = bool(complexity_items or cohesion_items) - method_count = sum( - _as_int(_as_mapping(item).get("method_count")) for item in cohesion_items - ) - class_count = len(cohesion_items) - function_total = max(len(complexity_items) - method_count, 0) - - if not exact_entities: - function_total = _as_int(inventory_code.get("functions")) - method_count = _as_int(inventory_code.get("methods")) - class_count = _as_int(inventory_code.get("classes")) - - parsed_lines_raw = inventory_code.get("parsed_lines") - if isinstance(parsed_lines_raw, int) and parsed_lines_raw >= 0: - parsed_lines = parsed_lines_raw - elif cached_files > 0 and file_list: - parsed_lines = _count_file_lines(file_list) - else: - parsed_lines = _as_int(parsed_lines_raw) - - if exact_entities and ((cached_files > 0 and file_list) or parsed_lines > 0): - scope = "analysis_root" - elif cached_files > 0 and file_list: - scope = "mixed" - else: - scope = "current_run" - - return { - "scope": scope, - "parsed_lines": parsed_lines, - "functions": function_total, - "methods": method_count, - "classes": class_count, - } - - -def _build_inventory_payload( - *, - inventory: Mapping[str, object] | None, - file_list: Sequence[str], - metrics_payload: Mapping[str, object], - scan_root: str, -) -> dict[str, object]: - inventory_map = _as_mapping(inventory) - files_map = _as_mapping(inventory_map.get("files")) - code_map = _as_mapping(inventory_map.get("code")) - cached_files = _as_int(files_map.get("cached")) - file_registry = [ - path - for path in ( - _contract_path(filepath, scan_root=scan_root)[0] for filepath in file_list - ) - if path is not None - ] - return { - "files": { - "total_found": _as_int(files_map.get("total_found"), len(file_list)), - "analyzed": _as_int(files_map.get("analyzed")), - "cached": cached_files, - "skipped": _as_int(files_map.get("skipped")), - "source_io_skipped": _as_int(files_map.get("source_io_skipped")), - }, - "code": _derive_inventory_code_counts( - metrics_payload=metrics_payload, - 
-def _build_meta_payload(
-    raw_meta: Mapping[str, object] | None,
-    *,
-    scan_root: str,
-) -> dict[str, object]:
-    meta = dict(raw_meta or {})
-    metrics_computed = sorted(
-        {
-            str(item)
-            for item in _as_sequence(meta.get("metrics_computed"))
-            if str(item).strip()
-        }
-    )
-    baseline_path, baseline_path_scope, baseline_abs = _contract_path(
-        meta.get("baseline_path"),
-        scan_root=scan_root,
-    )
-    cache_path, cache_path_scope, cache_abs = _contract_path(
-        meta.get("cache_path"),
-        scan_root=scan_root,
-    )
-    metrics_baseline_path, metrics_baseline_path_scope, metrics_baseline_abs = (
-        _contract_path(
-            meta.get("metrics_baseline_path"),
-            scan_root=scan_root,
-        )
-    )
-    payload: dict[str, object] = {
-        "codeclone_version": str(meta.get("codeclone_version", "")),
-        "project_name": str(meta.get("project_name", "")),
-        "scan_root": ".",
-        "python_version": str(meta.get("python_version", "")),
-        "python_tag": str(meta.get("python_tag", "")),
-        "analysis_mode": str(meta.get("analysis_mode", "full") or "full"),
-        "report_mode": str(meta.get("report_mode", "full") or "full"),
-        "computed_metric_families": metrics_computed,
-        "analysis_thresholds": _design_findings_thresholds_payload(meta),
-        "baseline": {
-            "path": baseline_path,
-            "path_scope": baseline_path_scope,
-            "loaded": bool(meta.get("baseline_loaded")),
-            "status": _optional_str(meta.get("baseline_status")),
-            "fingerprint_version": _optional_str(
-                meta.get("baseline_fingerprint_version")
-            ),
-            "schema_version": _optional_str(meta.get("baseline_schema_version")),
-            "python_tag": _optional_str(meta.get("baseline_python_tag")),
-            "generator_name": _optional_str(meta.get("baseline_generator_name")),
-            "generator_version": _optional_str(meta.get("baseline_generator_version")),
-            "payload_sha256": _optional_str(meta.get("baseline_payload_sha256")),
-            "payload_sha256_verified": bool(
-                meta.get("baseline_payload_sha256_verified")
-            ),
-        },
-        "cache": {
-            "path": cache_path,
-            "path_scope": cache_path_scope,
-            "used": bool(meta.get("cache_used")),
-            "status": _optional_str(meta.get("cache_status")),
-            "schema_version": _optional_str(meta.get("cache_schema_version")),
-        },
-        "metrics_baseline": {
-            "path": metrics_baseline_path,
-            "path_scope": metrics_baseline_path_scope,
-            "loaded": bool(meta.get("metrics_baseline_loaded")),
-            "status": _optional_str(meta.get("metrics_baseline_status")),
-            "schema_version": _optional_str(
-                meta.get("metrics_baseline_schema_version")
-            ),
-            "payload_sha256": _optional_str(
-                meta.get("metrics_baseline_payload_sha256")
-            ),
-            "payload_sha256_verified": bool(
-                meta.get("metrics_baseline_payload_sha256_verified")
-            ),
-        },
-        "runtime": {
-            "analysis_started_at_utc": _optional_str(
-                meta.get("analysis_started_at_utc")
-            ),
-            "report_generated_at_utc": _optional_str(
-                meta.get("report_generated_at_utc")
-            ),
-            "scan_root_absolute": _optional_str(meta.get("scan_root")),
-            "baseline_path_absolute": baseline_abs,
-            "cache_path_absolute": cache_abs,
-            "metrics_baseline_path_absolute": metrics_baseline_abs,
-        },
-    }
-    analysis_profile = _analysis_profile_payload(meta)
-    if analysis_profile is not None:
-        payload["analysis_profile"] = analysis_profile
-    return payload
-
-
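-# Clone group triage: any group with four or more members is critical,
-# smaller Type-1/Type-2 groups are warnings, and everything else is
-# informational. Type-1/Type-2 groups are classified as easy-effort, and the
-# returned priority combines both axes via _priority (severity rank divided
-# by effort weight).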
payload["analysis_profile"] = analysis_profile - return payload - - -def _clone_group_assessment( - *, - count: int, - clone_type: str, -) -> tuple[str, float]: - match (count >= 4, clone_type in {"Type-1", "Type-2"}): - case (True, _): - severity = SEVERITY_CRITICAL - case (False, True): - severity = SEVERITY_WARNING - case _: - severity = SEVERITY_INFO - effort = "easy" if clone_type in {"Type-1", "Type-2"} else "moderate" - return severity, _priority(severity, effort) - - -def _build_clone_group_facts( - *, - group_key: str, - kind: Literal["function", "block", "segment"], - items: Sequence[GroupItemLike], - block_facts: Mapping[str, Mapping[str, str]], -) -> tuple[dict[str, object], dict[str, str]]: - base: dict[str, object] = { - "group_key": group_key, - "group_arity": len(items), - } - display_facts: dict[str, str] = {} - match kind: - case "function": - loc_buckets = sorted( - { - str(item.get("loc_bucket", "")) - for item in items - if str(item.get("loc_bucket", "")).strip() - } - ) - base["loc_buckets"] = loc_buckets - case "block" if group_key in block_facts: - typed_facts, block_display_facts = _normalize_block_machine_facts( - group_key=group_key, - group_arity=len(items), - block_facts=block_facts[group_key], - ) - base.update(typed_facts) - display_facts.update(block_display_facts) - case _: - pass - return base, display_facts - - -def _clone_item_payload( - item: GroupItemLike, - *, - kind: Literal["function", "block", "segment"], - scan_root: str, -) -> dict[str, object]: - payload: dict[str, object] = { - "relative_path": _contract_report_location_path( - str(item.get("filepath", "")), - scan_root=scan_root, - ), - "qualname": str(item.get("qualname", "")), - "start_line": _as_int(item.get("start_line", 0)), - "end_line": _as_int(item.get("end_line", 0)), - } - match kind: - case "function": - payload.update( - { - "loc": _as_int(item.get("loc", 0)), - "stmt_count": _as_int(item.get("stmt_count", 0)), - "fingerprint": str(item.get("fingerprint", "")), - "loc_bucket": str(item.get("loc_bucket", "")), - "cyclomatic_complexity": _as_int( - item.get("cyclomatic_complexity", 1) - ), - "nesting_depth": _as_int(item.get("nesting_depth", 0)), - "risk": str(item.get("risk", RISK_LOW)), - "raw_hash": str(item.get("raw_hash", "")), - } - ) - case "block": - payload["size"] = _as_int(item.get("size", 0)) - case _: - payload.update( - { - "size": _as_int(item.get("size", 0)), - "segment_hash": str(item.get("segment_hash", "")), - "segment_sig": str(item.get("segment_sig", "")), - } - ) - return payload - - -def _build_clone_groups( - *, - groups: GroupMapLike, - kind: Literal["function", "block", "segment"], - baseline_trusted: bool, - new_keys: Collection[str] | None, - block_facts: Mapping[str, Mapping[str, str]], - scan_root: str, -) -> list[dict[str, object]]: - encoded_groups: list[dict[str, object]] = [] - new_key_set = set(new_keys) if new_keys is not None else None - for group_key in sorted(groups): - items = groups[group_key] - clone_type = classify_clone_type(items=items, kind=kind) - severity, priority = _clone_group_assessment( - count=len(items), - clone_type=clone_type, - ) - novelty = _clone_novelty( - group_key=group_key, - baseline_trusted=baseline_trusted, - new_keys=new_key_set, - ) - locations = tuple( - report_location_from_group_item(item, scan_root=scan_root) for item in items - ) - source_scope = _source_scope_from_locations( - [ - { - "source_kind": location.source_kind, - } - for location in locations - ] - ) - spread_files, spread_functions = 
group_spread(locations) - rows = sorted( - [ - _clone_item_payload( - item, - kind=kind, - scan_root=scan_root, - ) - for item in items - ], - key=_item_sort_key, - ) - facts, display_facts = _build_clone_group_facts( - group_key=group_key, - kind=kind, - items=items, - block_facts=block_facts, - ) - encoded_groups.append( - { - "id": clone_group_id(kind, group_key), - "family": FAMILY_CLONE, - "category": kind, - "kind": "clone_group", - "severity": severity, - "confidence": CONFIDENCE_HIGH, - "priority": priority, - "clone_kind": kind, - "clone_type": clone_type, - "novelty": novelty, - "count": len(items), - "source_scope": source_scope, - "spread": { - "files": spread_files, - "functions": spread_functions, - }, - "items": rows, - "facts": facts, - **({"display_facts": display_facts} if display_facts else {}), - } - ) - encoded_groups.sort( - key=lambda group: (-_as_int(group.get("count")), str(group["id"])) - ) - return encoded_groups - - -def _build_suppressed_clone_groups( - *, - groups: Sequence[SuppressedCloneGroup] | None, - block_facts: Mapping[str, Mapping[str, str]], - scan_root: str, -) -> dict[str, list[dict[str, object]]]: - buckets: dict[str, list[dict[str, object]]] = { - CLONE_KIND_FUNCTION: [], - CLONE_KIND_BLOCK: [], - CLONE_KIND_SEGMENT: [], - } - for group in groups or (): - items = group.items - clone_type = classify_clone_type(items=items, kind=group.kind) - severity, priority = _clone_group_assessment( - count=len(items), - clone_type=clone_type, - ) - locations = tuple( - report_location_from_group_item(item, scan_root=scan_root) for item in items - ) - source_scope = _source_scope_from_locations( - [ - { - "source_kind": location.source_kind, - } - for location in locations - ] - ) - spread_files, spread_functions = group_spread(locations) - rows = sorted( - [ - _clone_item_payload( - item, - kind=group.kind, - scan_root=scan_root, - ) - for item in items - ], - key=_item_sort_key, - ) - facts, display_facts = _build_clone_group_facts( - group_key=group.group_key, - kind=group.kind, - items=items, - block_facts=block_facts, - ) - encoded: dict[str, object] = { - "id": clone_group_id(group.kind, group.group_key), - "family": FAMILY_CLONE, - "category": group.kind, - "kind": "clone_group", - "severity": severity, - "confidence": CONFIDENCE_HIGH, - "priority": priority, - "clone_kind": group.kind, - "clone_type": clone_type, - "count": len(items), - "source_scope": source_scope, - "spread": { - "files": spread_files, - "functions": spread_functions, - }, - "items": rows, - "facts": facts, - "suppression_rule": group.suppression_rule, - "suppression_source": group.suppression_source, - "matched_patterns": list(group.matched_patterns), - } - if display_facts: - encoded["display_facts"] = display_facts - buckets[group.kind].append(encoded) - for bucket in buckets.values(): - bucket.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"]))) - return buckets - - -def _structural_group_assessment( - *, - finding_kind: str, - count: int, - spread_functions: int, -) -> tuple[str, float]: - match finding_kind: - case "clone_guard_exit_divergence" | "clone_cohort_drift": - severity = SEVERITY_WARNING - if count >= 3 or spread_functions > 1: - severity = SEVERITY_CRITICAL - return severity, _priority(severity, "moderate") - case _: - severity = ( - SEVERITY_WARNING - if count >= 4 or spread_functions > 1 - else SEVERITY_INFO - ) - return severity, _priority(severity, "moderate") - - -def _csv_values(value: object) -> list[str]: - raw = str(value).strip() - if 
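-# Structural signatures carry two layers: a versioned "stable" part intended
-# to survive re-runs, and a "debug" part that echoes the raw signature
-# strings. The three match arms below cover guard-exit divergence, cohort
-# drift, and the duplicated-branches fallback.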
-def _build_structural_signature(
-    finding_kind: str,
-    signature: Mapping[str, str],
-) -> dict[str, object]:
-    debug = {str(key): str(signature[key]) for key in sorted(signature)}
-    match finding_kind:
-        case "clone_guard_exit_divergence":
-            return {
-                "version": "1",
-                "stable": {
-                    "family": "clone_guard_exit_divergence",
-                    "cohort_id": str(signature.get("cohort_id", "")),
-                    "majority_guard_count": _as_int(
-                        signature.get("majority_guard_count")
-                    ),
-                    "majority_guard_terminal_profile": str(
-                        signature.get("majority_guard_terminal_profile", "none")
-                    ),
-                    "majority_terminal_kind": str(
-                        signature.get("majority_terminal_kind", "fallthrough")
-                    ),
-                    "majority_side_effect_before_guard": (
-                        str(signature.get("majority_side_effect_before_guard", "0"))
-                        == "1"
-                    ),
-                },
-                "debug": debug,
-            }
-        case "clone_cohort_drift":
-            return {
-                "version": "1",
-                "stable": {
-                    "family": "clone_cohort_drift",
-                    "cohort_id": str(signature.get("cohort_id", "")),
-                    "drift_fields": _csv_values(signature.get("drift_fields")),
-                    "majority_profile": {
-                        "terminal_kind": str(
-                            signature.get("majority_terminal_kind", "")
-                        ),
-                        "guard_exit_profile": str(
-                            signature.get("majority_guard_exit_profile", "")
-                        ),
-                        "try_finally_profile": str(
-                            signature.get("majority_try_finally_profile", "")
-                        ),
-                        "side_effect_order_profile": str(
-                            signature.get("majority_side_effect_order_profile", "")
-                        ),
-                    },
-                },
-                "debug": debug,
-            }
-        case _:
-            return {
-                "version": "1",
-                "stable": {
-                    "family": "duplicated_branches",
-                    "stmt_shape": str(signature.get("stmt_seq", "")),
-                    "terminal_kind": str(signature.get("terminal", "")),
-                    "control_flow": {
-                        "has_loop": str(signature.get("has_loop", "0")) == "1",
-                        "has_try": str(signature.get("has_try", "0")) == "1",
-                        "nested_if": str(signature.get("nested_if", "0")) == "1",
-                    },
-                },
-                "debug": debug,
-            }
-
-
-def _build_structural_facts(
-    finding_kind: str,
-    signature: Mapping[str, str],
-    *,
-    count: int,
-) -> dict[str, object]:
-    match finding_kind:
-        case "clone_guard_exit_divergence":
-            return {
-                "cohort_id": str(signature.get("cohort_id", "")),
-                "cohort_arity": _as_int(signature.get("cohort_arity")),
-                "divergent_members": _as_int(signature.get("divergent_members"), count),
-                "majority_entry_guard_count": _as_int(
-                    signature.get("majority_guard_count"),
-                ),
-                "majority_guard_terminal_profile": str(
-                    signature.get("majority_guard_terminal_profile", "none")
-                ),
-                "majority_terminal_kind": str(
-                    signature.get("majority_terminal_kind", "fallthrough")
-                ),
-                "majority_side_effect_before_guard": (
-                    str(signature.get("majority_side_effect_before_guard", "0")) == "1"
-                ),
-                "guard_count_values": _csv_values(signature.get("guard_count_values")),
-                "guard_terminal_values": _csv_values(
-                    signature.get("guard_terminal_values"),
-                ),
-                "terminal_values": _csv_values(signature.get("terminal_values")),
-                "side_effect_before_guard_values": _csv_values(
-                    signature.get("side_effect_before_guard_values"),
-                ),
-            }
-        case "clone_cohort_drift":
-            return {
-                "cohort_id": str(signature.get("cohort_id", "")),
-                "cohort_arity": _as_int(signature.get("cohort_arity")),
-                "divergent_members": _as_int(signature.get("divergent_members"), count),
-                "drift_fields": _csv_values(signature.get("drift_fields")),
-                "stable_majority_profile": {
-                    "terminal_kind": str(signature.get("majority_terminal_kind", "")),
-                    "guard_exit_profile": str(
-                        signature.get("majority_guard_exit_profile", "")
-                    ),
-                    "try_finally_profile": str(
-                        signature.get("majority_try_finally_profile", "")
-                    ),
-                    "side_effect_order_profile": str(
-                        signature.get("majority_side_effect_order_profile", "")
-                    ),
-                },
-            }
-        case _:
-            return {
-                "occurrence_count": count,
-                "non_overlapping": True,
-                "call_bucket": _as_int(signature.get("calls", "0")),
-                "raise_bucket": _as_int(signature.get("raises", "0")),
-            }
-
-
-def _build_structural_groups(
-    groups: Sequence[StructuralFindingGroup] | None,
-    *,
-    scan_root: str,
-) -> list[dict[str, object]]:
-    normalized_groups = normalize_structural_findings(groups or ())
-    out: list[dict[str, object]] = []
-    for group in normalized_groups:
-        locations = tuple(
-            report_location_from_structural_occurrence(item, scan_root=scan_root)
-            for item in group.items
-        )
-        source_scope = _source_scope_from_locations(
-            [{"source_kind": location.source_kind} for location in locations]
-        )
-        spread_files, spread_functions = group_spread(locations)
-        severity, priority = _structural_group_assessment(
-            finding_kind=group.finding_kind,
-            count=len(group.items),
-            spread_functions=spread_functions,
-        )
-        out.append(
-            {
-                "id": structural_group_id(group.finding_kind, group.finding_key),
-                "family": FAMILY_STRUCTURAL,
-                "category": group.finding_kind,
-                "kind": group.finding_kind,
-                "severity": severity,
-                "confidence": (
-                    CONFIDENCE_HIGH
-                    if group.finding_kind
-                    in {"clone_guard_exit_divergence", "clone_cohort_drift"}
-                    else CONFIDENCE_MEDIUM
-                ),
-                "priority": priority,
-                "count": len(group.items),
-                "source_scope": source_scope,
-                "spread": {
-                    "files": spread_files,
-                    "functions": spread_functions,
-                },
-                "signature": _build_structural_signature(
-                    group.finding_kind,
-                    group.signature,
-                ),
-                "items": sorted(
-                    [
-                        {
-                            "relative_path": _contract_report_location_path(
-                                item.file_path,
-                                scan_root=scan_root,
-                            ),
-                            "qualname": item.qualname,
-                            "start_line": item.start,
-                            "end_line": item.end,
-                        }
-                        for item in group.items
-                    ],
-                    key=_item_sort_key,
-                ),
-                "facts": _build_structural_facts(
-                    group.finding_kind,
-                    group.signature,
-                    count=len(group.items),
-                ),
-            }
-        )
-    out.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"])))
-    return out
-
-
-def _single_location_source_scope(
-    filepath: str,
-    *,
-    scan_root: str,
-) -> dict[str, object]:
-    location = report_location_from_group_item(
-        {
-            "filepath": filepath,
-            "qualname": "",
-            "start_line": 0,
-            "end_line": 0,
-        },
-        scan_root=scan_root,
-    )
-    return _source_scope_from_locations([{"source_kind": location.source_kind}])
-
-
-def _build_dead_code_groups(
-    metrics_payload: Mapping[str, object],
-    *,
-    scan_root: str,
-) -> list[dict[str, object]]:
-    families = _as_mapping(metrics_payload.get("families"))
-    dead_code = _as_mapping(families.get(FAMILY_DEAD_CODE))
-    groups: list[dict[str, object]] = []
-    for item in _as_sequence(dead_code.get("items")):
-        item_map = _as_mapping(item)
-        qualname = str(item_map.get("qualname", ""))
-        filepath = str(item_map.get("relative_path", ""))
-        confidence = str(item_map.get("confidence", CONFIDENCE_MEDIUM))
-        severity = SEVERITY_WARNING if confidence == CONFIDENCE_HIGH else SEVERITY_INFO
-        groups.append(
-            {
-                "id": dead_code_group_id(qualname),
-                "family": FAMILY_DEAD_CODE,
-                "category": str(item_map.get("kind", "unknown")),
-                "kind": "unused_symbol",
-                "severity": severity,
-                "confidence": confidence,
-                "priority": _priority(severity, EFFORT_EASY),
-                "count": 1,
-                "source_scope": _single_location_source_scope(
-                    filepath,
-                    scan_root=scan_root,
-                ),
-                "spread": {"files": 1, "functions": 1 if qualname else 0},
-                "items": [
-                    {
-                        "relative_path": _contract_report_location_path(
-                            filepath,
-                            scan_root=scan_root,
-                        ),
-                        "qualname": qualname,
-                        "start_line": _as_int(item_map.get("start_line")),
-                        "end_line": _as_int(item_map.get("end_line")),
-                    }
-                ],
-                "facts": {
-                    "kind": str(item_map.get("kind", "unknown")),
-                    "confidence": confidence,
-                },
-            }
-        )
-    groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"])))
-    return groups
-
-
"functions": 1 if qualname else 0}, - "items": [ - { - "relative_path": _contract_report_location_path( - filepath, - scan_root=scan_root, - ), - "qualname": qualname, - "start_line": _as_int(item_map.get("start_line")), - "end_line": _as_int(item_map.get("end_line")), - } - ], - "facts": { - "kind": str(item_map.get("kind", "unknown")), - "confidence": confidence, - }, - } - ) - groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"]))) - return groups - - -def _design_singleton_group( - *, - category: str, - kind: str, - severity: str, - qualname: str, - filepath: str, - start_line: int, - end_line: int, - scan_root: str, - item_data: Mapping[str, object], - facts: Mapping[str, object], -) -> dict[str, object]: - return { - "id": design_group_id(category, qualname), - "family": FAMILY_DESIGN, - "category": category, - "kind": kind, - "severity": severity, - "confidence": CONFIDENCE_HIGH, - "priority": _priority(severity, EFFORT_MODERATE), - "count": 1, - "source_scope": _single_location_source_scope( - filepath, - scan_root=scan_root, - ), - "spread": {"files": 1, "functions": 1}, - "items": [ - { - "relative_path": _contract_report_location_path( - filepath, - scan_root=scan_root, - ), - "qualname": qualname, - "start_line": start_line, - "end_line": end_line, - **item_data, - } - ], - "facts": dict(facts), - } - - -def _complexity_design_group( - item_map: Mapping[str, object], - *, - threshold: int, - scan_root: str, -) -> dict[str, object] | None: - cc = _as_int(item_map.get("cyclomatic_complexity"), 1) - if cc <= threshold: - return None - qualname = str(item_map.get("qualname", "")) - filepath = str(item_map.get("relative_path", "")) - nesting_depth = _as_int(item_map.get("nesting_depth")) - severity = SEVERITY_CRITICAL if cc > 40 else SEVERITY_WARNING - return _design_singleton_group( - category=CATEGORY_COMPLEXITY, - kind="function_hotspot", - severity=severity, - qualname=qualname, - filepath=filepath, - start_line=_as_int(item_map.get("start_line")), - end_line=_as_int(item_map.get("end_line")), - scan_root=scan_root, - item_data={ - "cyclomatic_complexity": cc, - "nesting_depth": nesting_depth, - "risk": str(item_map.get("risk", RISK_LOW)), - }, - facts={ - "cyclomatic_complexity": cc, - "nesting_depth": nesting_depth, - }, - ) - - -def _coupling_design_group( - item_map: Mapping[str, object], - *, - threshold: int, - scan_root: str, -) -> dict[str, object] | None: - cbo = _as_int(item_map.get("cbo")) - if cbo <= threshold: - return None - qualname = str(item_map.get("qualname", "")) - filepath = str(item_map.get("relative_path", "")) - coupled_classes = list(_as_sequence(item_map.get("coupled_classes"))) - return _design_singleton_group( - category=CATEGORY_COUPLING, - kind="class_hotspot", - severity=SEVERITY_WARNING, - qualname=qualname, - filepath=filepath, - start_line=_as_int(item_map.get("start_line")), - end_line=_as_int(item_map.get("end_line")), - scan_root=scan_root, - item_data={ - "cbo": cbo, - "risk": str(item_map.get("risk", RISK_LOW)), - "coupled_classes": coupled_classes, - }, - facts={ - "cbo": cbo, - "coupled_classes": coupled_classes, - }, - ) - - -def _cohesion_design_group( - item_map: Mapping[str, object], - *, - threshold: int, - scan_root: str, -) -> dict[str, object] | None: - lcom4 = _as_int(item_map.get("lcom4")) - if lcom4 < threshold: - return None - qualname = str(item_map.get("qualname", "")) - filepath = str(item_map.get("relative_path", "")) - method_count = _as_int(item_map.get("method_count")) - instance_var_count = 
_as_int(item_map.get("instance_var_count")) - return _design_singleton_group( - category=CATEGORY_COHESION, - kind="class_hotspot", - severity=SEVERITY_WARNING, - qualname=qualname, - filepath=filepath, - start_line=_as_int(item_map.get("start_line")), - end_line=_as_int(item_map.get("end_line")), - scan_root=scan_root, - item_data={ - "lcom4": lcom4, - "risk": str(item_map.get("risk", RISK_LOW)), - "method_count": method_count, - "instance_var_count": instance_var_count, - }, - facts={ - "lcom4": lcom4, - "method_count": method_count, - "instance_var_count": instance_var_count, - }, - ) - - -def _dependency_design_group( - cycle: object, - *, - scan_root: str, -) -> dict[str, object] | None: - modules = [str(module) for module in _as_sequence(cycle) if str(module).strip()] - if not modules: - return None - cycle_key = " -> ".join(modules) - return { - "id": design_group_id(CATEGORY_DEPENDENCY, cycle_key), - "family": FAMILY_DESIGN, - "category": CATEGORY_DEPENDENCY, - "kind": "cycle", - "severity": SEVERITY_CRITICAL, - "confidence": CONFIDENCE_HIGH, - "priority": _priority(SEVERITY_CRITICAL, EFFORT_HARD), - "count": len(modules), - "source_scope": _source_scope_from_filepaths( - (module.replace(".", "/") + ".py" for module in modules), - scan_root=scan_root, - ), - "spread": {"files": len(modules), "functions": 0}, - "items": [ - { - "module": module, - "relative_path": module.replace(".", "/") + ".py", - "source_kind": report_location_from_group_item( - { - "filepath": module.replace(".", "/") + ".py", - "qualname": "", - "start_line": 0, - "end_line": 0, - } - ).source_kind, - } - for module in modules - ], - "facts": { - "cycle_length": len(modules), - }, - } - - -def _coverage_design_group( - item_map: Mapping[str, object], - *, - threshold_percent: int, - scan_root: str, -) -> dict[str, object] | None: - coverage_hotspot = bool(item_map.get("coverage_hotspot")) - scope_gap_hotspot = bool(item_map.get("scope_gap_hotspot")) - if not coverage_hotspot and not scope_gap_hotspot: - return None - qualname = str(item_map.get("qualname", "")).strip() - filepath = str(item_map.get("relative_path", "")).strip() - if not filepath: - return None - start_line = _as_int(item_map.get("start_line")) - end_line = _as_int(item_map.get("end_line")) - subject_key = qualname or f"{filepath}:{start_line}:{end_line}" - risk = str(item_map.get("risk", RISK_LOW)).strip() or RISK_LOW - coverage_status = str(item_map.get("coverage_status", "")).strip() - coverage_permille = _as_int(item_map.get("coverage_permille")) - covered_lines = _as_int(item_map.get("covered_lines")) - executable_lines = _as_int(item_map.get("executable_lines")) - complexity = _as_int(item_map.get("cyclomatic_complexity"), 1) - severity = SEVERITY_CRITICAL if risk == "high" else SEVERITY_WARNING - if scope_gap_hotspot: - kind = FINDING_KIND_COVERAGE_SCOPE_GAP - detail = "The supplied coverage.xml did not map to this function's file." - else: - kind = FINDING_KIND_COVERAGE_HOTSPOT - detail = "Joined line coverage is below the configured hotspot threshold." 
- return { - "id": design_group_id(CATEGORY_COVERAGE, subject_key), - "family": FAMILY_DESIGN, - "category": CATEGORY_COVERAGE, - "kind": kind, - "severity": severity, - "confidence": CONFIDENCE_HIGH, - "priority": _priority(severity, EFFORT_MODERATE), - "count": 1, - "source_scope": _single_location_source_scope( - filepath, - scan_root=scan_root, - ), - "spread": {"files": 1, "functions": 1}, - "items": [ - { - "relative_path": filepath, - "qualname": qualname, - "start_line": start_line, - "end_line": end_line, - "risk": risk, - "cyclomatic_complexity": complexity, - "coverage_permille": coverage_permille, - "coverage_status": coverage_status, - "covered_lines": covered_lines, - "executable_lines": executable_lines, - "coverage_hotspot": coverage_hotspot, - "scope_gap_hotspot": scope_gap_hotspot, - } - ], - "facts": { - "coverage_permille": coverage_permille, - "hotspot_threshold_percent": threshold_percent, - "coverage_status": coverage_status, - "covered_lines": covered_lines, - "executable_lines": executable_lines, - "cyclomatic_complexity": complexity, - "coverage_hotspot": coverage_hotspot, - "scope_gap_hotspot": scope_gap_hotspot, - "detail": detail, - }, - } - - -def _build_design_groups( - metrics_payload: Mapping[str, object], - *, - design_thresholds: Mapping[str, object] | None = None, - scan_root: str, -) -> list[dict[str, object]]: - families = _as_mapping(metrics_payload.get("families")) - thresholds = _as_mapping(design_thresholds) - complexity_threshold = _coerced_nonnegative_threshold( - _as_mapping(thresholds.get(CATEGORY_COMPLEXITY)).get("value"), - default=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, - ) - coupling_threshold = _coerced_nonnegative_threshold( - _as_mapping(thresholds.get(CATEGORY_COUPLING)).get("value"), - default=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, - ) - cohesion_threshold = _coerced_nonnegative_threshold( - _as_mapping(thresholds.get(CATEGORY_COHESION)).get("value"), - default=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, - ) - coverage_join = _as_mapping(families.get(_COVERAGE_JOIN_FAMILY)) - coverage_threshold = _as_int( - _as_mapping(coverage_join.get("summary")).get("hotspot_threshold_percent"), - 50, - ) - groups: list[dict[str, object]] = [] - - complexity = _as_mapping(families.get(CATEGORY_COMPLEXITY)) - for item in _as_sequence(complexity.get("items")): - group = _complexity_design_group( - _as_mapping(item), - threshold=complexity_threshold, - scan_root=scan_root, - ) - if group is not None: - groups.append(group) - - coupling = _as_mapping(families.get(CATEGORY_COUPLING)) - for item in _as_sequence(coupling.get("items")): - group = _coupling_design_group( - _as_mapping(item), - threshold=coupling_threshold, - scan_root=scan_root, - ) - if group is not None: - groups.append(group) - - cohesion = _as_mapping(families.get(CATEGORY_COHESION)) - for item in _as_sequence(cohesion.get("items")): - group = _cohesion_design_group( - _as_mapping(item), - threshold=cohesion_threshold, - scan_root=scan_root, - ) - if group is not None: - groups.append(group) - - dependencies = _as_mapping(families.get("dependencies")) - for cycle in _as_sequence(dependencies.get("cycles")): - group = _dependency_design_group(cycle, scan_root=scan_root) - if group is not None: - groups.append(group) - - for item in _as_sequence(coverage_join.get("items")): - group = _coverage_design_group( - _as_mapping(item), - threshold_percent=coverage_threshold, - scan_root=scan_root, - ) - if group is not None: - groups.append(group) - - groups.sort(key=lambda group: 
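-# The findings summary is a pure aggregation over the already-built groups:
-# per-family totals, severity and impact-scope histograms, clone novelty
-# counts, and clone suppression counters that are only included when
-# non-zero.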
(-_as_float(group["priority"]), str(group["id"]))) - return groups - - -def _findings_summary( - *, - clone_functions: Sequence[Mapping[str, object]], - clone_blocks: Sequence[Mapping[str, object]], - clone_segments: Sequence[Mapping[str, object]], - structural_groups: Sequence[Mapping[str, object]], - dead_code_groups: Sequence[Mapping[str, object]], - design_groups: Sequence[Mapping[str, object]], - suppressed_clone_groups: Mapping[str, Sequence[Mapping[str, object]]] | None = None, - dead_code_suppressed: int = 0, -) -> dict[str, object]: - flat_groups = [ - *clone_functions, - *clone_blocks, - *clone_segments, - *structural_groups, - *dead_code_groups, - *design_groups, - ] - severity_counts = dict.fromkeys( - (SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO), - 0, - ) - source_scope_counts = dict.fromkeys( - (IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_NON_RUNTIME, IMPACT_SCOPE_MIXED), - 0, - ) - for group in flat_groups: - severity = str(group.get("severity", SEVERITY_INFO)) - if severity in severity_counts: - severity_counts[severity] += 1 - impact_scope = str( - _as_mapping(group.get("source_scope")).get( - "impact_scope", - IMPACT_SCOPE_NON_RUNTIME, - ) - ) - if impact_scope in source_scope_counts: - source_scope_counts[impact_scope] += 1 - clone_groups = [*clone_functions, *clone_blocks, *clone_segments] - clone_suppressed_map = _as_mapping(suppressed_clone_groups) - suppressed_functions = len(_as_sequence(clone_suppressed_map.get("function"))) - suppressed_blocks = len(_as_sequence(clone_suppressed_map.get("block"))) - suppressed_segments = len(_as_sequence(clone_suppressed_map.get("segment"))) - suppressed_clone_total = ( - suppressed_functions + suppressed_blocks + suppressed_segments - ) - clones_summary: dict[str, object] = { - "functions": len(clone_functions), - "blocks": len(clone_blocks), - "segments": len(clone_segments), - CLONE_NOVELTY_NEW: sum( - 1 - for group in clone_groups - if str(group.get("novelty", "")) == CLONE_NOVELTY_NEW - ), - CLONE_NOVELTY_KNOWN: sum( - 1 - for group in clone_groups - if str(group.get("novelty", "")) == CLONE_NOVELTY_KNOWN - ), - } - if suppressed_clone_total > 0: - clones_summary.update( - { - "suppressed": suppressed_clone_total, - "suppressed_functions": suppressed_functions, - "suppressed_blocks": suppressed_blocks, - "suppressed_segments": suppressed_segments, - } - ) - suppressed_summary = { - FAMILY_DEAD_CODE: max(0, dead_code_suppressed), - } - if suppressed_clone_total > 0: - suppressed_summary[FAMILY_CLONES] = suppressed_clone_total - return { - "total": len(flat_groups), - "families": { - FAMILY_CLONES: len(clone_groups), - FAMILY_STRUCTURAL: len(structural_groups), - FAMILY_DEAD_CODE: len(dead_code_groups), - "design": len(design_groups), - }, - "severity": severity_counts, - "impact_scope": source_scope_counts, - "clones": clones_summary, - "suppressed": suppressed_summary, - } - - -def _sort_flat_finding_ids( - groups: Sequence[Mapping[str, object]], -) -> list[str]: - ordered = sorted( - groups, - key=lambda group: ( - -_as_float(group.get("priority")), - SEVERITY_ORDER.get(str(group.get("severity", SEVERITY_INFO)), 9), - -_as_int(_as_mapping(group.get("spread")).get("files")), - -_as_int(_as_mapping(group.get("spread")).get("functions")), - -_as_int(group.get("count")), - str(group.get("id", "")), - ), - ) - return [str(group["id"]) for group in ordered] - - -def _sort_highest_spread_ids( - groups: Sequence[Mapping[str, object]], -) -> list[str]: - ordered = sorted( - groups, - key=lambda group: ( - 
-_as_int(_as_mapping(group.get("spread")).get("files")), - -_as_int(_as_mapping(group.get("spread")).get("functions")), - -_as_int(group.get("count")), - -_as_float(group.get("priority")), - str(group.get("id", "")), - ), - ) - return [str(group["id"]) for group in ordered] - - -def _health_snapshot(metrics_payload: Mapping[str, object]) -> dict[str, object]: - health = _as_mapping(_as_mapping(metrics_payload.get("families")).get("health")) - summary = _as_mapping(health.get("summary")) - dimensions = { - str(key): _as_int(value) - for key, value in _as_mapping(summary.get("dimensions")).items() - } - strongest = None - weakest = None - if dimensions: - strongest = min( - sorted(dimensions), - key=lambda key: (-dimensions[key], key), - ) - weakest = min( - sorted(dimensions), - key=lambda key: (dimensions[key], key), - ) - return { - "score": _as_int(summary.get("score")), - "grade": str(summary.get("grade", "")), - "strongest_dimension": strongest, - "weakest_dimension": weakest, - } - - -def _combined_impact_scope(groups: Sequence[Mapping[str, object]]) -> str: - impact_scopes = { - str( - _as_mapping(group.get("source_scope")).get( - "impact_scope", - IMPACT_SCOPE_NON_RUNTIME, - ) - ) - for group in groups - } - if not impact_scopes: - return IMPACT_SCOPE_NON_RUNTIME - if len(impact_scopes) == 1: - return next(iter(impact_scopes)) - return IMPACT_SCOPE_MIXED - - -def _top_risks( - *, - dead_code_groups: Sequence[Mapping[str, object]], - design_groups: Sequence[Mapping[str, object]], - structural_groups: Sequence[Mapping[str, object]], - clone_groups: Sequence[Mapping[str, object]], -) -> list[dict[str, object]]: - risks: list[dict[str, object]] = [] - - if dead_code_groups: - label = ( - "1 dead code item" - if len(dead_code_groups) == 1 - else f"{len(dead_code_groups)} dead code items" - ) - risks.append( - { - "kind": "family_summary", - "family": FAMILY_DEAD_CODE, - "count": len(dead_code_groups), - "scope": IMPACT_SCOPE_MIXED - if len( - { - _as_mapping(group.get("source_scope")).get("impact_scope") - for group in dead_code_groups - } - ) - > 1 - else str( - _as_mapping(dead_code_groups[0].get("source_scope")).get( - "impact_scope", - IMPACT_SCOPE_NON_RUNTIME, - ) - ), - "label": label, - } - ) - - low_cohesion = [ - group - for group in design_groups - if str(group.get("category", "")) == CATEGORY_COHESION - ] - if low_cohesion: - label = ( - "1 low cohesion class" - if len(low_cohesion) == 1 - else f"{len(low_cohesion)} low cohesion classes" - ) - risks.append( - { - "kind": "family_summary", - "family": FAMILY_DESIGN, - "category": CATEGORY_COHESION, - "count": len(low_cohesion), - "scope": _combined_impact_scope(low_cohesion), - "label": label, - } - ) - - production_structural = [ - group - for group in structural_groups - if str(_as_mapping(group.get("source_scope")).get("impact_scope")) - in {IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_MIXED} - ] - if production_structural: - label = ( - "1 structural finding in production code" - if len(production_structural) == 1 - else ( - f"{len(production_structural)} structural findings in production code" - ) - ) - risks.append( - { - "kind": "family_summary", - "family": FAMILY_STRUCTURAL, - "count": len(production_structural), - "scope": SOURCE_KIND_PRODUCTION, - "label": label, - } - ) - - fixture_test_clones = [ - group - for group in clone_groups - if _as_mapping(group.get("source_scope")).get("impact_scope") - == IMPACT_SCOPE_NON_RUNTIME - and _as_mapping(group.get("source_scope")).get("dominant_kind") - in {SOURCE_KIND_TESTS, 
SOURCE_KIND_FIXTURES} - ] - if fixture_test_clones: - label = ( - "1 clone group in fixtures/tests" - if len(fixture_test_clones) == 1 - else f"{len(fixture_test_clones)} clone groups in fixtures/tests" - ) - risks.append( - { - "kind": "family_summary", - "family": FAMILY_CLONE, - "count": len(fixture_test_clones), - "scope": IMPACT_SCOPE_NON_RUNTIME, - "label": label, - } - ) - - return risks[:6] - - -def _build_derived_overview( - *, - findings: Mapping[str, object], - metrics_payload: Mapping[str, object], -) -> tuple[dict[str, object], dict[str, object]]: - groups = _as_mapping(findings.get("groups")) - clones = _as_mapping(groups.get(FAMILY_CLONES)) - clone_groups = [ - *_as_sequence(clones.get("functions")), - *_as_sequence(clones.get("blocks")), - *_as_sequence(clones.get("segments")), - ] - structural_groups = _as_sequence( - _as_mapping(groups.get(FAMILY_STRUCTURAL)).get("groups") - ) - dead_code_groups = _as_sequence( - _as_mapping(groups.get(FAMILY_DEAD_CODE)).get("groups") - ) - design_groups = _as_sequence(_as_mapping(groups.get("design")).get("groups")) - flat_groups = [ - *clone_groups, - *structural_groups, - *dead_code_groups, - *design_groups, - ] - dominant_kind_counts: Counter[str] = Counter( - str( - _as_mapping(_as_mapping(group).get("source_scope")).get( - "dominant_kind", - SOURCE_KIND_OTHER, - ) - ) - for group in flat_groups - ) - summary = _as_mapping(findings.get("summary")) - overview: dict[str, object] = { - "families": dict(_as_mapping(summary.get("families"))), - "top_risks": _top_risks( - dead_code_groups=[_as_mapping(group) for group in dead_code_groups], - design_groups=[_as_mapping(group) for group in design_groups], - structural_groups=[_as_mapping(group) for group in structural_groups], - clone_groups=[_as_mapping(group) for group in clone_groups], - ), - "source_scope_breakdown": { - key: dominant_kind_counts[key] - for key in ( - SOURCE_KIND_PRODUCTION, - SOURCE_KIND_TESTS, - SOURCE_KIND_FIXTURES, - SOURCE_KIND_MIXED, - SOURCE_KIND_OTHER, - ) - if dominant_kind_counts[key] > 0 - }, - "health_snapshot": _health_snapshot(metrics_payload), - "directory_hotspots": build_directory_hotspots(findings=findings), - } - hotlists: dict[str, object] = { - "most_actionable_ids": _sort_flat_finding_ids( - [ - group - for group in map(_as_mapping, flat_groups) - if str(group.get("severity")) != SEVERITY_INFO - ] - )[:5], - "highest_spread_ids": _sort_highest_spread_ids( - list(map(_as_mapping, flat_groups)) - )[:5], - "production_hotspot_ids": _sort_flat_finding_ids( - [ - group - for group in map(_as_mapping, flat_groups) - if str(_as_mapping(group.get("source_scope")).get("impact_scope")) - in {IMPACT_SCOPE_RUNTIME, IMPACT_SCOPE_MIXED} - ] - )[:5], - "test_fixture_hotspot_ids": _sort_flat_finding_ids( - [ - group - for group in map(_as_mapping, flat_groups) - if str(_as_mapping(group.get("source_scope")).get("impact_scope")) - == IMPACT_SCOPE_NON_RUNTIME - and str(_as_mapping(group.get("source_scope")).get("dominant_kind")) - in {SOURCE_KIND_TESTS, SOURCE_KIND_FIXTURES} - ] - )[:5], - } - return overview, hotlists - - -def _representative_location_rows( - suggestion: Suggestion, -) -> list[dict[str, object]]: - rows = [ - { - "relative_path": ( - location.relative_path - if ( - location.relative_path - and not _is_absolute_path(location.relative_path) - ) - else _contract_report_location_path( - location.filepath, - scan_root="", - ) - ), - "start_line": location.start_line, - "end_line": location.end_line, - "qualname": location.qualname, - "source_kind": 
location.source_kind, - } - for location in suggestion.representative_locations - ] - rows.sort( - key=lambda row: ( - str(row["relative_path"]), - _as_int(row["start_line"]), - _as_int(row["end_line"]), - str(row["qualname"]), - ) - ) - return rows[:3] - - -def _suggestion_finding_id(suggestion: Suggestion) -> str: - if suggestion.finding_family == FAMILY_CLONES: - if suggestion.fact_kind.startswith("Function"): - return clone_group_id(CLONE_KIND_FUNCTION, suggestion.subject_key) - if suggestion.fact_kind.startswith("Block"): - return clone_group_id(CLONE_KIND_BLOCK, suggestion.subject_key) - return clone_group_id(CLONE_KIND_SEGMENT, suggestion.subject_key) - if suggestion.finding_family == FAMILY_STRUCTURAL: - return structural_group_id( - suggestion.finding_kind or "duplicated_branches", - suggestion.subject_key, - ) - if suggestion.category == CATEGORY_DEAD_CODE: - return dead_code_group_id(suggestion.subject_key) - if suggestion.category in { - CATEGORY_COMPLEXITY, - CATEGORY_COUPLING, - CATEGORY_COHESION, - CATEGORY_DEPENDENCY, - }: - return design_group_id(suggestion.category, suggestion.subject_key) - return design_group_id( - suggestion.category, - suggestion.subject_key or suggestion.title, - ) - - -def _build_derived_suggestions( - suggestions: Sequence[Suggestion] | None, -) -> list[dict[str, object]]: - suggestion_rows = list(suggestions or ()) - suggestion_rows.sort( - key=lambda suggestion: ( - -suggestion.priority, - SEVERITY_ORDER.get(suggestion.severity, 9), - suggestion.title, - _suggestion_finding_id(suggestion), - ) - ) - return [ - { - "id": f"suggestion:{_suggestion_finding_id(suggestion)}", - "finding_id": _suggestion_finding_id(suggestion), - "title": suggestion.title, - "summary": suggestion.fact_summary, - "location_label": suggestion.location_label or suggestion.location, - "representative_locations": _representative_location_rows(suggestion), - "action": { - "effort": suggestion.effort, - "steps": list(suggestion.steps), - }, - } - for suggestion in suggestion_rows - ] - - -def _build_findings_payload( - *, - func_groups: GroupMapLike, - block_groups: GroupMapLike, - segment_groups: GroupMapLike, - block_facts: Mapping[str, Mapping[str, str]], - structural_findings: Sequence[StructuralFindingGroup] | None, - metrics_payload: Mapping[str, object], - baseline_trusted: bool, - new_function_group_keys: Collection[str] | None, - new_block_group_keys: Collection[str] | None, - new_segment_group_keys: Collection[str] | None, - suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None, - design_thresholds: Mapping[str, object] | None, - scan_root: str, -) -> dict[str, object]: - clone_functions = _build_clone_groups( - groups=func_groups, - kind=CLONE_KIND_FUNCTION, - baseline_trusted=baseline_trusted, - new_keys=new_function_group_keys, - block_facts=block_facts, - scan_root=scan_root, - ) - clone_blocks = _build_clone_groups( - groups=block_groups, - kind=CLONE_KIND_BLOCK, - baseline_trusted=baseline_trusted, - new_keys=new_block_group_keys, - block_facts=block_facts, - scan_root=scan_root, - ) - clone_segments = _build_clone_groups( - groups=segment_groups, - kind=CLONE_KIND_SEGMENT, - baseline_trusted=baseline_trusted, - new_keys=new_segment_group_keys, - block_facts={}, - scan_root=scan_root, - ) - structural_groups = _build_structural_groups( - structural_findings, - scan_root=scan_root, - ) - dead_code_groups = _build_dead_code_groups( - metrics_payload, - scan_root=scan_root, - ) - dead_code_family = _as_mapping( - 
_as_mapping(metrics_payload.get("families")).get(FAMILY_DEAD_CODE) - ) - dead_code_summary = _as_mapping(dead_code_family.get("summary")) - dead_code_suppressed = _as_int( - dead_code_summary.get( - "suppressed", - len(_as_sequence(dead_code_family.get("suppressed_items"))), - ) - ) - design_groups = _build_design_groups( - metrics_payload, - design_thresholds=design_thresholds, - scan_root=scan_root, - ) - suppressed_clone_payload = _build_suppressed_clone_groups( - groups=suppressed_clone_groups, - block_facts=block_facts, - scan_root=scan_root, - ) - clone_groups_payload: dict[str, object] = { - "functions": clone_functions, - "blocks": clone_blocks, - "segments": clone_segments, - } - if any(suppressed_clone_payload.values()): - clone_groups_payload["suppressed"] = { - "functions": suppressed_clone_payload[CLONE_KIND_FUNCTION], - "blocks": suppressed_clone_payload[CLONE_KIND_BLOCK], - "segments": suppressed_clone_payload[CLONE_KIND_SEGMENT], - } - return { - "summary": _findings_summary( - clone_functions=clone_functions, - clone_blocks=clone_blocks, - clone_segments=clone_segments, - structural_groups=structural_groups, - dead_code_groups=dead_code_groups, - design_groups=design_groups, - suppressed_clone_groups=suppressed_clone_payload, - dead_code_suppressed=dead_code_suppressed, - ), - "groups": { - FAMILY_CLONES: clone_groups_payload, - FAMILY_STRUCTURAL: { - "groups": structural_groups, - }, - FAMILY_DEAD_CODE: { - "groups": dead_code_groups, - }, - "design": { - "groups": design_groups, - }, - }, - } - - -def _canonical_integrity_payload( - *, - report_schema_version: str, - meta: Mapping[str, object], - inventory: Mapping[str, object], - findings: Mapping[str, object], - metrics: Mapping[str, object], -) -> dict[str, object]: - canonical_meta = { - str(key): value for key, value in meta.items() if str(key) != "runtime" - } - - def _strip_noncanonical(value: object) -> object: - if isinstance(value, Mapping): - return { - str(key): _strip_noncanonical(item) - for key, item in value.items() - if str(key) != "display_facts" - } - if isinstance(value, Sequence) and not isinstance( - value, - (str, bytes, bytearray), - ): - return [_strip_noncanonical(item) for item in value] - return value - - return { - "report_schema_version": report_schema_version, - "meta": canonical_meta, - "inventory": inventory, - "findings": _strip_noncanonical(findings), - "metrics": metrics, - } - - -def _build_integrity_payload( - *, - report_schema_version: str, - meta: Mapping[str, object], - inventory: Mapping[str, object], - findings: Mapping[str, object], - metrics: Mapping[str, object], -) -> dict[str, object]: - canonical_payload = _canonical_integrity_payload( - report_schema_version=report_schema_version, - meta=meta, - inventory=inventory, - findings=findings, - metrics=metrics, - ) - canonical_json = orjson.dumps( - canonical_payload, - option=orjson.OPT_SORT_KEYS, - ) - payload_sha = sha256(canonical_json).hexdigest() - return { - "canonicalization": { - "version": "1", - "scope": "canonical_only", - "sections": [ - "report_schema_version", - "meta", - "inventory", - "findings", - "metrics", - ], - }, - "digest": { - "verified": True, - "algorithm": "sha256", - "value": payload_sha, - }, - } - - -def build_report_document( - *, - func_groups: GroupMapLike, - block_groups: GroupMapLike, - segment_groups: GroupMapLike, - meta: Mapping[str, object] | None = None, - inventory: Mapping[str, object] | None = None, - block_facts: Mapping[str, Mapping[str, str]] | None = None, - 
new_function_group_keys: Collection[str] | None = None, - new_block_group_keys: Collection[str] | None = None, - new_segment_group_keys: Collection[str] | None = None, - suppressed_clone_groups: Sequence[SuppressedCloneGroup] | None = None, - metrics: Mapping[str, object] | None = None, - suggestions: Sequence[Suggestion] | None = None, - structural_findings: Sequence[StructuralFindingGroup] | None = None, -) -> dict[str, object]: - report_schema_version = REPORT_SCHEMA_VERSION - scan_root = str(_as_mapping(meta).get("scan_root", "")) - meta_payload = _build_meta_payload(meta, scan_root=scan_root) - design_thresholds = _as_mapping( - _as_mapping(meta_payload.get("analysis_thresholds")).get("design_findings") - ) - metrics_payload = _build_metrics_payload(metrics, scan_root=scan_root) - file_list = _collect_report_file_list( - inventory=inventory, - func_groups=func_groups, - block_groups=block_groups, - segment_groups=segment_groups, - suppressed_clone_groups=suppressed_clone_groups, - metrics=metrics, - structural_findings=structural_findings, - ) - inventory_payload = _build_inventory_payload( - inventory=inventory, - file_list=file_list, - metrics_payload=metrics_payload, - scan_root=scan_root, - ) - findings_payload = _build_findings_payload( - func_groups=func_groups, - block_groups=block_groups, - segment_groups=segment_groups, - block_facts=block_facts or {}, - structural_findings=structural_findings, - metrics_payload=metrics_payload, - baseline_trusted=_baseline_is_trusted(meta_payload), - new_function_group_keys=new_function_group_keys, - new_block_group_keys=new_block_group_keys, - new_segment_group_keys=new_segment_group_keys, - suppressed_clone_groups=suppressed_clone_groups, - design_thresholds=design_thresholds, - scan_root=scan_root, - ) - overview_payload, hotlists_payload = _build_derived_overview( - findings=findings_payload, - metrics_payload=metrics_payload, - ) - derived_payload = { - "suggestions": _build_derived_suggestions(suggestions), - "overview": overview_payload, - "hotlists": hotlists_payload, - } - integrity_payload = _build_integrity_payload( - report_schema_version=report_schema_version, - meta=meta_payload, - inventory=inventory_payload, - findings=findings_payload, - metrics=metrics_payload, - ) - return { - "report_schema_version": report_schema_version, - "meta": meta_payload, - "inventory": inventory_payload, - "findings": findings_payload, - "metrics": metrics_payload, - "derived": derived_payload, - "integrity": integrity_payload, - } diff --git a/codeclone/_cli_meta.py b/codeclone/report/meta.py similarity index 67% rename from codeclone/_cli_meta.py rename to codeclone/report/meta.py index ffa9245..cc1bfc9 100644 --- a/codeclone/_cli_meta.py +++ b/codeclone/report/meta.py @@ -8,91 +8,30 @@ import sys from datetime import datetime, timezone -from typing import TYPE_CHECKING, TypedDict +from typing import TYPE_CHECKING -from .baseline import Baseline, current_python_tag -from .contracts import ( +from ..baseline.clone_baseline import Baseline +from ..baseline.trust import current_python_tag +from ..contracts import ( DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, ) +from ..contracts.schemas import ReportMeta if TYPE_CHECKING: from pathlib import Path - from .metrics_baseline import MetricsBaseline + from ..baseline.metrics_baseline import MetricsBaseline -class AnalysisProfileMeta(TypedDict): - min_loc: int - min_stmt: int - block_min_loc: int - block_min_stmt: int - 
segment_min_loc: int - segment_min_stmt: int - - -def _current_python_version() -> str: - return f"{sys.version_info.major}.{sys.version_info.minor}" - - -def _current_report_timestamp_utc() -> str: +def current_report_timestamp_utc() -> str: return ( datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ") ) -class ReportMeta(TypedDict): - """ - Canonical report metadata contract shared by HTML, JSON, and TXT reports. - - Key semantics: - - python_version: runtime major.minor string for human readability (e.g. "3.13") - - python_tag: runtime compatibility tag used by baseline/cache contracts - (e.g. "cp313") - - baseline_*: values loaded from baseline metadata for audit/provenance - - cache_*: cache status/provenance for run transparency - """ - - codeclone_version: str - project_name: str - scan_root: str - python_version: str - python_tag: str - baseline_path: str - baseline_fingerprint_version: str | None - baseline_schema_version: str | None - baseline_python_tag: str | None - baseline_generator_name: str | None - baseline_generator_version: str | None - baseline_payload_sha256: str | None - baseline_payload_sha256_verified: bool - baseline_loaded: bool - baseline_status: str - cache_path: str - cache_used: bool - cache_status: str - cache_schema_version: str | None - files_skipped_source_io: int - metrics_baseline_path: str - metrics_baseline_loaded: bool - metrics_baseline_status: str - metrics_baseline_schema_version: str | None - metrics_baseline_payload_sha256: str | None - metrics_baseline_payload_sha256_verified: bool - health_score: int | None - health_grade: str | None - analysis_mode: str - metrics_computed: list[str] - analysis_profile: AnalysisProfileMeta - design_complexity_threshold: int - design_coupling_threshold: int - design_cohesion_threshold: int - analysis_started_at_utc: str | None - report_generated_at_utc: str - - -def _build_report_meta( +def build_report_meta( *, codeclone_version: str, scan_root: Path, @@ -179,3 +118,7 @@ def _build_report_meta( "analysis_started_at_utc": analysis_started_at_utc, "report_generated_at_utc": report_generated_at_utc, } + + +def _current_python_version() -> str: + return f"{sys.version_info.major}.{sys.version_info.minor}" diff --git a/codeclone/report/overview.py b/codeclone/report/overview.py index c8c4a3a..b0fe214 100644 --- a/codeclone/report/overview.py +++ b/codeclone/report/overview.py @@ -8,12 +8,10 @@ from collections import Counter from collections.abc import Mapping, Sequence +from dataclasses import dataclass, field from pathlib import PurePosixPath -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING -from .._coerce import as_int as _as_int -from .._coerce import as_mapping as _as_mapping -from .._coerce import as_sequence as _as_sequence from ..domain.findings import ( CATEGORY_COHESION, CATEGORY_COMPLEXITY, @@ -44,6 +42,9 @@ BLOCK_HINT_ASSERT_ONLY, BLOCK_PATTERN_REPEATED_STMT_HASH, ) +from ..utils.coerce import as_int as _as_int +from ..utils.coerce import as_mapping as _as_mapping +from ..utils.coerce import as_sequence as _as_sequence from .derived import ( classify_source_kind, format_spread_location_label, @@ -125,6 +126,25 @@ def _flatten_findings(findings: Mapping[str, object]) -> list[Mapping[str, objec ) +@dataclass(slots=True) +class _DirectoryContribution: + affected_items: int = 0 + files: set[str] = field(default_factory=set) + locations: list[dict[str, object]] = field(default_factory=list) + + +@dataclass(slots=True) +class _DirectoryBucketRow: + path: str 
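# (editor's note, not part of the patch: the set-typed fields below are
#  deliberate dedupe containers -- each finding group id and each file path
#  is counted at most once per directory row, so repeated items from the
#  same group cannot inflate the hotspot counts.)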
+ finding_ids: set[str] = field(default_factory=set) + affected_items: int = 0 + files: set[str] = field(default_factory=set) + locations: list[dict[str, object]] = field(default_factory=list) + kind_breakdown_ids: dict[str, set[str]] = field( + default_factory=lambda: {key: set() for key in _DIRECTORY_KIND_BREAKDOWN_KEYS} + ) + + def _directory_bucket_keys(group: Mapping[str, object]) -> tuple[str, ...]: family = str(group.get("family", "")).strip() category = str(group.get("category", "")).strip() @@ -221,8 +241,8 @@ def _overview_directory_label( def _directory_contributions( group: Mapping[str, object], -) -> dict[str, dict[str, object]]: - contributions: dict[str, dict[str, object]] = {} +) -> dict[str, _DirectoryContribution]: + contributions: dict[str, _DirectoryContribution] = {} for item in map(_as_mapping, _as_sequence(group.get("items"))): relative_path = _directory_relative_path(item) if relative_path is None: @@ -231,25 +251,16 @@ def _directory_contributions( relative_path ) directory = _overview_directory_label(relative_path, source_kind=source_kind) - entry = contributions.setdefault( - directory, - { - "affected_items": 0, - "files": set(), - "locations": [], - }, - ) - entry["affected_items"] = _as_int(entry.get("affected_items")) + 1 - cast(set[str], entry["files"]).add(relative_path) - cast(list[dict[str, object]], entry["locations"]).append( - {"source_kind": source_kind} - ) + entry = contributions.setdefault(directory, _DirectoryContribution()) + entry.affected_items += 1 + entry.files.add(relative_path) + entry.locations.append({"source_kind": source_kind}) return contributions def _directory_group_data( group: Mapping[str, object], -) -> tuple[str, dict[str, dict[str, object]]] | None: +) -> tuple[str, dict[str, _DirectoryContribution]] | None: group_id = str(group.get("id", "")).strip() if not group_id: return None @@ -265,7 +276,7 @@ def build_directory_hotspots( limit: int = 5, ) -> dict[str, object]: normalized_limit = max(1, _as_int(limit, 5)) - bucket_rows: dict[str, dict[str, dict[str, object]]] = { + bucket_rows: dict[str, dict[str, _DirectoryBucketRow]] = { bucket: {} for bucket in _DIRECTORY_HOTSPOT_BUCKETS } bucket_totals: Counter[str] = Counter() @@ -282,41 +293,22 @@ def build_directory_hotspots( for directory, contribution in contributions.items(): row = rows.setdefault( directory, - { - "path": directory, - "finding_ids": set(), - "affected_items": 0, - "files": set(), - "locations": [], - "kind_breakdown_ids": { - key: set() for key in _DIRECTORY_KIND_BREAKDOWN_KEYS - }, - }, - ) - cast(set[str], row["finding_ids"]).add(group_id) - row["affected_items"] = _as_int(row.get("affected_items")) + _as_int( - contribution.get("affected_items") - ) - cast(set[str], row["files"]).update( - cast(set[str], contribution["files"]) - ) - cast(list[dict[str, object]], row["locations"]).extend( - cast(list[dict[str, object]], contribution["locations"]) + _DirectoryBucketRow(path=directory), ) + row.finding_ids.add(group_id) + row.affected_items += contribution.affected_items + row.files.update(contribution.files) + row.locations.extend(contribution.locations) if bucket == "all" and kind_key is not None: - kind_rows = cast( - dict[str, set[str]], - row["kind_breakdown_ids"], - ) - kind_rows[kind_key].add(group_id) - bucket_totals[bucket] += _as_int(contribution.get("affected_items")) - - def _row_sort_key(row: Mapping[str, object]) -> tuple[int, int, int, str]: + row.kind_breakdown_ids[kind_key].add(group_id) + bucket_totals[bucket] += contribution.affected_items + 
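
Editor's aside, not part of the patch. The ranking applied by _row_sort_key
just below is the standard negate-for-descending tuple trick: most finding
groups first, then most affected items, then most files, with the directory
path as an ascending tiebreaker. A minimal runnable sketch, assuming a
hypothetical Row stand-in that mirrors only the fields the key reads:

# --- editor sketch, not part of the diff ---
from dataclasses import dataclass, field


@dataclass(slots=True)
class Row:
    path: str
    finding_ids: set[str] = field(default_factory=set)
    affected_items: int = 0
    files: set[str] = field(default_factory=set)


rows = [
    Row("pkg/util", finding_ids={"g1"}, affected_items=5, files={"u.py"}),
    Row("pkg/core", finding_ids={"g1", "g2"}, affected_items=3, files={"c.py"}),
]
# Negated counts sort descending; the bare path string sorts ascending,
# so ties fall back to a stable, deterministic path order.
rows.sort(
    key=lambda r: (-len(r.finding_ids), -r.affected_items, -len(r.files), r.path)
)
assert [r.path for r in rows] == ["pkg/core", "pkg/util"]
# --- end editor sketch ---

Tuples compare element by element, so each negated count only matters when
every earlier component ties.
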
+ def _row_sort_key(row: _DirectoryBucketRow) -> tuple[int, int, int, str]: return ( - -len(cast(set[str], row["finding_ids"])), - -_as_int(row.get("affected_items")), - -len(cast(set[str], row["files"])), - str(row.get("path", "")), + -len(row.finding_ids), + -row.affected_items, + -len(row.files), + row.path, ) hotspots: dict[str, object] = {} @@ -326,11 +318,11 @@ def _row_sort_key(row: Mapping[str, object]) -> tuple[int, int, int, str]: total_affected_items = bucket_totals[bucket] items: list[dict[str, object]] = [] for row in bucket_items[:normalized_limit]: - finding_groups = len(cast(set[str], row["finding_ids"])) - affected_items = _as_int(row.get("affected_items")) - files = len(cast(set[str], row["files"])) + finding_groups = len(row.finding_ids) + affected_items = row.affected_items + files = len(row.files) item = { - "path": str(row.get("path", ".")), + "path": row.path, "finding_groups": finding_groups, "affected_items": affected_items, "files": files, @@ -340,13 +332,11 @@ def _row_sort_key(row: Mapping[str, object]) -> tuple[int, int, int, str]: ) if total_affected_items > 0 else 0.0, - "source_scope": source_scope_from_locations( - cast(list[dict[str, object]], row["locations"]) - ), + "source_scope": source_scope_from_locations(row.locations), } if bucket == "all": item["kind_breakdown"] = { - key: len(cast(dict[str, set[str]], row["kind_breakdown_ids"])[key]) + key: len(row.kind_breakdown_ids[key]) for key in _DIRECTORY_KIND_BREAKDOWN_KEYS } items.append(item) diff --git a/codeclone/report/renderers/__init__.py b/codeclone/report/renderers/__init__.py new file mode 100644 index 0000000..9135843 --- /dev/null +++ b/codeclone/report/renderers/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/report/renderers/json.py b/codeclone/report/renderers/json.py new file mode 100644 index 0000000..6052bcd --- /dev/null +++ b/codeclone/report/renderers/json.py @@ -0,0 +1,18 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Mapping + +import orjson + + +def render_json_report_document(payload: Mapping[str, object]) -> str: + return orjson.dumps(payload, option=orjson.OPT_INDENT_2).decode("utf-8") + + +__all__ = ["render_json_report_document"] diff --git a/codeclone/report/markdown.py b/codeclone/report/renderers/markdown.py similarity index 91% rename from codeclone/report/markdown.py rename to codeclone/report/renderers/markdown.py index 6ad5c2e..cf64a69 100644 --- a/codeclone/report/markdown.py +++ b/codeclone/report/renderers/markdown.py @@ -9,14 +9,13 @@ from collections.abc import Collection, Mapping, Sequence from typing import TYPE_CHECKING -from .._coerce import as_float, as_int, as_mapping, as_sequence -from ..domain.findings import FAMILY_CLONE, FAMILY_DEAD_CODE, FAMILY_STRUCTURAL -from ._formatting import format_spread_text -from .json_contract import build_report_document +from ...domain.findings import FAMILY_CLONE, FAMILY_DEAD_CODE, FAMILY_STRUCTURAL +from ...utils.coerce import as_float, as_int, as_mapping, as_sequence +from .._formatting import format_spread_text if TYPE_CHECKING: - from ..models import StructuralFindingGroup, Suggestion, SuppressedCloneGroup - from .types import GroupMapLike + from ...models import StructuralFindingGroup, Suggestion, SuppressedCloneGroup + from ..types import GroupMapLike MARKDOWN_SCHEMA_VERSION = "1.0" _MAX_FINDING_LOCATIONS = 5 @@ -544,10 +543,30 @@ def render_markdown_report_document(payload: Mapping[str, object]) -> str: "complexity_total", ), ), + ( + "security-surfaces", + "Security Surfaces", + ( + "items", + "modules", + "exact_items", + "category_count", + "production", + "tests", + ), + ( + "category", + "capability", + "source_kind", + "evidence_symbol", + "qualname", + "location_scope", + ), + ), ( "dependencies", "Dependencies", - ("modules", "edges", "cycles", "max_depth"), + ("modules", "edges", "cycles", "avg_depth", "p95_depth", "max_depth"), ("source", "target", "import_type", "line"), ), ( @@ -566,6 +585,8 @@ def render_markdown_report_document(payload: Mapping[str, object]) -> str: ) if family_key == "coverage-join": family_key = "coverage_join" + if family_key == "security-surfaces": + family_key = "security_surfaces" family_payload = _as_mapping(metrics_families.get(family_key)) if not family_payload and family_key == "overloaded_modules": family_payload = _as_mapping(metrics_families.get("god_modules")) @@ -641,19 +662,34 @@ def to_markdown_report( suggestions: Collection[Suggestion] | None = None, structural_findings: Sequence[StructuralFindingGroup] | None = None, ) -> str: - payload = report_document or build_report_document( - func_groups=func_groups, - block_groups=block_groups, - segment_groups=segment_groups, - meta=meta, - inventory=inventory, - block_facts=block_facts or {}, - new_function_group_keys=new_function_group_keys, - new_block_group_keys=new_block_group_keys, - new_segment_group_keys=new_segment_group_keys, - suppressed_clone_groups=suppressed_clone_groups, - metrics=metrics, - suggestions=tuple(suggestions or ()), - structural_findings=tuple(structural_findings or ()), - ) + payload = report_document + if payload is None: + from ..document.builder import build_report_document + + payload = build_report_document( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + meta=meta, + inventory=inventory, + block_facts=block_facts or {}, + 
new_function_group_keys=new_function_group_keys, + new_block_group_keys=new_block_group_keys, + new_segment_group_keys=new_segment_group_keys, + suppressed_clone_groups=suppressed_clone_groups, + metrics=metrics, + suggestions=tuple(suggestions or ()), + structural_findings=tuple(structural_findings or ()), + ) return render_markdown_report_document(payload) + + +__all__ = [ + "MARKDOWN_SCHEMA_VERSION", + "_append_findings_section", + "_append_metric_items", + "_as_float", + "_location_text", + "render_markdown_report_document", + "to_markdown_report", +] diff --git a/codeclone/report/sarif.py b/codeclone/report/renderers/sarif.py similarity index 94% rename from codeclone/report/sarif.py rename to codeclone/report/renderers/sarif.py index ec2177d..ba443c8 100644 --- a/codeclone/report/sarif.py +++ b/codeclone/report/renderers/sarif.py @@ -10,16 +10,12 @@ from collections.abc import Collection, Mapping, Sequence from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import orjson -from .._coerce import as_float as _as_float -from .._coerce import as_int as _as_int -from .._coerce import as_mapping as _as_mapping -from .._coerce import as_sequence as _as_sequence -from ..contracts import DOCS_URL, REPOSITORY_URL -from ..domain.findings import ( +from ...contracts import DOCS_URL, REPOSITORY_URL +from ...domain.findings import ( CATEGORY_COHESION, CATEGORY_COMPLEXITY, CATEGORY_COUPLING, @@ -46,17 +42,20 @@ SYMBOL_KIND_FUNCTION, SYMBOL_KIND_METHOD, ) -from ..domain.quality import ( +from ...domain.quality import ( CONFIDENCE_HIGH, CONFIDENCE_MEDIUM, SEVERITY_CRITICAL, SEVERITY_WARNING, ) -from .json_contract import build_report_document +from ...utils.coerce import as_float as _as_float +from ...utils.coerce import as_int as _as_int +from ...utils.coerce import as_mapping as _as_mapping +from ...utils.coerce import as_sequence as _as_sequence if TYPE_CHECKING: - from ..models import StructuralFindingGroup, Suggestion - from .types import GroupMapLike + from ...models import StructuralFindingGroup, Suggestion + from ..types import GroupMapLike SARIF_VERSION = "2.1.0" SARIF_PROFILE_VERSION = "1.0" @@ -194,7 +193,7 @@ def _artifact_catalog( } ) artifact_index_map = {path: index for index, path in enumerate(artifact_paths)} - artifacts = [ + artifacts: list[dict[str, object]] = [ { "location": { "uri": path, @@ -203,7 +202,7 @@ def _artifact_catalog( } for path in artifact_paths ] - return cast(list[dict[str, object]], artifacts), artifact_index_map + return artifacts, artifact_index_map def _clone_rule_spec(category: str) -> _RuleSpec: @@ -807,10 +806,10 @@ def _result_entry( group=group, primary_item=primary_item, ), - "properties": _result_properties(group), } + properties = _result_properties(group) + result["properties"] = properties if primary_item: - properties = cast(dict[str, object], result["properties"]) properties.update(_primary_location_properties(primary_item)) baseline_state = _baseline_state(group) if baseline_state: @@ -973,18 +972,42 @@ def to_sarif_report( suggestions: Collection[Suggestion] | None = None, structural_findings: Sequence[StructuralFindingGroup] | None = None, ) -> str: - payload = report_document or build_report_document( - func_groups=func_groups, - block_groups=block_groups, - segment_groups=segment_groups, - meta=meta, - inventory=inventory, - block_facts=block_facts or {}, - new_function_group_keys=new_function_group_keys, - new_block_group_keys=new_block_group_keys, - 
new_segment_group_keys=new_segment_group_keys, - metrics=metrics, - suggestions=tuple(suggestions or ()), - structural_findings=tuple(structural_findings or ()), - ) + payload = report_document + if payload is None: + from ..document.builder import build_report_document + + payload = build_report_document( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + meta=meta, + inventory=inventory, + block_facts=block_facts or {}, + new_function_group_keys=new_function_group_keys, + new_block_group_keys=new_block_group_keys, + new_segment_group_keys=new_segment_group_keys, + metrics=metrics, + suggestions=tuple(suggestions or ()), + structural_findings=tuple(structural_findings or ()), + ) return render_sarif_report_document(payload) + + +__all__ = [ + "_baseline_state", + "_location_entry", + "_location_message", + "_logical_locations", + "_partial_fingerprints", + "_primary_location_properties", + "_result_entry", + "_result_message", + "_result_properties", + "_rule_name", + "_rule_spec", + "_scan_root_uri", + "_severity_to_level", + "_text", + "render_sarif_report_document", + "to_sarif_report", +] diff --git a/codeclone/report/serialize.py b/codeclone/report/renderers/text.py similarity index 86% rename from codeclone/report/serialize.py rename to codeclone/report/renderers/text.py index 80c37d4..a3f2a9d 100644 --- a/codeclone/report/serialize.py +++ b/codeclone/report/renderers/text.py @@ -8,21 +8,15 @@ from collections.abc import Mapping, Sequence -import orjson - -from .._coerce import as_int, as_mapping, as_sequence -from ..domain.source_scope import IMPACT_SCOPE_NON_RUNTIME, SOURCE_KIND_OTHER -from ._formatting import format_spread_text +from ...domain.source_scope import IMPACT_SCOPE_NON_RUNTIME, SOURCE_KIND_OTHER +from ...utils.coerce import as_int, as_mapping, as_sequence +from .._formatting import format_spread_text _as_int = as_int _as_mapping = as_mapping _as_sequence = as_sequence -def render_json_report_document(payload: Mapping[str, object]) -> str: - return orjson.dumps(payload, option=orjson.OPT_INDENT_2).decode("utf-8") - - def format_meta_text_value(value: object) -> str: if isinstance(value, bool): return "true" if value else "false" @@ -499,6 +493,224 @@ def _append_overview( ) +def _append_metrics_summary_lines( + lines: list[str], + *, + metrics_summary: Mapping[str, object], +) -> None: + for family_name in ( + "complexity", + "coupling", + "cohesion", + "coverage_join", + "overloaded_modules", + "security_surfaces", + "dependencies", + "dead_code", + "health", + ): + family_summary = _as_mapping(metrics_summary.get(family_name)) + if family_name == "coverage_join" and not family_summary: + continue + keys: Sequence[str] + match family_name: + case "complexity" | "coupling": + keys = ("total", "average", "max", "high_risk") + case "cohesion": + keys = ("total", "average", "max", "low_cohesion") + case "coverage_join": + keys = ( + "status", + "source", + "units", + "measured_units", + "overall_permille", + "coverage_hotspots", + "scope_gap_hotspots", + "hotspot_threshold_percent", + ) + case "dependencies": + keys = ( + "modules", + "edges", + "cycles", + "avg_depth", + "p95_depth", + "max_depth", + ) + case "overloaded_modules": + keys = ( + "total", + "candidates", + "population_status", + "top_score", + "average_score", + ) + case "security_surfaces": + keys = ( + "items", + "modules", + "exact_items", + "category_count", + "production", + "tests", + ) + case "dead_code": + keys = ("total", "high_confidence", "suppressed") + 
case _: + keys = ("score", "grade") + lines.append(f"{family_name}: {_format_key_values(family_summary, keys)}") + + +def _append_top_metric_family( + lines: list[str], + *, + title: str, + items: Sequence[object], + key_order: Sequence[str], +) -> None: + lines.extend(["", title]) + rows = list(map(_as_mapping, items[:10])) + if not rows: + lines.append("(none)") + return + lines.extend("- " + _format_key_values(item, key_order) for item in rows) + + +def _append_metric_family_sections( + lines: list[str], + *, + metrics_families: Mapping[str, object], +) -> None: + coverage_join_family = _as_mapping(metrics_families.get("coverage_join")) + if coverage_join_family: + _append_top_metric_family( + lines, + title="COVERAGE JOIN (top 10)", + items=_as_sequence(coverage_join_family.get("items")), + key_order=( + "relative_path", + "qualname", + "coverage_status", + "risk", + "coverage_permille", + "cyclomatic_complexity", + "coverage_hotspot", + "scope_gap_hotspot", + ), + ) + + overloaded_modules_family = _as_mapping(metrics_families.get("overloaded_modules")) + if not overloaded_modules_family: + overloaded_modules_family = _as_mapping(metrics_families.get("god_modules")) + _append_top_metric_family( + lines, + title="OVERLOADED MODULES (top 10)", + items=_as_sequence(overloaded_modules_family.get("items")), + key_order=( + "module", + "relative_path", + "source_kind", + "score", + "candidate_status", + "loc", + "fan_in", + "fan_out", + "complexity_total", + ), + ) + + security_surfaces_family = _as_mapping(metrics_families.get("security_surfaces")) + _append_top_metric_family( + lines, + title="SECURITY SURFACES (top 10)", + items=_as_sequence(security_surfaces_family.get("items")), + key_order=( + "category", + "capability", + "source_kind", + "evidence_symbol", + "relative_path", + "qualname", + "location_scope", + ), + ) + + +def _append_findings_sections( + lines: list[str], + *, + findings_groups: Mapping[str, object], + clone_groups: Mapping[str, object], + suppressed_clone_groups: Mapping[str, object], + metrics_families: Mapping[str, object], +) -> None: + for title, group_key, metric_name in ( + ("FUNCTION CLONES", "functions", "loc"), + ("BLOCK CLONES", "blocks", "size"), + ("SEGMENT CLONES", "segments", "size"), + ): + groups = _as_sequence(clone_groups.get(group_key)) + lines.append("") + _append_clone_section( + lines, + title=title, + groups=groups, + novelty="new", + metric_name=metric_name, + ) + lines.append("") + _append_clone_section( + lines, + title=title, + groups=groups, + novelty="known", + metric_name=metric_name, + ) + + if suppressed_clone_groups: + for title, group_key, metric_name in ( + ("SUPPRESSED FUNCTION CLONES", "functions", "loc"), + ("SUPPRESSED BLOCK CLONES", "blocks", "size"), + ("SUPPRESSED SEGMENT CLONES", "segments", "size"), + ): + lines.append("") + _append_suppressed_clone_section( + lines, + title=title, + groups=_as_sequence(suppressed_clone_groups.get(group_key)), + metric_name=metric_name, + ) + + lines.append("") + _append_structural_findings( + lines, + _as_sequence(_as_mapping(findings_groups.get("structural")).get("groups")), + ) + lines.append("") + _append_single_item_findings( + lines, + title="DEAD CODE FINDINGS", + groups=_as_sequence( + _as_mapping(findings_groups.get("dead_code")).get("groups") + ), + fact_keys=("kind", "confidence"), + ) + lines.append("") + dead_code_family = _as_mapping(metrics_families.get("dead_code")) + _append_suppressed_dead_code_items( + lines, + 
items=_as_sequence(dead_code_family.get("suppressed_items")), + ) + lines.append("") + _append_single_item_findings( + lines, + title="DESIGN FINDINGS", + groups=_as_sequence(_as_mapping(findings_groups.get("design")).get("groups")), + fact_keys=("lcom4", "method_count", "instance_var_count", "fan_out", "risk"), + ) + + def render_text_report_document(payload: Mapping[str, object]) -> str: meta_payload = _as_mapping(payload.get("meta")) baseline = _as_mapping(meta_payload.get("baseline")) @@ -647,216 +859,20 @@ def render_text_report_document(payload: Mapping[str, object]) -> str: "METRICS SUMMARY", ] ) - for family_name in ( - "complexity", - "coupling", - "cohesion", - "coverage_join", - "overloaded_modules", - "dependencies", - "dead_code", - "health", - ): - family_summary = _as_mapping(metrics_summary.get(family_name)) - if family_name == "coverage_join" and not family_summary: - continue - keys: Sequence[str] - match family_name: - case "complexity" | "coupling": - keys = ("total", "average", "max", "high_risk") - case "cohesion": - keys = ("total", "average", "max", "low_cohesion") - case "coverage_join": - keys = ( - "status", - "source", - "units", - "measured_units", - "overall_permille", - "coverage_hotspots", - "scope_gap_hotspots", - "hotspot_threshold_percent", - ) - case "dependencies": - keys = ("modules", "edges", "cycles", "max_depth") - case "overloaded_modules": - keys = ( - "total", - "candidates", - "population_status", - "top_score", - "average_score", - ) - case "dead_code": - keys = ("total", "high_confidence", "suppressed") - case _: - keys = ("score", "grade") - lines.append(f"{family_name}: {_format_key_values(family_summary, keys)}") - - coverage_join_family = _as_mapping(metrics_families.get("coverage_join")) - coverage_join_items = _as_sequence(coverage_join_family.get("items")) - if coverage_join_family: - lines.extend( - [ - "", - "COVERAGE JOIN (top 10)", - ] - ) - if not coverage_join_items: - lines.append("(none)") - else: - lines.extend( - "- " - + _format_key_values( - item, - ( - "relative_path", - "qualname", - "coverage_status", - "risk", - "coverage_permille", - "cyclomatic_complexity", - "coverage_hotspot", - "scope_gap_hotspot", - ), - ) - for item in map(_as_mapping, coverage_join_items[:10]) - ) - - overloaded_modules_family = _as_mapping(metrics_families.get("overloaded_modules")) - if not overloaded_modules_family: - overloaded_modules_family = _as_mapping(metrics_families.get("god_modules")) - overloaded_module_items = _as_sequence(overloaded_modules_family.get("items")) - lines.extend( - [ - "", - "OVERLOADED MODULES (top 10)", - ] - ) - if not overloaded_module_items: - lines.append("(none)") - else: - lines.extend( - "- " - + _format_key_values( - item, - ( - "module", - "relative_path", - "source_kind", - "score", - "candidate_status", - "loc", - "fan_in", - "fan_out", - "complexity_total", - ), - ) - for item in map(_as_mapping, overloaded_module_items[:10]) - ) + _append_metrics_summary_lines(lines, metrics_summary=metrics_summary) + _append_metric_family_sections(lines, metrics_families=metrics_families) lines.append("") _append_overview(lines, overview, hotlists) lines.append("") _append_suggestions(lines, suggestions=suggestions_payload, findings=findings) - - lines.append("") - _append_clone_section( - lines, - title="FUNCTION CLONES", - groups=_as_sequence(clone_groups.get("functions")), - novelty="new", - metric_name="loc", - ) - lines.append("") - _append_clone_section( - lines, - title="FUNCTION CLONES", - 
groups=_as_sequence(clone_groups.get("functions")), - novelty="known", - metric_name="loc", - ) - lines.append("") - _append_clone_section( - lines, - title="BLOCK CLONES", - groups=_as_sequence(clone_groups.get("blocks")), - novelty="new", - metric_name="size", - ) - lines.append("") - _append_clone_section( - lines, - title="BLOCK CLONES", - groups=_as_sequence(clone_groups.get("blocks")), - novelty="known", - metric_name="size", - ) - lines.append("") - _append_clone_section( - lines, - title="SEGMENT CLONES", - groups=_as_sequence(clone_groups.get("segments")), - novelty="new", - metric_name="size", - ) - lines.append("") - _append_clone_section( + _append_findings_sections( lines, - title="SEGMENT CLONES", - groups=_as_sequence(clone_groups.get("segments")), - novelty="known", - metric_name="size", - ) - if suppressed_clone_groups: - lines.append("") - _append_suppressed_clone_section( - lines, - title="SUPPRESSED FUNCTION CLONES", - groups=_as_sequence(suppressed_clone_groups.get("functions")), - metric_name="loc", - ) - lines.append("") - _append_suppressed_clone_section( - lines, - title="SUPPRESSED BLOCK CLONES", - groups=_as_sequence(suppressed_clone_groups.get("blocks")), - metric_name="size", - ) - lines.append("") - _append_suppressed_clone_section( - lines, - title="SUPPRESSED SEGMENT CLONES", - groups=_as_sequence(suppressed_clone_groups.get("segments")), - metric_name="size", - ) - lines.append("") - _append_structural_findings( - lines, - _as_sequence(_as_mapping(findings_groups.get("structural")).get("groups")), - ) - lines.append("") - _append_single_item_findings( - lines, - title="DEAD CODE FINDINGS", - groups=_as_sequence( - _as_mapping(findings_groups.get("dead_code")).get("groups") - ), - fact_keys=("kind", "confidence"), - ) - lines.append("") - dead_code_family = _as_mapping(metrics_families.get("dead_code")) - _append_suppressed_dead_code_items( - lines, - items=_as_sequence(dead_code_family.get("suppressed_items")), - ) - lines.append("") - _append_single_item_findings( - lines, - title="DESIGN FINDINGS", - groups=_as_sequence(_as_mapping(findings_groups.get("design")).get("groups")), - fact_keys=("lcom4", "method_count", "instance_var_count", "fan_out", "risk"), + findings_groups=findings_groups, + clone_groups=clone_groups, + suppressed_clone_groups=suppressed_clone_groups, + metrics_families=metrics_families, ) lines.extend( [ @@ -876,3 +892,15 @@ def render_text_report_document(payload: Mapping[str, object]) -> str: ) return "\n".join(lines).rstrip() + "\n" + + +__all__ = [ + "_append_clone_section", + "_append_single_item_findings", + "_append_structural_findings", + "_append_suggestions", + "_append_suppressed_dead_code_items", + "_as_int", + "_structural_kind_label", + "render_text_report_document", +] diff --git a/codeclone/report/suggestions.py b/codeclone/report/suggestions.py index 3715353..85ff186 100644 --- a/codeclone/report/suggestions.py +++ b/codeclone/report/suggestions.py @@ -8,7 +8,6 @@ from typing import TYPE_CHECKING, Literal -from .._coerce import as_int, as_str from ..domain.findings import ( CATEGORY_CLONE, CATEGORY_COHESION, @@ -36,6 +35,7 @@ SEVERITY_RANK, SEVERITY_WARNING, ) +from ..findings.structural.detectors import normalize_structural_findings from ..models import ( ClassMetrics, GroupItemLike, @@ -49,7 +49,7 @@ BLOCK_HINT_ASSERT_ONLY, BLOCK_PATTERN_REPEATED_STMT_HASH, ) -from ..structural_findings import normalize_structural_findings +from ..utils.coerce import as_int, as_str from .derived import ( combine_source_kinds, 
format_group_location_label, diff --git a/codeclone/scanner.py b/codeclone/scanner/__init__.py similarity index 98% rename from codeclone/scanner.py rename to codeclone/scanner/__init__.py index a9c65a9..8f05ffc 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner/__init__.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from .errors import ValidationError +from ..contracts.errors import ValidationError if TYPE_CHECKING: from collections.abc import Iterable diff --git a/codeclone/surfaces/__init__.py b/codeclone/surfaces/__init__.py new file mode 100644 index 0000000..557317f --- /dev/null +++ b/codeclone/surfaces/__init__.py @@ -0,0 +1,4 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 diff --git a/codeclone/surfaces/cli/__init__.py b/codeclone/surfaces/cli/__init__.py new file mode 100644 index 0000000..9135843 --- /dev/null +++ b/codeclone/surfaces/cli/__init__.py @@ -0,0 +1,5 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy diff --git a/codeclone/surfaces/cli/attrs.py b/codeclone/surfaces/cli/attrs.py new file mode 100644 index 0000000..b450862 --- /dev/null +++ b/codeclone/surfaces/cli/attrs.py @@ -0,0 +1,42 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from pathlib import Path + + +def bool_attr(obj: object, name: str) -> bool: + return bool(getattr(obj, name, False)) + + +def int_attr(obj: object, name: str, default: int = 0) -> int: + value = getattr(obj, name, default) + if isinstance(value, bool): + return default + if isinstance(value, int): + return value + return default + + +def optional_text_attr(obj: object, name: str) -> str | None: + value = getattr(obj, name, None) + if value is None: + return None + if isinstance(value, Path): + return str(value) + if isinstance(value, str): + return value + return None + + +def text_attr(obj: object, name: str, default: str = "") -> str: + value = optional_text_attr(obj, name) + return default if value is None else value + + +def set_bool_attr(obj: object, name: str, value: bool) -> None: + setattr(obj, name, value) diff --git a/codeclone/_cli_baselines.py b/codeclone/surfaces/cli/baseline_state.py similarity index 82% rename from codeclone/_cli_baselines.py rename to codeclone/surfaces/cli/baseline_state.py index 2be5a59..429dcd4 100644 --- a/codeclone/_cli_baselines.py +++ b/codeclone/surfaces/cli/baseline_state.py @@ -8,40 +8,49 @@ import sys from dataclasses import dataclass -from json import JSONDecodeError from pathlib import Path from typing import TYPE_CHECKING, Protocol -import orjson - -from . import ui_messages as ui -from .baseline import ( +from ... import __version__ +from ... 
import ui_messages as ui +from ...baseline import ( BASELINE_UNTRUSTED_STATUSES, Baseline, BaselineStatus, coerce_baseline_status, current_python_tag, ) -from .contracts import ( - BASELINE_FINGERPRINT_VERSION, - BASELINE_SCHEMA_VERSION, - ExitCode, -) -from .errors import BaselineValidationError -from .metrics_baseline import ( +from ...baseline.metrics_baseline import ( METRICS_BASELINE_UNTRUSTED_STATUSES, MetricsBaseline, + MetricsBaselineSectionProbe, MetricsBaselineStatus, coerce_metrics_baseline_status, + probe_metrics_baseline_section, +) +from ...contracts import ( + BASELINE_FINGERPRINT_VERSION, + BASELINE_SCHEMA_VERSION, + ExitCode, ) +from ...contracts.errors import BaselineValidationError +from . import state as cli_state +from .types import CLIArgsLike, require_status_console if TYPE_CHECKING: - from .models import GroupMapLike, ProjectMetrics + from ...core._types import AnalysisResult + from ...models import GroupMapLike, ProjectMetrics __all__ = [ "CloneBaselineState", "MetricsBaselineSectionProbe", "MetricsBaselineState", + "_CloneBaselineState", + "_MetricsBaselineSectionProbe", + "_MetricsBaselineState", + "_probe_metrics_baseline_section", + "_resolve_clone_baseline_state", + "_resolve_metrics_baseline_state", "probe_metrics_baseline_section", "resolve_clone_baseline_state", "resolve_metrics_baseline_state", @@ -94,35 +103,9 @@ class _MetricsBaselineRuntime: trusted_for_diff: bool = False -@dataclass(frozen=True, slots=True) -class MetricsBaselineSectionProbe: - has_metrics_section: bool - payload: dict[str, object] | None - - -def probe_metrics_baseline_section(path: Path) -> MetricsBaselineSectionProbe: - if not path.exists(): - return MetricsBaselineSectionProbe( - has_metrics_section=False, - payload=None, - ) - try: - raw_payload = orjson.loads(path.read_text("utf-8")) - except (OSError, JSONDecodeError): - return MetricsBaselineSectionProbe( - has_metrics_section=True, - payload=None, - ) - if not isinstance(raw_payload, dict): - return MetricsBaselineSectionProbe( - has_metrics_section=True, - payload=None, - ) - payload = dict(raw_payload) - return MetricsBaselineSectionProbe( - has_metrics_section=("metrics" in payload), - payload=payload, - ) +_CloneBaselineState = CloneBaselineState +_MetricsBaselineSectionProbe = MetricsBaselineSectionProbe +_MetricsBaselineState = MetricsBaselineState def resolve_clone_baseline_state( @@ -227,6 +210,7 @@ def resolve_metrics_baseline_state( args: _BaselineArgs, metrics_baseline_path: Path, metrics_baseline_exists: bool, + clone_baseline_state: CloneBaselineState, baseline_updated_path: Path | None, project_metrics: ProjectMetrics | None, console: _PrinterLike, @@ -246,6 +230,7 @@ def resolve_metrics_baseline_state( _load_metrics_baseline_for_diff( args=args, metrics_baseline_exists=metrics_baseline_exists, + clone_baseline_state=clone_baseline_state, state=state, console=console, shared_baseline_payload=shared_baseline_payload, @@ -299,10 +284,17 @@ def _load_metrics_baseline_for_diff( *, args: _BaselineArgs, metrics_baseline_exists: bool, + clone_baseline_state: CloneBaselineState, state: _MetricsBaselineRuntime, console: _PrinterLike, shared_baseline_payload: dict[str, object] | None = None, ) -> None: + suppress_invalid_message = bool( + shared_baseline_payload is not None + and not clone_baseline_state.loaded + and clone_baseline_state.status in BASELINE_UNTRUSTED_STATUSES + ) + if not metrics_baseline_exists: if _metrics_baseline_gate_requested(args) and not args.update_metrics_baseline: state.failure_code = 
ExitCode.CONTRACT_ERROR @@ -325,7 +317,8 @@ def _load_metrics_baseline_for_diff( except BaselineValidationError as exc: state.status = coerce_metrics_baseline_status(exc.status) if not args.update_metrics_baseline: - console.print(ui.fmt_invalid_baseline(exc)) + if not suppress_invalid_message: + console.print(ui.fmt_invalid_baseline(exc)) if args.fail_on_new_metrics: state.failure_code = ExitCode.CONTRACT_ERROR return @@ -337,7 +330,8 @@ def _load_metrics_baseline_for_diff( state.baseline.verify_compatibility(runtime_python_tag=current_python_tag()) except BaselineValidationError as exc: state.status = coerce_metrics_baseline_status(exc.status) - console.print(ui.fmt_invalid_baseline(exc)) + if not suppress_invalid_message: + console.print(ui.fmt_invalid_baseline(exc)) if args.fail_on_new_metrics: state.failure_code = ExitCode.CONTRACT_ERROR else: @@ -457,3 +451,49 @@ def _enforce_metrics_gate_schema_requirements( "--update-metrics-baseline first." ) ) + + +def _probe_metrics_baseline_section(path: Path) -> _MetricsBaselineSectionProbe: + return probe_metrics_baseline_section(path) + + +def _resolve_clone_baseline_state( + *, + args: CLIArgsLike, + baseline_path: Path, + baseline_exists: bool, + analysis: AnalysisResult, + shared_baseline_payload: dict[str, object] | None = None, +) -> _CloneBaselineState: + return resolve_clone_baseline_state( + args=args, + baseline_path=baseline_path, + baseline_exists=baseline_exists, + func_groups=analysis.func_groups, + block_groups=analysis.block_groups, + codeclone_version=__version__, + console=require_status_console(cli_state.get_console()), + shared_baseline_payload=shared_baseline_payload, + ) + + +def _resolve_metrics_baseline_state( + *, + args: CLIArgsLike, + metrics_baseline_path: Path, + metrics_baseline_exists: bool, + clone_baseline_state: CloneBaselineState, + baseline_updated_path: Path | None, + analysis: AnalysisResult, + shared_baseline_payload: dict[str, object] | None = None, +) -> _MetricsBaselineState: + return resolve_metrics_baseline_state( + args=args, + metrics_baseline_path=metrics_baseline_path, + metrics_baseline_exists=metrics_baseline_exists, + clone_baseline_state=clone_baseline_state, + baseline_updated_path=baseline_updated_path, + project_metrics=analysis.project_metrics, + console=require_status_console(cli_state.get_console()), + shared_baseline_payload=shared_baseline_payload, + ) diff --git a/codeclone/surfaces/cli/changed_scope.py b/codeclone/surfaces/cli/changed_scope.py new file mode 100644 index 0000000..4a47d23 --- /dev/null +++ b/codeclone/surfaces/cli/changed_scope.py @@ -0,0 +1,221 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import subprocess +import sys +from collections.abc import Mapping, Sequence +from pathlib import Path + +from ... import ui_messages as ui +from ...contracts import ExitCode +from ...utils import coerce as _coerce +from ...utils.git_diff import validate_git_diff_ref +from . 
import state as cli_state +from .attrs import bool_attr, optional_text_attr, set_bool_attr +from .types import ChangedCloneGate, require_status_console + +_as_mapping = _coerce.as_mapping +_as_sequence = _coerce.as_sequence + +__all__ = ["ChangedCloneGate"] + + +def _validate_changed_scope_args(*, args: object) -> str | None: + console = require_status_console(cli_state.get_console()) + diff_against = optional_text_attr(args, "diff_against") + paths_from_git_diff = optional_text_attr(args, "paths_from_git_diff") + if diff_against and paths_from_git_diff: + console.print( + ui.fmt_contract_error( + "Use --diff-against or --paths-from-git-diff, not both." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + if paths_from_git_diff: + set_bool_attr(args, "changed_only", True) + return paths_from_git_diff + if diff_against and not bool_attr(args, "changed_only"): + console.print(ui.fmt_contract_error("--diff-against requires --changed-only.")) + sys.exit(ExitCode.CONTRACT_ERROR) + if bool_attr(args, "changed_only") and not diff_against: + console.print( + ui.fmt_contract_error( + "--changed-only requires --diff-against or --paths-from-git-diff." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + return diff_against + + +def _normalize_changed_paths( + *, + root_path: Path, + paths: Sequence[str], +) -> tuple[str, ...]: + console = require_status_console(cli_state.get_console()) + normalized: set[str] = set() + for raw_path in paths: + candidate = raw_path.strip() + if not candidate: + continue + candidate_path = Path(candidate) + try: + absolute_path = ( + candidate_path.resolve() + if candidate_path.is_absolute() + else (root_path / candidate_path).resolve() + ) + except OSError as exc: + console.print( + ui.fmt_contract_error( + f"Unable to resolve changed path '{candidate}': {exc}" + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + try: + relative_path = absolute_path.relative_to(root_path) + except ValueError: + console.print( + ui.fmt_contract_error( + f"Changed path '{candidate}' is outside the scan root." 
+ ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + cleaned = str(relative_path).replace("\\", "/").strip("/") + if cleaned: + normalized.add(cleaned) + return tuple(sorted(normalized)) + + +def _git_diff_changed_paths(*, root_path: Path, git_diff_ref: str) -> tuple[str, ...]: + console = require_status_console(cli_state.get_console()) + try: + validated_ref = validate_git_diff_ref(git_diff_ref) + except ValueError as exc: + console.print(ui.fmt_contract_error(str(exc))) + sys.exit(ExitCode.CONTRACT_ERROR) + try: + completed = subprocess.run( + ["git", "diff", "--name-only", validated_ref, "--"], + cwd=str(root_path), + check=True, + capture_output=True, + text=True, + timeout=30, + ) + except ( + FileNotFoundError, + subprocess.CalledProcessError, + subprocess.TimeoutExpired, + ) as exc: + console.print( + ui.fmt_contract_error( + "Unable to resolve changed files from git diff ref " + f"'{validated_ref}': {exc}" + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + lines = [line.strip() for line in completed.stdout.splitlines() if line.strip()] + return _normalize_changed_paths(root_path=root_path, paths=lines) + + +def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool: + return any( + relative_path == candidate or relative_path.startswith(candidate + "/") + for candidate in changed_paths + ) + + +def _flatten_report_findings( + report_document: Mapping[str, object], +) -> list[dict[str, object]]: + findings = _as_mapping(report_document.get("findings")) + groups = _as_mapping(findings.get("groups")) + clone_groups = _as_mapping(groups.get("clones")) + return [ + *[ + dict(_as_mapping(item)) + for item in _as_sequence(clone_groups.get("functions")) + ], + *[dict(_as_mapping(item)) for item in _as_sequence(clone_groups.get("blocks"))], + *[ + dict(_as_mapping(item)) + for item in _as_sequence(clone_groups.get("segments")) + ], + *[ + dict(_as_mapping(item)) + for item in _as_sequence( + _as_mapping(groups.get("structural")).get("groups") + ) + ], + *[ + dict(_as_mapping(item)) + for item in _as_sequence(_as_mapping(groups.get("dead_code")).get("groups")) + ], + *[ + dict(_as_mapping(item)) + for item in _as_sequence(_as_mapping(groups.get("design")).get("groups")) + ], + ] + + +def _finding_touches_changed_paths( + finding: Mapping[str, object], + *, + changed_paths: Sequence[str], +) -> bool: + for item in _as_sequence(finding.get("items")): + relative_path = str(_as_mapping(item).get("relative_path", "")).strip() + if relative_path and _path_matches(relative_path, changed_paths): + return True + return False + + +def _changed_clone_gate_from_report( + report_document: Mapping[str, object], + *, + changed_paths: Sequence[str], +) -> ChangedCloneGate: + findings = [ + finding + for finding in _flatten_report_findings(report_document) + if _finding_touches_changed_paths(finding, changed_paths=changed_paths) + ] + clone_findings = [ + finding + for finding in findings + if str(finding.get("family", "")).strip() == "clone" + and str(finding.get("category", "")).strip() in {"function", "block"} + ] + new_func = frozenset( + str(finding.get("id", "")) + for finding in clone_findings + if str(finding.get("category", "")).strip() == "function" + and str(finding.get("novelty", "")).strip() == "new" + ) + new_block = frozenset( + str(finding.get("id", "")) + for finding in clone_findings + if str(finding.get("category", "")).strip() == "block" + and str(finding.get("novelty", "")).strip() == "new" + ) + findings_new = sum( + 1 for finding in findings if str(finding.get("novelty", 
"")).strip() == "new" + ) + findings_known = sum( + 1 for finding in findings if str(finding.get("novelty", "")).strip() == "known" + ) + return ChangedCloneGate( + changed_paths=tuple(changed_paths), + new_func=new_func, + new_block=new_block, + total_clone_groups=len(clone_findings), + findings_total=len(findings), + findings_new=findings_new, + findings_known=findings_known, + ) diff --git a/codeclone/_cli_rich.py b/codeclone/surfaces/cli/console.py similarity index 57% rename from codeclone/_cli_rich.py rename to codeclone/surfaces/cli/console.py index 88f9d00..9a7ffb5 100644 --- a/codeclone/_cli_rich.py +++ b/codeclone/surfaces/cli/console.py @@ -6,10 +6,20 @@ from __future__ import annotations +import os import re +import sys +from collections.abc import Mapping, Sequence from contextlib import AbstractContextManager, nullcontext from functools import lru_cache -from typing import TYPE_CHECKING, Protocol +from pathlib import Path +from typing import TYPE_CHECKING + +from ... import __version__ +from ... import ui_messages as ui +from ...report.gates import reasons as gate_reasons +from . import state as cli_state +from .types import CLIArgsLike, PrinterLike, StatusConsole, require_status_console if TYPE_CHECKING: from rich.console import Console as RichConsole @@ -30,19 +40,6 @@ } _RICH_MARKUP_TAG_RE = re.compile(r"\[/?[a-zA-Z][a-zA-Z0-9_ .#:-]*]") -__all__ = [ - "PlainConsole", - "make_console", - "make_plain_console", - "print_banner", - "rich_console_symbols", - "rich_progress_symbols", -] - - -class _PrinterLike(Protocol): - def print(self, *objects: object, **kwargs: object) -> None: ... - class PlainConsole: """Lightweight console for quiet/no-progress mode.""" @@ -110,9 +107,9 @@ def make_plain_console() -> PlainConsole: return PlainConsole() -def print_banner( +def _render_banner( *, - console: _PrinterLike, + console: PrinterLike, banner_title: str, project_name: str | None = None, root_display: str | None = None, @@ -129,3 +126,78 @@ def print_banner( ) if root_display is not None: console.print(f" [dim]Root:[/dim] [dim]{root_display}[/dim]") + + +def _console() -> StatusConsole: + return require_status_console(cli_state.get_console()) + + +def _rich_progress_symbols() -> tuple[ + type[RichProgress], + type[RichSpinnerColumn], + type[RichTextColumn], + type[RichBarColumn], + type[RichTimeElapsedColumn], +]: + progress, spinner, text, bar, elapsed = rich_progress_symbols() + return (progress, spinner, text, bar, elapsed) + + +def _make_console(*, no_color: bool) -> object: + return make_console(no_color=no_color, width=ui.CLI_LAYOUT_MAX_WIDTH) + + +def _make_plain_console() -> PlainConsole: + return make_plain_console() + + +def _parse_metric_reason_entry(reason: str) -> tuple[str, str]: + return gate_reasons.parse_metric_reason_entry(reason) + + +def _print_gating_failure_block( + *, + code: str, + entries: Sequence[tuple[str, object]], + args: CLIArgsLike, +) -> None: + gate_reasons.print_gating_failure_block( + console=_console(), + code=code, + entries=list(entries), + args=args, + ) + + +def _print_verbose_clone_hashes( + console: PrinterLike, + *, + label: str, + clone_hashes: set[str], +) -> None: + if not clone_hashes: + return + console.print(f"\n {label}:") + for clone_hash in sorted(clone_hashes): + console.print(f" - {clone_hash}") + + +def print_banner(*, root: Path | None = None) -> None: + _render_banner( + console=_console(), + banner_title=ui.banner_title(__version__), + project_name=(root.name if root is not None else None), + root_display=(str(root) if 
root is not None else None), + ) + + +def _is_debug_enabled( + *, + argv: Sequence[str] | None = None, + environ: Mapping[str, str] | None = None, +) -> bool: + args = list(sys.argv[1:] if argv is None else argv) + debug_from_flag = any(arg == "--debug" for arg in args) + env = os.environ if environ is None else environ + debug_from_env = env.get("CODECLONE_DEBUG") == "1" + return debug_from_flag or debug_from_env diff --git a/codeclone/surfaces/cli/execution.py b/codeclone/surfaces/cli/execution.py new file mode 100644 index 0000000..f4445ba --- /dev/null +++ b/codeclone/surfaces/cli/execution.py @@ -0,0 +1,351 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import sys +import time +from collections.abc import Callable +from dataclasses import replace +from pathlib import Path + +from rich.console import Console as RichConsole +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TextColumn, + TimeElapsedColumn, +) + +from ... import ui_messages as ui +from ...cache.store import Cache +from ...contracts import DEFAULT_HTML_REPORT_PATH, ExitCode +from ...contracts.errors import CacheError +from ...core._types import AnalysisResult, BootstrapResult, DiscoveryResult +from ...core._types import ProcessingResult as PipelineProcessingResult +from ...core.reporting import GatingResult +from ...models import MetricsDiff +from . import state as cli_state +from .attrs import bool_attr +from .console import PlainConsole +from .types import require_status_console + + +def run_analysis_stages( + *, + args: object, + boot: BootstrapResult, + cache: Cache, + discover_fn: Callable[..., DiscoveryResult], + process_fn: Callable[..., PipelineProcessingResult], + analyze_fn: Callable[..., AnalysisResult], + print_failed_files_fn: Callable[[tuple[str, ...]], None], + cache_update_segment_projection_fn: Callable[[Cache, AnalysisResult], None], + rich_progress_symbols_fn: Callable[ + [], + tuple[ + type[Progress], + type[SpinnerColumn], + type[TextColumn], + type[BarColumn], + type[TimeElapsedColumn], + ], + ], +) -> tuple[DiscoveryResult, PipelineProcessingResult, AnalysisResult]: + def _require_rich_console(value: object) -> RichConsole: + if isinstance(value, PlainConsole): + raise RuntimeError("Rich console is required when progress UI is enabled.") + if not isinstance(value, RichConsole): + raise RuntimeError("Rich console is required when progress UI is enabled.") + return value + + printer = require_status_console(cli_state.get_console()) + use_status = not bool_attr(args, "quiet") and not bool_attr(args, "no_progress") + + try: + if use_status: + with printer.status(ui.STATUS_DISCOVERING, spinner="dots"): + discovery_result = discover_fn(boot=boot, cache=cache) + else: + discovery_result = discover_fn(boot=boot, cache=cache) + except OSError as exc: + printer.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=exc))) + sys.exit(ExitCode.CONTRACT_ERROR) + + for warning in discovery_result.skipped_warnings: + printer.print(ui.fmt_cli_runtime_warning(warning)) + + total_files = len(discovery_result.files_to_process) + if ( + total_files > 0 + and not bool_attr(args, "quiet") + and bool_attr(args, "no_progress") + ): + printer.print(ui.fmt_processing_changed(total_files)) + + if total_files > 0 and not 
bool_attr(args, "no_progress"): + ( + progress_cls, + spinner_column_cls, + text_column_cls, + bar_column_cls, + time_elapsed_column_cls, + ) = rich_progress_symbols_fn() + + with progress_cls( + spinner_column_cls(), + text_column_cls("[progress.description]{task.description}"), + bar_column_cls(), + text_column_cls("[progress.percentage]{task.percentage:>3.0f}%"), + time_elapsed_column_cls(), + console=_require_rich_console(cli_state.get_console()), + ) as progress_ui: + task_id = progress_ui.add_task( + f"Analyzing {total_files} files...", + total=total_files, + ) + processing_result = process_fn( + boot=boot, + discovery=discovery_result, + cache=cache, + on_advance=lambda: progress_ui.advance(task_id), + on_worker_error=lambda reason: printer.print( + ui.fmt_cli_runtime_warning(ui.fmt_worker_failed(reason)) + ), + on_parallel_fallback=lambda exc: printer.print( + ui.fmt_cli_runtime_warning(ui.fmt_parallel_fallback(exc)) + ), + ) + else: + processing_result = process_fn( + boot=boot, + discovery=discovery_result, + cache=cache, + on_worker_error=( + ( + lambda reason: printer.print( + ui.fmt_cli_runtime_warning(ui.fmt_batch_item_failed(reason)) + ) + ) + if bool_attr(args, "no_progress") + else ( + lambda reason: printer.print( + ui.fmt_cli_runtime_warning(ui.fmt_worker_failed(reason)) + ) + ) + ), + on_parallel_fallback=lambda exc: printer.print( + ui.fmt_cli_runtime_warning(ui.fmt_parallel_fallback(exc)) + ), + ) + + print_failed_files_fn(tuple(processing_result.failed_files)) + if not processing_result.failed_files and processing_result.source_read_failures: + print_failed_files_fn(tuple(processing_result.source_read_failures)) + + if use_status: + with printer.status(ui.STATUS_GROUPING, spinner="dots"): + analysis_result = analyze_fn( + boot=boot, + discovery=discovery_result, + processing=processing_result, + ) + cache_update_segment_projection_fn(cache, analysis_result) + try: + cache.save() + except CacheError as exc: + printer.print(ui.fmt_cli_runtime_warning(ui.fmt_cache_save_failed(exc))) + else: + analysis_result = analyze_fn( + boot=boot, + discovery=discovery_result, + processing=processing_result, + ) + cache_update_segment_projection_fn(cache, analysis_result) + try: + cache.save() + except CacheError as exc: + printer.print(ui.fmt_cli_runtime_warning(ui.fmt_cache_save_failed(exc))) + + coverage_join = getattr(analysis_result, "coverage_join", None) + if ( + coverage_join is not None + and coverage_join.status != "ok" + and coverage_join.invalid_reason + ): + printer.print( + ui.fmt_cli_runtime_warning( + ui.fmt_coverage_join_ignored(coverage_join.invalid_reason) + ) + ) + + return discovery_result, processing_result, analysis_result + + +def enforce_gating( + *, + args: object, + boot: BootstrapResult, + analysis: AnalysisResult, + processing: PipelineProcessingResult, + source_read_contract_failure: bool, + baseline_failure_code: ExitCode | None, + metrics_baseline_failure_code: ExitCode | None, + new_func: set[str], + new_block: set[str], + metrics_diff: MetricsDiff | None, + html_report_path: str | None, + gate_fn: Callable[..., GatingResult], + parse_metric_reason_entry_fn: Callable[[str], tuple[str, str]], + print_gating_failure_block_fn: Callable[..., None], + print_verbose_clone_hashes_fn: Callable[..., None], + clone_threshold_total: int | None = None, +) -> None: + printer = require_status_console(cli_state.get_console()) + + if source_read_contract_failure: + printer.print( + ui.fmt_contract_error( + ui.fmt_unreadable_source_in_gating( + 
count=len(processing.source_read_failures) + ) + ) + ) + for failure in processing.source_read_failures[:10]: + printer.print(f" • {failure}") + if len(processing.source_read_failures) > 10: + printer.print(f" ... and {len(processing.source_read_failures) - 10} more") + sys.exit(ExitCode.CONTRACT_ERROR) + + if baseline_failure_code is not None: + printer.print( + ui.fmt_contract_error( + ui.fmt_baseline_gating_requires_trusted(ci=bool_attr(args, "ci")) + ) + ) + sys.exit(baseline_failure_code) + + if metrics_baseline_failure_code is not None: + printer.print( + ui.fmt_contract_error( + "Metrics baseline is untrusted or missing for requested metrics gating." + ) + ) + sys.exit(metrics_baseline_failure_code) + + if bool_attr(args, "fail_on_untested_hotspots"): + if analysis.coverage_join is None: + printer.print( + ui.fmt_contract_error( + "--fail-on-untested-hotspots requires --coverage." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + if analysis.coverage_join.status != "ok": + detail = analysis.coverage_join.invalid_reason or "invalid coverage input" + printer.print( + ui.fmt_contract_error( + "Coverage gating requires a valid Cobertura XML input.\n" + f"Reason: {detail}" + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + gating_analysis = analysis + if clone_threshold_total is not None: + preserved_block_count = min( + max(analysis.block_clones_count, 0), + max(clone_threshold_total, 0), + ) + gating_analysis = replace( + analysis, + func_clones_count=max(clone_threshold_total - preserved_block_count, 0), + block_clones_count=preserved_block_count, + ) + + gate_result = gate_fn( + boot=boot, + analysis=gating_analysis, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + ) + + metric_reasons = [ + reason[len("metric:") :] + for reason in gate_result.reasons + if reason.startswith("metric:") + ] + if metric_reasons: + print_gating_failure_block_fn( + code="metrics", + entries=[parse_metric_reason_entry_fn(reason) for reason in metric_reasons], + args=args, + ) + sys.exit(ExitCode.GATING_FAILURE) + + if "clone:new" in gate_result.reasons: + default_report = Path(DEFAULT_HTML_REPORT_PATH) + resolved_html_report_path = html_report_path + if resolved_html_report_path is None and default_report.exists(): + resolved_html_report_path = str(default_report) + + clone_entries: list[tuple[str, object]] = [ + ("new_function_clone_groups", len(new_func)), + ("new_block_clone_groups", len(new_block)), + ] + if resolved_html_report_path: + clone_entries.append(("report", resolved_html_report_path)) + clone_entries.append(("accept", "codeclone . 
--update-baseline")) + print_gating_failure_block_fn( + code="new-clones", + entries=clone_entries, + args=args, + ) + + if bool_attr(args, "verbose"): + print_verbose_clone_hashes_fn( + printer, + label="Function clone hashes", + clone_hashes=new_func, + ) + print_verbose_clone_hashes_fn( + printer, + label="Block clone hashes", + clone_hashes=new_block, + ) + + sys.exit(ExitCode.GATING_FAILURE) + + threshold_reason = next( + ( + reason + for reason in gate_result.reasons + if reason.startswith("clone:threshold:") + ), + None, + ) + if threshold_reason is not None: + _, _, total_raw, threshold_raw = threshold_reason.split(":", maxsplit=3) + print_gating_failure_block_fn( + code="threshold", + entries=( + ("clone_groups_total", int(total_raw)), + ("clone_groups_limit", int(threshold_raw)), + ), + args=args, + ) + sys.exit(ExitCode.GATING_FAILURE) + + +def print_pipeline_done_if_needed(*, args: object, run_started_at: float) -> None: + if bool_attr(args, "quiet"): + return + elapsed = time.monotonic() - run_started_at + printer = require_status_console(cli_state.get_console()) + printer.print() + printer.print(ui.fmt_pipeline_done(elapsed)) diff --git a/codeclone/surfaces/cli/post_run.py b/codeclone/surfaces/cli/post_run.py new file mode 100644 index 0000000..1662ce3 --- /dev/null +++ b/codeclone/surfaces/cli/post_run.py @@ -0,0 +1,140 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Callable, Collection, Mapping +from dataclasses import dataclass +from pathlib import Path + +from ... 
import ui_messages as ui +from ...baseline import Baseline +from ...core._types import AnalysisResult +from ...models import MetricsDiff +from .baseline_state import CloneBaselineState, MetricsBaselineState +from .changed_scope import ChangedCloneGate +from .summary import ChangedScopeSnapshot +from .types import CLIArgsLike, PrinterLike + + +@dataclass(frozen=True, slots=True) +class DiffContext: + new_func: set[str] + new_block: set[str] + new_clones_count: int + metrics_diff: MetricsDiff | None + coverage_adoption_diff_available: bool + api_surface_diff_available: bool + + +def build_diff_context( + *, + analysis: AnalysisResult, + baseline_path: Path, + baseline_state: CloneBaselineState, + metrics_baseline_state: MetricsBaselineState, +) -> DiffContext: + baseline_for_diff = ( + baseline_state.baseline + if baseline_state.trusted_for_diff + else Baseline(baseline_path) + ) + raw_new_func, raw_new_block = baseline_for_diff.diff( + analysis.func_groups, + analysis.block_groups, + ) + metrics_diff = None + if analysis.project_metrics is not None and metrics_baseline_state.trusted_for_diff: + metrics_diff = metrics_baseline_state.baseline.diff(analysis.project_metrics) + return DiffContext( + new_func=set(raw_new_func), + new_block=set(raw_new_block), + new_clones_count=len(raw_new_func) + len(raw_new_block), + metrics_diff=metrics_diff, + coverage_adoption_diff_available=bool( + metrics_baseline_state.trusted_for_diff + and getattr( + metrics_baseline_state.baseline, + "has_coverage_adoption_snapshot", + False, + ) + ), + api_surface_diff_available=bool( + metrics_baseline_state.trusted_for_diff + and getattr(metrics_baseline_state.baseline, "api_surface_snapshot", None) + is not None + ), + ) + + +def print_metrics_if_available( + *, + args: CLIArgsLike, + analysis: AnalysisResult, + metrics_diff: MetricsDiff | None, + api_surface_diff_available: bool, + console: PrinterLike, + build_metrics_snapshot_fn: Callable[..., object], + print_metrics_fn: Callable[..., None], +) -> None: + if analysis.project_metrics is None: + return + print_metrics_fn( + console=console, + quiet=args.quiet, + metrics=build_metrics_snapshot_fn( + analysis_result=analysis, + metrics_diff=metrics_diff, + api_surface_diff_available=api_surface_diff_available, + ), + ) + + +def resolve_changed_clone_gate( + *, + args: CLIArgsLike, + report_document: Mapping[str, object] | None, + changed_paths: Collection[str], + changed_clone_gate_from_report_fn: Callable[..., ChangedCloneGate], +) -> ChangedCloneGate | None: + if not args.changed_only or report_document is None: + return None + return changed_clone_gate_from_report_fn( + report_document, + changed_paths=tuple(changed_paths), + ) + + +def maybe_print_changed_scope_snapshot( + *, + args: CLIArgsLike, + changed_clone_gate: ChangedCloneGate | None, + console: PrinterLike, + print_changed_scope_fn: Callable[..., None], +) -> None: + if changed_clone_gate is None: + return + print_changed_scope_fn( + console=console, + quiet=args.quiet, + changed_scope=ChangedScopeSnapshot( + paths_count=len(changed_clone_gate.changed_paths), + findings_total=changed_clone_gate.findings_total, + findings_new=changed_clone_gate.findings_new, + findings_known=changed_clone_gate.findings_known, + ), + ) + + +def warn_new_clones_without_fail( + *, + args: CLIArgsLike, + notice_new_clones_count: int, + console: PrinterLike, +) -> None: + if args.update_baseline or args.fail_on_new or notice_new_clones_count <= 0: + return + console.print(ui.WARN_NEW_CLONES_WITHOUT_FAIL) diff --git 
a/codeclone/surfaces/cli/report_meta.py b/codeclone/surfaces/cli/report_meta.py new file mode 100644 index 0000000..e2a418f --- /dev/null +++ b/codeclone/surfaces/cli/report_meta.py @@ -0,0 +1,75 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ...cache.versioning import CacheStatus +from ...contracts.schemas import ReportMeta +from ...report import meta as _report_meta +from ...report.meta import build_report_meta as _build_report_meta +from .types import CLIArgsLike + +if TYPE_CHECKING: + from pathlib import Path + + from ...core._types import AnalysisResult + from ...core._types import ProcessingResult as PipelineProcessingResult + from .baseline_state import CloneBaselineState, MetricsBaselineState + + +_current_report_timestamp_utc = _report_meta.current_report_timestamp_utc + + +def build_cli_report_meta( + *, + codeclone_version: str, + scan_root: Path, + baseline_path: Path, + baseline_state: CloneBaselineState, + cache_path: Path, + cache_status: CacheStatus, + cache_schema_version: str | None, + processing_result: PipelineProcessingResult, + metrics_baseline_path: Path, + metrics_baseline_state: MetricsBaselineState, + analysis_result: AnalysisResult, + args: CLIArgsLike, + metrics_computed: tuple[str, ...], + analysis_started_at_utc: str | None, + report_generated_at_utc: str, +) -> ReportMeta: + project_metrics = analysis_result.project_metrics + return _build_report_meta( + codeclone_version=codeclone_version, + scan_root=scan_root, + baseline_path=baseline_path, + baseline=baseline_state.baseline, + baseline_loaded=baseline_state.loaded, + baseline_status=baseline_state.status.value, + cache_path=cache_path, + cache_used=cache_status == CacheStatus.OK, + cache_status=cache_status.value, + cache_schema_version=cache_schema_version, + files_skipped_source_io=len(processing_result.source_read_failures), + metrics_baseline_path=metrics_baseline_path, + metrics_baseline=metrics_baseline_state.baseline, + metrics_baseline_loaded=metrics_baseline_state.loaded, + metrics_baseline_status=metrics_baseline_state.status.value, + health_score=(project_metrics.health.total if project_metrics else None), + health_grade=(project_metrics.health.grade if project_metrics else None), + analysis_mode=("clones_only" if args.skip_metrics else "full"), + metrics_computed=metrics_computed, + min_loc=args.min_loc, + min_stmt=args.min_stmt, + block_min_loc=args.block_min_loc, + block_min_stmt=args.block_min_stmt, + segment_min_loc=args.segment_min_loc, + segment_min_stmt=args.segment_min_stmt, + analysis_started_at_utc=analysis_started_at_utc, + report_generated_at_utc=report_generated_at_utc, + ) diff --git a/codeclone/surfaces/cli/reports_output.py b/codeclone/surfaces/cli/reports_output.py new file mode 100644 index 0000000..0cc417b --- /dev/null +++ b/codeclone/surfaces/cli/reports_output.py @@ -0,0 +1,326 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import sys +import webbrowser +from collections.abc import Callable, Mapping, Sequence +from pathlib import Path +from typing import Protocol + +from ... import ui_messages as ui +from ...contracts import ExitCode +from . import state as cli_state +from .attrs import bool_attr, optional_text_attr +from .types import ( + CLIArgsLike, + OutputPaths, + PrinterLike, + ReportArtifacts, + ReportPathOrigin, + require_status_console, +) + + +class _QuietArgs(Protocol): + quiet: bool + + +def _path_attr(obj: object, name: str) -> Path | None: + value = getattr(obj, name, None) + return value if isinstance(value, Path) else None + + +def _text_attr(obj: object, name: str) -> str | None: + value = getattr(obj, name, None) + return value if isinstance(value, str) else None + + +def _write_report_output( + *, + out: Path, + content: str, + label: str, + console: PrinterLike, +) -> None: + try: + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(content, "utf-8") + except OSError as exc: + console.print( + ui.fmt_contract_error( + ui.fmt_report_write_failed(label=label, path=out, error=exc) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + +def _open_html_report_in_browser(*, path: Path) -> None: + if not webbrowser.open_new_tab(path.as_uri()): + raise OSError("no browser handler available") + + +def write_report_outputs( + *, + args: _QuietArgs, + output_paths: OutputPaths, + report_artifacts: ReportArtifacts, + console: PrinterLike, + open_html_report: bool = False, +) -> str | None: + html_report_path: str | None = None + saved_reports: list[tuple[str, Path]] = [] + html_path = _path_attr(output_paths, "html") + json_path = _path_attr(output_paths, "json") + md_path = _path_attr(output_paths, "md") + sarif_path = _path_attr(output_paths, "sarif") + text_path = _path_attr(output_paths, "text") + html_report = _text_attr(report_artifacts, "html") + json_report = _text_attr(report_artifacts, "json") + md_report = _text_attr(report_artifacts, "md") + sarif_report = _text_attr(report_artifacts, "sarif") + text_report = _text_attr(report_artifacts, "text") + + if html_path and html_report is not None: + out = html_path + _write_report_output( + out=out, + content=html_report, + label="HTML", + console=console, + ) + html_report_path = str(out) + saved_reports.append(("HTML", out)) + + if json_path and json_report is not None: + out = json_path + _write_report_output( + out=out, + content=json_report, + label="JSON", + console=console, + ) + saved_reports.append(("JSON", out)) + + if md_path and md_report is not None: + out = md_path + _write_report_output( + out=out, + content=md_report, + label="Markdown", + console=console, + ) + saved_reports.append(("Markdown", out)) + + if sarif_path and sarif_report is not None: + out = sarif_path + _write_report_output( + out=out, + content=sarif_report, + label="SARIF", + console=console, + ) + saved_reports.append(("SARIF", out)) + + if text_path and text_report is not None: + out = text_path + _write_report_output( + out=out, + content=text_report, + label="text", + console=console, + ) + saved_reports.append(("Text", out)) + + if saved_reports and not args.quiet: + cwd = Path.cwd() + console.print() + for label, path in saved_reports: + try: + display = path.relative_to(cwd) + except ValueError: + display = path + console.print(f" [bold]{label} report saved:[/bold] [dim]{display}[/dim]") + + if open_html_report and html_path is not None: 
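+        # Best-effort open: webbrowser.open_new_tab() returning False is
+        # normalized to OSError in _open_html_report_in_browser, and any
+        # failure here only prints a notice; the saved report path and the
+        # process exit code are unaffected.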
+ try: + _open_html_report_in_browser(path=html_path) + except Exception as exc: + console.print(ui.fmt_html_report_open_failed(path=html_path, error=exc)) + + return html_report_path + + +def _validate_output_path( + path: str, + *, + expected_suffix: str, + label: str, + console: PrinterLike, + invalid_message: Callable[..., str], + invalid_path_message: Callable[..., str], +) -> Path: + out = Path(path).expanduser() + if out.suffix.lower() != expected_suffix: + console.print( + ui.fmt_contract_error( + invalid_message(label=label, path=out, expected_suffix=expected_suffix) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + try: + return out.resolve() + except OSError as exc: + console.print( + ui.fmt_contract_error( + invalid_path_message(label=label, path=out, error=exc) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + +def _report_path_origins(argv: Sequence[str]) -> dict[str, ReportPathOrigin | None]: + origins: dict[str, ReportPathOrigin | None] = { + "html": None, + "json": None, + "md": None, + "sarif": None, + "text": None, + } + flag_to_field = { + "--html": "html", + "--json": "json", + "--md": "md", + "--sarif": "sarif", + "--text": "text", + } + index = 0 + while index < len(argv): + token = argv[index] + if token == "--": + break + if "=" in token: + flag, _value = token.split("=", maxsplit=1) + field_name = flag_to_field.get(flag) + if field_name is not None: + origins[field_name] = "explicit" + index += 1 + continue + field_name = flag_to_field.get(token) + if field_name is None: + index += 1 + continue + next_token = argv[index + 1] if index + 1 < len(argv) else None + if next_token is None or next_token.startswith("-"): + origins[field_name] = "default" + index += 1 + continue + origins[field_name] = "explicit" + index += 2 + return origins + + +def _report_path_timestamp_slug(report_generated_at_utc: str) -> str: + return report_generated_at_utc.replace("-", "").replace(":", "") + + +def _timestamped_report_path(path: Path, *, report_generated_at_utc: str) -> Path: + suffix = path.suffix + stem = path.name[: -len(suffix)] if suffix else path.name + return path.with_name( + f"{stem}-{_report_path_timestamp_slug(report_generated_at_utc)}{suffix}" + ) + + +def _resolve_output_paths( + args: CLIArgsLike, + *, + report_path_origins: Mapping[str, ReportPathOrigin | None], + report_generated_at_utc: str, +) -> OutputPaths: + printer = require_status_console(cli_state.get_console()) + resolved: dict[str, Path | None] = { + "html": None, + "json": None, + "md": None, + "sarif": None, + "text": None, + } + output_specs = ( + ("html", "html_out", ".html", "HTML"), + ("json", "json_out", ".json", "JSON"), + ("md", "md_out", ".md", "Markdown"), + ("sarif", "sarif_out", ".sarif", "SARIF"), + ("text", "text_out", ".txt", "text"), + ) + + for field_name, arg_name, expected_suffix, label in output_specs: + raw_value = optional_text_attr(args, arg_name) + if not raw_value: + continue + path = _validate_output_path( + raw_value, + expected_suffix=expected_suffix, + label=label, + console=printer, + invalid_message=ui.fmt_invalid_output_extension, + invalid_path_message=ui.fmt_invalid_output_path, + ) + if ( + args.timestamped_report_paths + and report_path_origins.get(field_name) == "default" + ): + path = _timestamped_report_path( + path, + report_generated_at_utc=report_generated_at_utc, + ) + resolved[field_name] = path + + return OutputPaths( + html=resolved["html"], + json=resolved["json"], + text=resolved["text"], + md=resolved["md"], + sarif=resolved["sarif"], + ) + + +def 
_validate_report_ui_flags(*, args: object, output_paths: OutputPaths) -> None: + console = require_status_console(cli_state.get_console()) + if bool_attr(args, "open_html_report") and output_paths.html is None: + console.print(ui.fmt_contract_error(ui.ERR_OPEN_HTML_REPORT_REQUIRES_HTML)) + sys.exit(ExitCode.CONTRACT_ERROR) + + if bool_attr(args, "timestamped_report_paths") and not any( + ( + output_paths.html, + output_paths.json, + output_paths.md, + output_paths.sarif, + output_paths.text, + ) + ): + console.print( + ui.fmt_contract_error(ui.ERR_TIMESTAMPED_REPORT_PATHS_REQUIRES_REPORT) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + +def _write_report_outputs( + *, + args: CLIArgsLike, + output_paths: OutputPaths, + report_artifacts: ReportArtifacts, + open_html_report: bool = False, +) -> str | None: + return write_report_outputs( + args=args, + output_paths=output_paths, + report_artifacts=report_artifacts, + console=require_status_console(cli_state.get_console()), + open_html_report=open_html_report, + ) diff --git a/codeclone/surfaces/cli/runtime.py b/codeclone/surfaces/cli/runtime.py new file mode 100644 index 0000000..8f91fb4 --- /dev/null +++ b/codeclone/surfaces/cli/runtime.py @@ -0,0 +1,259 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import sys +from pathlib import Path + +from ... import ui_messages as ui +from ...cache.store import Cache, resolve_cache_status +from ...cache.versioning import CacheStatus +from ...contracts import ExitCode +from . import state as cli_state +from .attrs import bool_attr, int_attr, optional_text_attr, set_bool_attr +from .types import PrinterLike, require_status_console + + +def validate_numeric_args(args: object) -> bool: + return bool( + not ( + int_attr(args, "max_baseline_size_mb") < 0 + or int_attr(args, "max_cache_size_mb") < 0 + or int_attr(args, "fail_threshold", -1) < -1 + or int_attr(args, "fail_complexity", -1) < -1 + or int_attr(args, "fail_coupling", -1) < -1 + or int_attr(args, "fail_cohesion", -1) < -1 + or int_attr(args, "fail_health", -1) < -1 + or int_attr(args, "min_typing_coverage", -1) < -1 + or int_attr(args, "min_typing_coverage", -1) > 100 + or int_attr(args, "min_docstring_coverage", -1) < -1 + or int_attr(args, "min_docstring_coverage", -1) > 100 + or int_attr(args, "coverage_min") < 0 + or int_attr(args, "coverage_min") > 100 + ) + ) + + +def _metrics_flags_requested(args: object) -> bool: + return bool( + int_attr(args, "fail_complexity", -1) >= 0 + or int_attr(args, "fail_coupling", -1) >= 0 + or int_attr(args, "fail_cohesion", -1) >= 0 + or bool_attr(args, "fail_cycles") + or bool_attr(args, "fail_dead_code") + or int_attr(args, "fail_health", -1) >= 0 + or bool_attr(args, "fail_on_new_metrics") + or bool_attr(args, "fail_on_typing_regression") + or bool_attr(args, "fail_on_docstring_regression") + or bool_attr(args, "fail_on_api_break") + or bool_attr(args, "fail_on_untested_hotspots") + or int_attr(args, "min_typing_coverage", -1) >= 0 + or int_attr(args, "min_docstring_coverage", -1) >= 0 + or bool_attr(args, "api_surface") + or bool_attr(args, "update_metrics_baseline") + or bool(optional_text_attr(args, "coverage_xml")) + ) + + +def configure_metrics_mode( + *, + args: object, + metrics_baseline_exists: bool, + console: PrinterLike, +) -> None: 
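+    # Mode resolution, in order: --skip-metrics conflicts with any metrics
+    # gating/update flag; with no such flags and no metrics baseline on disk,
+    # metrics are skipped implicitly; skipping metrics also disables dead-code
+    # and dependency analysis, while individual fail-* flags re-enable the
+    # analyses they need (e.g. --fail-cycles turns dependency analysis back on).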
+ metrics_flags_requested = _metrics_flags_requested(args) + + if bool_attr(args, "skip_metrics") and metrics_flags_requested: + console.print( + ui.fmt_contract_error( + "--skip-metrics cannot be used together with metrics gating/update " + "flags." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + if ( + not bool_attr(args, "skip_metrics") + and not metrics_flags_requested + and not metrics_baseline_exists + ): + set_bool_attr(args, "skip_metrics", True) + + if bool_attr(args, "skip_metrics"): + set_bool_attr(args, "skip_dead_code", True) + set_bool_attr(args, "skip_dependencies", True) + return + + if bool_attr(args, "fail_dead_code"): + set_bool_attr(args, "skip_dead_code", False) + if bool_attr(args, "fail_cycles"): + set_bool_attr(args, "skip_dependencies", False) + if bool_attr(args, "fail_on_api_break"): + set_bool_attr(args, "api_surface", True) + + +def resolve_cache_path( + *, + root_path: Path, + args: object, + from_args: bool, + legacy_cache_path: Path, + console: PrinterLike, +) -> Path: + cache_path_arg = optional_text_attr(args, "cache_path") + if from_args and cache_path_arg: + return Path(cache_path_arg).expanduser() + + cache_path = root_path / ".cache" / "codeclone" / "cache.json" + if legacy_cache_path.exists(): + try: + legacy_resolved = legacy_cache_path.resolve() + except OSError: + legacy_resolved = legacy_cache_path + if legacy_resolved != cache_path: + console.print( + ui.fmt_legacy_cache_warning( + legacy_path=legacy_resolved, + new_path=cache_path, + ) + ) + return cache_path + + +def metrics_computed(args: object) -> tuple[str, ...]: + if bool_attr(args, "skip_metrics"): + return () + + computed = ["complexity", "coupling", "cohesion", "health"] + if not bool_attr(args, "skip_dependencies"): + computed.append("dependencies") + if not bool_attr(args, "skip_dead_code"): + computed.append("dead_code") + computed.append("coverage_adoption") + if bool_attr(args, "api_surface"): + computed.append("api_surface") + if bool(optional_text_attr(args, "coverage_xml")): + computed.append("coverage_join") + return tuple(computed) + + +def resolve_report_cache_path(cache_path: Path) -> Path: + try: + return cache_path.resolve() + except OSError: + return cache_path + + +def prepare_metrics_mode_and_ui( + *, + args: object, + root_path: Path, + baseline_path: Path, + baseline_exists: bool, + metrics_baseline_path: Path, + metrics_baseline_exists: bool, + configure_metrics_mode: object, + print_banner: object, +) -> None: + if ( + bool_attr(args, "update_baseline") + and not bool_attr(args, "skip_metrics") + and not bool_attr(args, "update_metrics_baseline") + ): + set_bool_attr(args, "update_metrics_baseline", True) + if callable(configure_metrics_mode): + configure_metrics_mode( + args=args, + metrics_baseline_exists=metrics_baseline_exists, + ) + if ( + bool_attr(args, "update_metrics_baseline") + and metrics_baseline_path == baseline_path + and not baseline_exists + and not bool_attr(args, "update_baseline") + ): + set_bool_attr(args, "update_baseline", True) + if bool_attr(args, "quiet"): + set_bool_attr(args, "no_progress", True) + return + if callable(print_banner): + print_banner(root=root_path) + + +def gating_mode_enabled(args: object) -> bool: + return bool( + bool_attr(args, "fail_on_new") + or int_attr(args, "fail_threshold", -1) >= 0 + or int_attr(args, "fail_complexity", -1) >= 0 + or int_attr(args, "fail_coupling", -1) >= 0 + or int_attr(args, "fail_cohesion", -1) >= 0 + or bool_attr(args, "fail_cycles") + or bool_attr(args, "fail_dead_code") + or int_attr(args, 
"fail_health", -1) >= 0 + or bool_attr(args, "fail_on_new_metrics") + or bool_attr(args, "fail_on_typing_regression") + or bool_attr(args, "fail_on_docstring_regression") + or bool_attr(args, "fail_on_api_break") + or int_attr(args, "min_typing_coverage", -1) >= 0 + or int_attr(args, "min_docstring_coverage", -1) >= 0 + ) + + +def print_failed_files(*, failed_files: tuple[str, ...], console: PrinterLike) -> None: + if not failed_files: + return + console.print(ui.fmt_failed_files_header(len(failed_files))) + for failure in failed_files[:10]: + console.print(f" • {failure}") + if len(failed_files) > 10: + console.print(f" ... and {len(failed_files) - 10} more") + + +def _resolve_cache_path( + *, + root_path: Path, + args: object, + from_args: bool, +) -> Path: + return resolve_cache_path( + root_path=root_path, + args=args, + from_args=from_args, + legacy_cache_path=cli_state.LEGACY_CACHE_PATH, + console=require_status_console(cli_state.get_console()), + ) + + +def _validate_numeric_args(args: object) -> bool: + return validate_numeric_args(args) + + +def _configure_metrics_mode( + *, + args: object, + metrics_baseline_exists: bool, +) -> None: + configure_metrics_mode( + args=args, + metrics_baseline_exists=metrics_baseline_exists, + console=require_status_console(cli_state.get_console()), + ) + + +def _print_failed_files(failed_files: tuple[str, ...] | list[str]) -> None: + print_failed_files( + failed_files=tuple(failed_files), + console=require_status_console(cli_state.get_console()), + ) + + +def _metrics_computed(args: object) -> tuple[str, ...]: + return metrics_computed(args) + + +def _resolve_cache_status(cache: Cache) -> tuple[CacheStatus, str | None]: + return resolve_cache_status(cache) diff --git a/codeclone/surfaces/cli/startup.py b/codeclone/surfaces/cli/startup.py new file mode 100644 index 0000000..b5c2b41 --- /dev/null +++ b/codeclone/surfaces/cli/startup.py @@ -0,0 +1,189 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import os +from collections.abc import Callable +from dataclasses import dataclass +from pathlib import Path +from typing import NoReturn + +from ... 
import ui_messages as ui +from ...config.pyproject_loader import ConfigValidationError +from ...contracts import DEFAULT_ROOT, ExitCode +from .attrs import text_attr +from .baseline_state import MetricsBaselineSectionProbe +from .types import CLIArgsLike, ParserWithDefaults, StatusConsole + + +@dataclass(frozen=True, slots=True) +class ResolvedBaselineInputs: + baseline_path: Path + baseline_exists: bool + metrics_baseline_path: Path + metrics_baseline_exists: bool + shared_baseline_payload: dict[str, object] | None + + +def resolve_runtime_path_arg( + *, + root_path: Path, + raw_path: str, + from_cli: bool, +) -> Path: + candidate_path = Path(raw_path).expanduser() + if from_cli or candidate_path.is_absolute(): + return candidate_path.resolve() + return (root_path / candidate_path).resolve() + + +def exit_contract_error( + message: str, + *, + printer: StatusConsole, + cause: BaseException | None = None, +) -> NoReturn: + printer.print(ui.fmt_contract_error(message)) + if cause is None: + raise SystemExit(ExitCode.CONTRACT_ERROR) + raise SystemExit(ExitCode.CONTRACT_ERROR) from cause + + +def resolve_existing_root_path(*, args: object, printer: StatusConsole) -> Path: + try: + root_path = Path(text_attr(args, "root", DEFAULT_ROOT)).resolve() + except OSError as exc: + exit_contract_error( + ui.ERR_INVALID_ROOT_PATH.format(error=exc), + printer=printer, + cause=exc, + ) + if not root_path.exists(): + exit_contract_error( + ui.ERR_ROOT_NOT_FOUND.format(path=root_path), + printer=printer, + ) + return root_path + + +def load_pyproject_config_or_exit( + *, + root_path: Path, + load_pyproject_config_fn: Callable[[Path], dict[str, object]], + printer: StatusConsole, +) -> dict[str, object]: + try: + return load_pyproject_config_fn(root_path) + except ConfigValidationError as exc: + exit_contract_error(str(exc), printer=printer, cause=exc) + + +def configure_runtime_flags(args: CLIArgsLike) -> None: + if args.debug: + os.environ["CODECLONE_DEBUG"] = "1" + if args.ci: + args.fail_on_new = True + args.no_color = True + args.quiet = True + + +def configure_runtime_console( + *, + args: CLIArgsLike, + make_plain_console: Callable[[], object], + make_console: Callable[..., object], + set_console: Callable[[object], None], +) -> object: + console = ( + make_plain_console() if args.quiet else make_console(no_color=args.no_color) + ) + set_console(console) + return console + + +def validate_numeric_args_or_exit( + *, + args: CLIArgsLike, + validate_numeric_args_fn: Callable[[CLIArgsLike], bool], + printer: StatusConsole, +) -> None: + if validate_numeric_args_fn(args): + return + exit_contract_error( + "Size limits must be non-negative integers (MB), " + "threshold flags must be >= 0 or -1, and coverage thresholds " + "must be between 0 and 100.", + printer=printer, + ) + + +def resolve_baseline_inputs( + *, + ap: ParserWithDefaults, + args: CLIArgsLike, + root_path: Path, + baseline_path_from_args: bool, + metrics_path_from_args: bool, + probe_metrics_baseline_section_fn: Callable[[Path], MetricsBaselineSectionProbe], + printer: StatusConsole, +) -> ResolvedBaselineInputs: + baseline_arg_path = Path(text_attr(args, "baseline")).expanduser() + try: + baseline_path = resolve_runtime_path_arg( + root_path=root_path, + raw_path=text_attr(args, "baseline"), + from_cli=baseline_path_from_args, + ) + baseline_exists = baseline_path.exists() + except OSError as exc: + exit_contract_error( + ui.fmt_invalid_baseline_path(path=baseline_arg_path, error=exc), + printer=printer, + cause=exc, + ) + + 
shared_baseline_payload: dict[str, object] | None = None + default_metrics_baseline = ap.get_default("metrics_baseline") + metrics_baseline_value = text_attr(args, "metrics_baseline") + metrics_path_overridden = metrics_path_from_args or ( + metrics_baseline_value != str(default_metrics_baseline) + ) + metrics_baseline_raw_path = ( + metrics_baseline_value + if metrics_path_overridden + else text_attr(args, "baseline") + ) + metrics_baseline_arg_path = Path(metrics_baseline_raw_path).expanduser() + try: + metrics_baseline_path = resolve_runtime_path_arg( + root_path=root_path, + raw_path=metrics_baseline_raw_path, + from_cli=metrics_path_from_args, + ) + if metrics_baseline_path == baseline_path: + probe = probe_metrics_baseline_section_fn(metrics_baseline_path) + metrics_baseline_exists = probe.has_metrics_section + shared_baseline_payload = probe.payload + else: + metrics_baseline_exists = metrics_baseline_path.exists() + except OSError as exc: + exit_contract_error( + ui.fmt_invalid_baseline_path( + path=metrics_baseline_arg_path, + error=exc, + ), + printer=printer, + cause=exc, + ) + + return ResolvedBaselineInputs( + baseline_path=baseline_path, + baseline_exists=baseline_exists, + metrics_baseline_path=metrics_baseline_path, + metrics_baseline_exists=metrics_baseline_exists, + shared_baseline_payload=shared_baseline_payload, + ) diff --git a/codeclone/surfaces/cli/state.py b/codeclone/surfaces/cli/state.py new file mode 100644 index 0000000..1c9aa9e --- /dev/null +++ b/codeclone/surfaces/cli/state.py @@ -0,0 +1,25 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from pathlib import Path + +console: object | None = None +LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser() + + +def get_console() -> object: + global console + if console is None: + from .console import make_plain_console + + console = make_plain_console() + return console + + +def set_console(value: object) -> None: + global console + console = value diff --git a/codeclone/_cli_summary.py b/codeclone/surfaces/cli/summary.py similarity index 57% rename from codeclone/_cli_summary.py rename to codeclone/surfaces/cli/summary.py index 5b849a8..1187fbc 100644 --- a/codeclone/_cli_summary.py +++ b/codeclone/surfaces/cli/summary.py @@ -9,7 +9,13 @@ from dataclasses import dataclass from typing import Protocol -from . import ui_messages as ui +from ... import ui_messages as ui +from ...core._types import AnalysisResult, DiscoveryResult, ProcessingResult +from ...models import MetricsDiff +from ...utils import coerce as _coerce + +_as_int = _coerce.as_int +_as_mapping = _coerce.as_mapping @dataclass(frozen=True, slots=True) @@ -25,6 +31,13 @@ class MetricsSnapshot: dead_code_count: int health_total: int health_grade: str + dependency_avg_depth: float = 0.0 + dependency_p95_depth: int = 0 + dependency_max_depth: int = 0 + security_surfaces_items: int = 0 + security_surfaces_category_count: int = 0 + security_surfaces_production: int = 0 + security_surfaces_tests: int = 0 suppressed_dead_code_count: int = 0 overloaded_modules_candidates: int = 0 overloaded_modules_total: int = 0 @@ -59,6 +72,137 @@ class _Printer(Protocol): def print(self, *objects: object, **kwargs: object) -> None: ... 
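+# Summary counts merge freshly processed totals with the cached_* counters
+# that discovery carries (presumably for files served from the incremental
+# cache), so the printed totals describe the whole project rather than only
+# this run's reparsed files. The cached_* attributes are read via getattr
+# with a 0 fallback, so DiscoveryResult shapes without them still work.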
+def build_summary_counts( + *, + discovery_result: DiscoveryResult, + processing_result: ProcessingResult, +) -> dict[str, int]: + return { + "analyzed_lines": processing_result.analyzed_lines + + int(getattr(discovery_result, "cached_lines", 0)), + "analyzed_functions": processing_result.analyzed_functions + + int(getattr(discovery_result, "cached_functions", 0)), + "analyzed_methods": processing_result.analyzed_methods + + int(getattr(discovery_result, "cached_methods", 0)), + "analyzed_classes": processing_result.analyzed_classes + + int(getattr(discovery_result, "cached_classes", 0)), + } + + +def build_metrics_snapshot( + *, + analysis_result: AnalysisResult, + metrics_diff: MetricsDiff | None, + api_surface_diff_available: bool, +) -> MetricsSnapshot: + project_metrics = analysis_result.project_metrics + if project_metrics is None: + raise ValueError("Metrics snapshot requires computed project metrics.") + metrics_payload_map = _as_mapping(analysis_result.metrics_payload) + overloaded_modules_summary = _as_mapping( + _as_mapping(metrics_payload_map.get("overloaded_modules")).get("summary") + ) + adoption_summary = _as_mapping( + _as_mapping(metrics_payload_map.get("coverage_adoption")).get("summary") + ) + api_surface_summary = _as_mapping( + _as_mapping(metrics_payload_map.get("api_surface")).get("summary") + ) + coverage_join_summary = _as_mapping( + _as_mapping(metrics_payload_map.get("coverage_join")).get("summary") + ) + security_surfaces_summary = _as_mapping( + _as_mapping(metrics_payload_map.get("security_surfaces")).get("summary") + ) + coverage_join_source = str(coverage_join_summary.get("source", "")).strip() + return MetricsSnapshot( + complexity_avg=project_metrics.complexity_avg, + complexity_max=project_metrics.complexity_max, + high_risk_count=len(project_metrics.high_risk_functions), + coupling_avg=project_metrics.coupling_avg, + coupling_max=project_metrics.coupling_max, + cohesion_avg=project_metrics.cohesion_avg, + cohesion_max=project_metrics.cohesion_max, + cycles_count=len(project_metrics.dependency_cycles), + dependency_avg_depth=_coerce.as_float( + _as_mapping(metrics_payload_map.get("dependencies")).get("avg_depth") + ), + dependency_p95_depth=_as_int( + _as_mapping(metrics_payload_map.get("dependencies")).get("p95_depth") + ), + dependency_max_depth=project_metrics.dependency_max_depth, + security_surfaces_items=_as_int(security_surfaces_summary.get("items")), + security_surfaces_category_count=_as_int( + security_surfaces_summary.get("category_count") + ), + security_surfaces_production=_as_int( + security_surfaces_summary.get("production") + ), + security_surfaces_tests=_as_int(security_surfaces_summary.get("tests")), + dead_code_count=len(project_metrics.dead_code), + health_total=project_metrics.health.total, + health_grade=project_metrics.health.grade, + suppressed_dead_code_count=analysis_result.suppressed_dead_code_items, + overloaded_modules_candidates=_as_int( + overloaded_modules_summary.get("candidates") + ), + overloaded_modules_total=_as_int(overloaded_modules_summary.get("total")), + overloaded_modules_population_status=str( + overloaded_modules_summary.get("population_status", "") + ), + overloaded_modules_top_score=_coerce.as_float( + overloaded_modules_summary.get("top_score") + ), + adoption_param_permille=( + _as_int(adoption_summary.get("param_permille")) + if adoption_summary + else None + ), + adoption_return_permille=( + _as_int(adoption_summary.get("return_permille")) + if adoption_summary + else None + ), + 
adoption_docstring_permille=( + _as_int(adoption_summary.get("docstring_permille")) + if adoption_summary + else None + ), + adoption_any_annotation_count=_as_int(adoption_summary.get("typing_any_count")), + api_surface_enabled=bool(api_surface_summary.get("enabled")), + api_surface_modules=_as_int(api_surface_summary.get("modules")), + api_surface_public_symbols=_as_int(api_surface_summary.get("public_symbols")), + api_surface_added=( + len(metrics_diff.new_api_symbols) + if metrics_diff is not None and api_surface_diff_available + else 0 + ), + api_surface_breaking=( + len(metrics_diff.new_api_breaking_changes) + if metrics_diff is not None and api_surface_diff_available + else 0 + ), + coverage_join_status=str(coverage_join_summary.get("status", "")).strip(), + coverage_join_overall_permille=_as_int( + coverage_join_summary.get("overall_permille") + ), + coverage_join_coverage_hotspots=_as_int( + coverage_join_summary.get("coverage_hotspots") + ), + coverage_join_scope_gap_hotspots=_as_int( + coverage_join_summary.get("scope_gap_hotspots") + ), + coverage_join_threshold_percent=_as_int( + coverage_join_summary.get("hotspot_threshold_percent") + ), + coverage_join_source_label=( + coverage_join_source.rsplit("/", maxsplit=1)[-1] + if coverage_join_source + else "" + ), + ) + + def _print_summary( *, console: _Printer, @@ -132,7 +276,7 @@ def _print_summary( ) if not invariant_ok: - console.print(f"[warning]{ui.WARN_SUMMARY_ACCOUNTING_MISMATCH}[/warning]") + console.print(ui.fmt_cli_runtime_warning(ui.WARN_SUMMARY_ACCOUNTING_MISMATCH)) def _print_metrics( @@ -157,6 +301,21 @@ def _print_metrics( overloaded_modules=metrics.overloaded_modules_candidates, ) ) + console.print( + ui.fmt_summary_compact_dependencies( + avg_depth=metrics.dependency_avg_depth, + p95_depth=metrics.dependency_p95_depth, + max_depth=metrics.dependency_max_depth, + ) + ) + console.print( + ui.fmt_summary_compact_security_surfaces( + items=metrics.security_surfaces_items, + categories=metrics.security_surfaces_category_count, + production=metrics.security_surfaces_production, + tests=metrics.security_surfaces_tests, + ) + ) if ( metrics.adoption_param_permille is not None and metrics.adoption_return_permille is not None @@ -210,6 +369,21 @@ def _print_metrics( ui.fmt_metrics_cohesion(metrics.cohesion_avg, metrics.cohesion_max) ) console.print(ui.fmt_metrics_cycles(metrics.cycles_count)) + console.print( + ui.fmt_metrics_dependencies( + avg_depth=metrics.dependency_avg_depth, + p95_depth=metrics.dependency_p95_depth, + max_depth=metrics.dependency_max_depth, + ) + ) + console.print( + ui.fmt_metrics_security_surfaces( + items=metrics.security_surfaces_items, + categories=metrics.security_surfaces_category_count, + production=metrics.security_surfaces_production, + tests=metrics.security_surfaces_tests, + ) + ) console.print( ui.fmt_metrics_dead_code( metrics.dead_code_count, diff --git a/codeclone/surfaces/cli/tips.py b/codeclone/surfaces/cli/tips.py new file mode 100644 index 0000000..7b2e22c --- /dev/null +++ b/codeclone/surfaces/cli/tips.py @@ -0,0 +1,159 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import os +import sys +from collections.abc import Mapping +from pathlib import Path +from typing import TextIO + +from ... 
import ui_messages as ui +from ...utils.json_io import read_json_object, write_json_document_atomically +from .attrs import bool_attr +from .types import PrinterLike + +_VSCODE_EXTENSION_TIP_KEY = "vscode_extension" +_TIPS_SCHEMA_VERSION = 1 +_VSCODE_EXTENSION_URL = ( + "https://marketplace.visualstudio.com/items?itemName=orenlab.codeclone" +) +_CI_ENV_KEYS: tuple[str, ...] = ( + "CI", + "GITHUB_ACTIONS", + "BUILDKITE", + "TF_BUILD", + "TEAMCITY_VERSION", +) +_VSCODE_ENV_KEYS: tuple[str, ...] = ( + "VSCODE_PID", + "VSCODE_IPC_HOOK", + "VSCODE_CWD", +) + + +def _tips_state_path(cache_path: Path) -> Path: + return cache_path.parent / "tips.json" + + +def _is_vscode_environment(environ: Mapping[str, str]) -> bool: + if environ.get("TERM_PROGRAM", "").strip().lower() == "vscode": + return True + return any(key in environ for key in _VSCODE_ENV_KEYS) + + +def _is_ci_environment(environ: Mapping[str, str]) -> bool: + return any(environ.get(key, "").strip() for key in _CI_ENV_KEYS) + + +def _stream_is_tty(stream: TextIO) -> bool: + try: + return bool(stream.isatty()) + except OSError: + return False + + +def _empty_tips_state() -> dict[str, object]: + return { + "schema_version": _TIPS_SCHEMA_VERSION, + "tips": {}, + } + + +def _load_tips_state(path: Path) -> dict[str, object]: + try: + payload = read_json_object(path) + except (OSError, TypeError, ValueError): + return _empty_tips_state() + tips = payload.get("tips") + if not isinstance(tips, dict): + return _empty_tips_state() + return { + "schema_version": _TIPS_SCHEMA_VERSION, + "tips": dict(tips), + } + + +def _tip_last_shown_version(state: Mapping[str, object], *, tip_key: str) -> str: + tips = state.get("tips") + if not isinstance(tips, dict): + return "" + entry = tips.get(tip_key) + if not isinstance(entry, dict): + return "" + last_shown_version = entry.get("last_shown_version") + if isinstance(last_shown_version, str): + return last_shown_version + return "" + + +def _remember_tip_version( + *, + path: Path, + state: Mapping[str, object], + tip_key: str, + codeclone_version: str, +) -> None: + tips = state.get("tips") + updated_tips = dict(tips) if isinstance(tips, dict) else {} + updated_tips[tip_key] = {"last_shown_version": codeclone_version} + write_json_document_atomically( + path, + { + "schema_version": _TIPS_SCHEMA_VERSION, + "tips": updated_tips, + }, + sort_keys=True, + indent=True, + trailing_newline=True, + ) + + +def maybe_print_vscode_extension_tip( + *, + args: object, + console: PrinterLike, + codeclone_version: str, + cache_path: Path, + environ: Mapping[str, str] | None = None, + stream: TextIO | None = None, +) -> bool: + effective_environ = os.environ if environ is None else environ + effective_stream = sys.stdout if stream is None else stream + if bool_attr(args, "quiet") or bool_attr(args, "ci"): + return False + if _is_ci_environment(effective_environ): + return False + if not _stream_is_tty(effective_stream): + return False + if not _is_vscode_environment(effective_environ): + return False + + state_path = _tips_state_path(cache_path) + state = _load_tips_state(state_path) + if ( + _tip_last_shown_version(state, tip_key=_VSCODE_EXTENSION_TIP_KEY) + == codeclone_version + ): + return False + + console.print(ui.fmt_vscode_extension_tip(url=_VSCODE_EXTENSION_URL)) + try: + _remember_tip_version( + path=state_path, + state=state, + tip_key=_VSCODE_EXTENSION_TIP_KEY, + codeclone_version=codeclone_version, + ) + except OSError: + return True + return True + + +__all__ = [ + "maybe_print_vscode_extension_tip", +] diff 
--git a/codeclone/surfaces/cli/types.py b/codeclone/surfaces/cli/types.py new file mode 100644 index 0000000..3e464b2 --- /dev/null +++ b/codeclone/surfaces/cli/types.py @@ -0,0 +1,143 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from contextlib import AbstractContextManager +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, Protocol, runtime_checkable + +from ...core._types import ( + AnalysisResult, + BootstrapResult, + DiscoveryResult, + OutputPaths, + ReportArtifacts, +) +from ...core._types import ( + FileProcessResult as ProcessingResult, +) + +ReportPathOrigin = Literal["default", "explicit"] + + +@dataclass(frozen=True, slots=True) +class ChangedCloneGate: + """Changed-scope clone summary used by CLI post-run gating.""" + + changed_paths: tuple[str, ...] + new_func: frozenset[str] + new_block: frozenset[str] + total_clone_groups: int + findings_total: int + findings_new: int + findings_known: int + + +@runtime_checkable +class PrinterLike(Protocol): + """Minimal console surface that supports plain text output.""" + + def print(self, *objects: object, **kwargs: object) -> None: ... + + +@runtime_checkable +class StatusConsole(PrinterLike, Protocol): + """Console surface that can open rich status contexts.""" + + def status( + self, + *objects: object, + **kwargs: object, + ) -> AbstractContextManager[object]: ... + + +class CLIArgsLike(Protocol): + """Typed attribute view over the CLI namespace used by the workflow.""" + + root: str | Path + baseline: str | Path + metrics_baseline: str | Path + cache_path: str | Path | None + html_out: str | None + json_out: str | None + md_out: str | None + sarif_out: str | None + text_out: str | None + debug: bool + ci: bool + quiet: bool + no_color: bool + no_progress: bool + open_html_report: bool + timestamped_report_paths: bool + changed_only: bool + diff_against: str | None + paths_from_git_diff: str | None + skip_metrics: bool + skip_dead_code: bool + skip_dependencies: bool + update_baseline: bool + update_metrics_baseline: bool + fail_on_new: bool + fail_threshold: int + fail_complexity: int + fail_coupling: int + fail_cohesion: int + fail_cycles: bool + fail_dead_code: bool + fail_health: int + fail_on_new_metrics: bool + fail_on_typing_regression: bool + fail_on_docstring_regression: bool + fail_on_api_break: bool + fail_on_untested_hotspots: bool + min_typing_coverage: int + min_docstring_coverage: int + coverage_min: int + coverage_xml: str | None + api_surface: bool + verbose: bool + max_baseline_size_mb: int + max_cache_size_mb: int + min_loc: int + min_stmt: int + block_min_loc: int + block_min_stmt: int + segment_min_loc: int + segment_min_stmt: int + + +class ParserWithDefaults(Protocol): + """Argparse-compatible parser surface for default lookups.""" + + def get_default(self, dest: str) -> object: ... 
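+
+# NOTE: StatusConsole is runtime_checkable, so the isinstance() check in
+# require_status_console() below only verifies that print() and status()
+# exist; argument signatures are not validated. A minimal conforming stub
+# (hypothetical, shown for illustration only):
+#
+#     from contextlib import nullcontext
+#
+#     class _NullStatusConsole:
+#         def print(self, *objects: object, **kwargs: object) -> None:
+#             pass
+#
+#         def status(self, *objects: object, **kwargs: object):
+#             return nullcontext()
+#
+#     assert isinstance(_NullStatusConsole(), StatusConsole)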
+ + +def require_status_console(value: object) -> StatusConsole: + """Return a status-capable console or raise a precise type error.""" + + if not isinstance(value, StatusConsole): + raise TypeError("CLI console does not provide print/status methods.") + return value + + +__all__ = [ + "AnalysisResult", + "BootstrapResult", + "CLIArgsLike", + "ChangedCloneGate", + "DiscoveryResult", + "OutputPaths", + "ParserWithDefaults", + "PrinterLike", + "ProcessingResult", + "ReportArtifacts", + "ReportPathOrigin", + "StatusConsole", + "require_status_console", +] diff --git a/codeclone/surfaces/cli/workflow.py b/codeclone/surfaces/cli/workflow.py new file mode 100644 index 0000000..56698a7 --- /dev/null +++ b/codeclone/surfaces/cli/workflow.py @@ -0,0 +1,547 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import sys +import time +from pathlib import Path + +from ... import __version__ +from ... import ui_messages as ui +from ...baseline import Baseline +from ...cache.projection import build_segment_report_projection +from ...cache.store import Cache +from ...config import resolver as config_resolver +from ...config.argparse_builder import build_parser +from ...config.pyproject_loader import load_pyproject_config +from ...contracts import ( + ISSUES_URL, + ExitCode, +) +from ...core._types import AnalysisResult, BootstrapResult, DiscoveryResult +from ...core._types import ProcessingResult as PipelineProcessingResult +from ...core.bootstrap import bootstrap +from ...core.discovery import discover +from ...core.parallelism import process +from ...core.pipeline import analyze +from ...core.reporting import gate, report +from ...models import MetricsDiff +from ...report.html import build_html_report +from . import baseline_state as cli_baseline_state +from . import changed_scope as cli_changed_scope +from . import console as cli_console +from . import execution as cli_execution +from . import post_run as cli_post_run +from . import report_meta as cli_meta_mod +from . import reports_output as cli_reports_output +from . import runtime as cli_runtime +from . import startup as cli_startup +from . import state as cli_state +from . import summary as cli_summary +from . 
import tips as cli_tips +from .types import CLIArgsLike, StatusConsole, require_status_console + +__all__ = [ + "LEGACY_CACHE_PATH", + "Baseline", + "Cache", + "ExitCode", + "_changed_clone_gate_from_report", + "_configure_metrics_mode", + "_enforce_gating", + "_git_diff_changed_paths", + "_main_impl", + "_make_console", + "_make_plain_console", + "_make_rich_console", + "_print_changed_scope", + "_print_failed_files", + "_print_gating_failure_block", + "_print_summary", + "_probe_metrics_baseline_section", + "_resolve_cache_path", + "_resolve_clone_baseline_state", + "_resolve_metrics_baseline_state", + "_rich_progress_symbols", + "_run_analysis_stages", + "_validate_report_ui_flags", + "_write_report_outputs", + "analyze", + "apply_pyproject_config_overrides", + "bootstrap", + "build_html_report", + "collect_explicit_cli_dests", + "console", + "discover", + "gate", + "main", + "maybe_print_vscode_extension_tip", + "print_banner", + "process", + "report", +] + +apply_pyproject_config_overrides = config_resolver.apply_pyproject_config_overrides +collect_explicit_cli_dests = config_resolver.collect_explicit_cli_dests + +_probe_metrics_baseline_section = cli_baseline_state._probe_metrics_baseline_section +_resolve_clone_baseline_state = cli_baseline_state._resolve_clone_baseline_state +_resolve_metrics_baseline_state = cli_baseline_state._resolve_metrics_baseline_state + +_changed_clone_gate_from_report = cli_changed_scope._changed_clone_gate_from_report +_git_diff_changed_paths = cli_changed_scope._git_diff_changed_paths +_validate_changed_scope_args = cli_changed_scope._validate_changed_scope_args + +_is_debug_enabled = cli_console._is_debug_enabled +_make_plain_console = cli_console._make_plain_console +_make_rich_console = cli_console.make_console +_parse_metric_reason_entry = cli_console._parse_metric_reason_entry +_print_banner_impl = cli_console.print_banner +_print_gating_failure_block = cli_console._print_gating_failure_block +_print_verbose_clone_hashes = cli_console._print_verbose_clone_hashes +_rich_progress_symbols = cli_console._rich_progress_symbols + +print_pipeline_done_if_needed = cli_execution.print_pipeline_done_if_needed +run_analysis_stages = cli_execution.run_analysis_stages + +_build_diff_context = cli_post_run.build_diff_context +maybe_print_changed_scope_snapshot = cli_post_run.maybe_print_changed_scope_snapshot +print_metrics_if_available = cli_post_run.print_metrics_if_available +resolve_changed_clone_gate = cli_post_run.resolve_changed_clone_gate +warn_new_clones_without_fail = cli_post_run.warn_new_clones_without_fail +maybe_print_vscode_extension_tip = cli_tips.maybe_print_vscode_extension_tip + +_report_path_origins = cli_reports_output._report_path_origins +_resolve_output_paths = cli_reports_output._resolve_output_paths +_validate_report_ui_flags = cli_reports_output._validate_report_ui_flags +_write_report_outputs = cli_reports_output._write_report_outputs + +_configure_metrics_mode = cli_runtime._configure_metrics_mode +_metrics_computed = cli_runtime._metrics_computed +_print_failed_files = cli_runtime._print_failed_files +_resolve_cache_path_impl = cli_runtime._resolve_cache_path +_resolve_cache_status = cli_runtime._resolve_cache_status +_validate_numeric_args = cli_runtime._validate_numeric_args +gating_mode_enabled = cli_runtime.gating_mode_enabled +prepare_metrics_mode_and_ui = cli_runtime.prepare_metrics_mode_and_ui +resolve_report_cache_path = cli_runtime.resolve_report_cache_path + +_configure_runtime_console_impl = 
cli_startup.configure_runtime_console +_configure_runtime_flags = cli_startup.configure_runtime_flags +_load_pyproject_config_or_exit = cli_startup.load_pyproject_config_or_exit +_resolve_baseline_inputs = cli_startup.resolve_baseline_inputs +_resolve_existing_root_path = cli_startup.resolve_existing_root_path +_validate_numeric_args_or_exit = cli_startup.validate_numeric_args_or_exit + +_print_changed_scope = cli_summary._print_changed_scope +_print_metrics = cli_summary._print_metrics +_print_summary = cli_summary._print_summary +build_metrics_snapshot = cli_summary.build_metrics_snapshot +build_summary_counts = cli_summary.build_summary_counts + + +def _set_console(value: object) -> object: + cli_state.set_console(value) + return value + + +def _console() -> StatusConsole: + return require_status_console(_set_console(console)) + + +def _make_console(*, no_color: bool) -> object: + return _make_rich_console(no_color=no_color, width=ui.CLI_LAYOUT_MAX_WIDTH) + + +console: object = _make_plain_console() +_set_console(console) +LEGACY_CACHE_PATH = cli_state.LEGACY_CACHE_PATH + + +def print_banner(*, root: Path | None = None) -> None: + _set_console(console) + _print_banner_impl(root=root) + + +def _configure_runtime_console(args: CLIArgsLike) -> None: + global console + console = _configure_runtime_console_impl( + args=args, + make_plain_console=_make_plain_console, + make_console=_make_console, + set_console=lambda value: cli_state.set_console(value), + ) + + +def _resolve_cache_path( + *, + root_path: Path, + args: CLIArgsLike, + from_args: bool, +) -> Path: + cli_state.LEGACY_CACHE_PATH = LEGACY_CACHE_PATH + _set_console(console) + return _resolve_cache_path_impl( + root_path=root_path, + args=args, + from_args=from_args, + ) + + +def _cache_update_segment_projection(cache: Cache, analysis: AnalysisResult) -> None: + if not hasattr(cache, "segment_report_projection"): + return + new_projection = build_segment_report_projection( + digest=analysis.segment_groups_raw_digest, + suppressed=analysis.suppressed_segment_groups, + groups=analysis.segment_groups, + ) + if new_projection != cache.segment_report_projection: + cache.segment_report_projection = new_projection + cache._dirty = True + + +def _run_analysis_stages( + *, + args: CLIArgsLike, + boot: BootstrapResult, + cache: Cache, +) -> tuple[DiscoveryResult, PipelineProcessingResult, AnalysisResult]: + _set_console(console) + return run_analysis_stages( + args=args, + boot=boot, + cache=cache, + discover_fn=discover, + process_fn=process, + analyze_fn=analyze, + print_failed_files_fn=_print_failed_files, + cache_update_segment_projection_fn=_cache_update_segment_projection, + rich_progress_symbols_fn=_rich_progress_symbols, + ) + + +def _enforce_gating( + *, + args: object, + boot: BootstrapResult, + analysis: AnalysisResult, + processing: PipelineProcessingResult, + source_read_contract_failure: bool, + baseline_failure_code: ExitCode | None, + metrics_baseline_failure_code: ExitCode | None, + new_func: set[str], + new_block: set[str], + metrics_diff: MetricsDiff | None, + html_report_path: str | None, + clone_threshold_total: int | None = None, +) -> None: + _set_console(console) + cli_execution.enforce_gating( + args=args, + boot=boot, + analysis=analysis, + processing=processing, + source_read_contract_failure=source_read_contract_failure, + baseline_failure_code=baseline_failure_code, + metrics_baseline_failure_code=metrics_baseline_failure_code, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + 
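+        # Collaborators (gate_fn and the print_* hooks) are passed in
+        # explicitly, keeping this wrapper a thin facade over cli_execution.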
html_report_path=html_report_path, + gate_fn=gate, + parse_metric_reason_entry_fn=_parse_metric_reason_entry, + print_gating_failure_block_fn=_print_gating_failure_block, + print_verbose_clone_hashes_fn=_print_verbose_clone_hashes, + clone_threshold_total=clone_threshold_total, + ) + + +def _main_impl() -> None: + run_started_at = time.monotonic() + analysis_started_at_utc = cli_meta_mod._current_report_timestamp_utc() + ap = build_parser(__version__) + + raw_argv = tuple(sys.argv[1:]) + explicit_cli_dests = collect_explicit_cli_dests(ap, argv=raw_argv) + report_path_origins = _report_path_origins(raw_argv) + report_generated_at_utc = cli_meta_mod._current_report_timestamp_utc() + cache_path_from_args = any( + arg in {"--cache-dir", "--cache-path"} + or arg.startswith(("--cache-dir=", "--cache-path=")) + for arg in sys.argv + ) + baseline_path_from_args = any( + arg == "--baseline" or arg.startswith("--baseline=") for arg in sys.argv + ) + metrics_path_from_args = any( + arg == "--metrics-baseline" or arg.startswith("--metrics-baseline=") + for arg in sys.argv + ) + args = ap.parse_args() + + root_path = _resolve_existing_root_path(args=args, printer=_console()) + pyproject_config = _load_pyproject_config_or_exit( + root_path=root_path, + load_pyproject_config_fn=load_pyproject_config, + printer=_console(), + ) + apply_pyproject_config_overrides( + args=args, + config_values=pyproject_config, + explicit_cli_dests=explicit_cli_dests, + ) + git_diff_ref = _validate_changed_scope_args(args=args) + changed_paths = ( + _git_diff_changed_paths(root_path=root_path, git_diff_ref=git_diff_ref) + if git_diff_ref is not None + else () + ) + _configure_runtime_flags(args) + _configure_runtime_console(args) + _validate_numeric_args_or_exit( + args=args, + validate_numeric_args_fn=_validate_numeric_args, + printer=_console(), + ) + baseline_inputs = _resolve_baseline_inputs( + ap=ap, + args=args, + root_path=root_path, + baseline_path_from_args=baseline_path_from_args, + metrics_path_from_args=metrics_path_from_args, + probe_metrics_baseline_section_fn=_probe_metrics_baseline_section, + printer=_console(), + ) + prepare_metrics_mode_and_ui( + args=args, + root_path=root_path, + baseline_path=baseline_inputs.baseline_path, + baseline_exists=baseline_inputs.baseline_exists, + metrics_baseline_path=baseline_inputs.metrics_baseline_path, + metrics_baseline_exists=baseline_inputs.metrics_baseline_exists, + configure_metrics_mode=_configure_metrics_mode, + print_banner=print_banner, + ) + + output_paths = _resolve_output_paths( + args, + report_path_origins=report_path_origins, + report_generated_at_utc=report_generated_at_utc, + ) + _validate_report_ui_flags(args=args, output_paths=output_paths) + cache_path = _resolve_cache_path( + root_path=root_path, + args=args, + from_args=cache_path_from_args, + ) + + cache = Cache( + cache_path, + root=root_path, + max_size_bytes=args.max_cache_size_mb * 1024 * 1024, + min_loc=args.min_loc, + min_stmt=args.min_stmt, + block_min_loc=args.block_min_loc, + block_min_stmt=args.block_min_stmt, + segment_min_loc=args.segment_min_loc, + segment_min_stmt=args.segment_min_stmt, + collect_api_surface=bool(args.api_surface), + ) + cache.load() + if cache.load_warning: + _console().print(ui.fmt_cli_runtime_warning(cache.load_warning)) + + boot = bootstrap( + args=args, + root=root_path, + output_paths=output_paths, + cache_path=cache_path, + ) + discovery_result, processing_result, analysis_result = _run_analysis_stages( + args=args, + boot=boot, + cache=cache, + ) + + 
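+    # Unreadable source files fail the run only when gating is enabled and
+    # the baseline is not being rewritten (args.update_baseline tolerates them).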
source_read_contract_failure = ( + bool(processing_result.source_read_failures) + and gating_mode_enabled(args) + and not args.update_baseline + ) + shared_baseline_payload = ( + baseline_inputs.shared_baseline_payload + if baseline_inputs.metrics_baseline_path == baseline_inputs.baseline_path + else None + ) + baseline_state = _resolve_clone_baseline_state( + args=args, + baseline_path=baseline_inputs.baseline_path, + baseline_exists=baseline_inputs.baseline_exists, + analysis=analysis_result, + shared_baseline_payload=shared_baseline_payload, + ) + metrics_baseline_state = _resolve_metrics_baseline_state( + args=args, + metrics_baseline_path=baseline_inputs.metrics_baseline_path, + metrics_baseline_exists=baseline_inputs.metrics_baseline_exists, + clone_baseline_state=baseline_state, + baseline_updated_path=baseline_state.updated_path, + analysis=analysis_result, + shared_baseline_payload=shared_baseline_payload, + ) + + cache_status, cache_schema_version = _resolve_cache_status(cache) + report_meta = cli_meta_mod.build_cli_report_meta( + codeclone_version=__version__, + scan_root=root_path, + baseline_path=baseline_inputs.baseline_path, + baseline_state=baseline_state, + cache_path=resolve_report_cache_path(cache_path), + cache_status=cache_status, + cache_schema_version=cache_schema_version, + processing_result=processing_result, + metrics_baseline_path=baseline_inputs.metrics_baseline_path, + metrics_baseline_state=metrics_baseline_state, + analysis_result=analysis_result, + args=args, + metrics_computed=_metrics_computed(args), + analysis_started_at_utc=analysis_started_at_utc, + report_generated_at_utc=report_generated_at_utc, + ) + + diff_context = _build_diff_context( + analysis=analysis_result, + baseline_path=baseline_inputs.baseline_path, + baseline_state=baseline_state, + metrics_baseline_state=metrics_baseline_state, + ) + summary_counts = build_summary_counts( + discovery_result=discovery_result, + processing_result=processing_result, + ) + _print_summary( + console=_console(), + quiet=args.quiet, + files_found=discovery_result.files_found, + files_analyzed=processing_result.files_analyzed, + cache_hits=discovery_result.cache_hits, + files_skipped=processing_result.files_skipped, + analyzed_lines=summary_counts["analyzed_lines"], + analyzed_functions=summary_counts["analyzed_functions"], + analyzed_methods=summary_counts["analyzed_methods"], + analyzed_classes=summary_counts["analyzed_classes"], + func_clones_count=analysis_result.func_clones_count, + block_clones_count=analysis_result.block_clones_count, + segment_clones_count=analysis_result.segment_clones_count, + suppressed_golden_fixture_groups=len( + getattr(analysis_result, "suppressed_clone_groups", ()) + ), + suppressed_segment_groups=analysis_result.suppressed_segment_groups, + new_clones_count=diff_context.new_clones_count, + ) + print_metrics_if_available( + args=args, + analysis=analysis_result, + metrics_diff=diff_context.metrics_diff, + api_surface_diff_available=diff_context.api_surface_diff_available, + console=_console(), + build_metrics_snapshot_fn=build_metrics_snapshot, + print_metrics_fn=_print_metrics, + ) + + report_artifacts = report( + boot=boot, + discovery=discovery_result, + processing=processing_result, + analysis=analysis_result, + report_meta=report_meta, + new_func=diff_context.new_func, + new_block=diff_context.new_block, + html_builder=build_html_report, + metrics_diff=diff_context.metrics_diff, + coverage_adoption_diff_available=diff_context.coverage_adoption_diff_available, + 
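+        # include_report_document (set from changed_paths below) keeps the
+        # in-memory report document only when changed-scope gating needs it.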
api_surface_diff_available=diff_context.api_surface_diff_available, + include_report_document=bool(changed_paths), + ) + changed_clone_gate = resolve_changed_clone_gate( + args=args, + report_document=report_artifacts.report_document, + changed_paths=changed_paths, + changed_clone_gate_from_report_fn=_changed_clone_gate_from_report, + ) + maybe_print_changed_scope_snapshot( + args=args, + changed_clone_gate=changed_clone_gate, + console=_console(), + print_changed_scope_fn=_print_changed_scope, + ) + html_report_path = _write_report_outputs( + args=args, + output_paths=output_paths, + report_artifacts=report_artifacts, + open_html_report=args.open_html_report, + ) + + _enforce_gating( + args=args, + boot=boot, + analysis=analysis_result, + processing=processing_result, + source_read_contract_failure=source_read_contract_failure, + baseline_failure_code=baseline_state.failure_code, + metrics_baseline_failure_code=metrics_baseline_state.failure_code, + new_func=( + set(changed_clone_gate.new_func) + if changed_clone_gate + else diff_context.new_func + ), + new_block=( + set(changed_clone_gate.new_block) + if changed_clone_gate + else diff_context.new_block + ), + metrics_diff=diff_context.metrics_diff, + html_report_path=html_report_path, + clone_threshold_total=( + changed_clone_gate.total_clone_groups if changed_clone_gate else None + ), + ) + + notice_new_clones_count = ( + len(changed_clone_gate.new_func) + len(changed_clone_gate.new_block) + if changed_clone_gate is not None + else diff_context.new_clones_count + ) + warn_new_clones_without_fail( + args=args, + notice_new_clones_count=notice_new_clones_count, + console=_console(), + ) + maybe_print_vscode_extension_tip( + args=args, + console=_console(), + codeclone_version=__version__, + cache_path=cache_path, + ) + print_pipeline_done_if_needed(args=args, run_started_at=run_started_at) + + +def main() -> None: + try: + _main_impl() + except SystemExit: + raise + except Exception as exc: + _console().print( + ui.fmt_internal_error( + exc, + issues_url=ISSUES_URL, + debug=_is_debug_enabled(), + ) + ) + raise SystemExit(ExitCode.INTERNAL_ERROR) from exc diff --git a/codeclone/surfaces/mcp/__init__.py b/codeclone/surfaces/mcp/__init__.py new file mode 100644 index 0000000..557317f --- /dev/null +++ b/codeclone/surfaces/mcp/__init__.py @@ -0,0 +1,4 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 diff --git a/codeclone/surfaces/mcp/__main__.py b/codeclone/surfaces/mcp/__main__.py new file mode 100644 index 0000000..75e1386 --- /dev/null +++ b/codeclone/surfaces/mcp/__main__.py @@ -0,0 +1,9 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from .server import main + +main() diff --git a/codeclone/surfaces/mcp/_session_baseline.py b/codeclone/surfaces/mcp/_session_baseline.py new file mode 100644 index 0000000..b10a93b --- /dev/null +++ b/codeclone/surfaces/mcp/_session_baseline.py @@ -0,0 +1,145 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from ...baseline import ( + Baseline, + BaselineStatus, + coerce_baseline_status, + current_python_tag, +) +from ...baseline.metrics_baseline import ( + MetricsBaseline, + MetricsBaselineStatus, + coerce_metrics_baseline_status, +) +from ...contracts import ExitCode +from ...contracts.errors import BaselineValidationError + + +@dataclass(frozen=True, slots=True) +class CloneBaselineState: + baseline: Baseline + loaded: bool + status: BaselineStatus + failure_code: ExitCode | None + trusted_for_diff: bool + updated_path: Path | None + warning_message: str | None = None + + +@dataclass(frozen=True, slots=True) +class MetricsBaselineState: + baseline: MetricsBaseline + loaded: bool + status: MetricsBaselineStatus + failure_code: ExitCode | None + trusted_for_diff: bool + warning_message: str | None = None + + +def resolve_clone_baseline_state( + *, + baseline_path: Path, + baseline_exists: bool, + max_baseline_size_mb: int, + shared_baseline_payload: dict[str, object] | None = None, +) -> CloneBaselineState: + baseline = Baseline(baseline_path) + if not baseline_exists: + return CloneBaselineState( + baseline=baseline, + loaded=False, + status=BaselineStatus.MISSING, + failure_code=None, + trusted_for_diff=False, + updated_path=None, + warning_message=None, + ) + + try: + if shared_baseline_payload is None: + baseline.load(max_size_bytes=max_baseline_size_mb * 1024 * 1024) + else: + baseline.load( + max_size_bytes=max_baseline_size_mb * 1024 * 1024, + preloaded_payload=shared_baseline_payload, + ) + baseline.verify_compatibility(current_python_tag=current_python_tag()) + except BaselineValidationError as exc: + status = coerce_baseline_status(exc.status) + return CloneBaselineState( + baseline=baseline, + loaded=False, + status=status, + failure_code=None, + trusted_for_diff=False, + updated_path=None, + warning_message=str(exc), + ) + + return CloneBaselineState( + baseline=baseline, + loaded=True, + status=BaselineStatus.OK, + failure_code=None, + trusted_for_diff=True, + updated_path=None, + warning_message=None, + ) + + +def resolve_metrics_baseline_state( + *, + metrics_baseline_path: Path, + metrics_baseline_exists: bool, + max_baseline_size_mb: int, + skip_metrics: bool, + shared_baseline_payload: dict[str, object] | None = None, +) -> MetricsBaselineState: + baseline = MetricsBaseline(metrics_baseline_path) + if skip_metrics or not metrics_baseline_exists: + return MetricsBaselineState( + baseline=baseline, + loaded=False, + status=MetricsBaselineStatus.MISSING, + failure_code=None, + trusted_for_diff=False, + warning_message=None, + ) + + try: + if shared_baseline_payload is None: + baseline.load(max_size_bytes=max_baseline_size_mb * 1024 * 1024) + else: + baseline.load( + max_size_bytes=max_baseline_size_mb * 1024 * 1024, + preloaded_payload=shared_baseline_payload, + ) + baseline.verify_compatibility(runtime_python_tag=current_python_tag()) + except BaselineValidationError as exc: + status = coerce_metrics_baseline_status(exc.status) + return MetricsBaselineState( + baseline=baseline, + loaded=False, + status=status, + failure_code=None, + trusted_for_diff=False, + warning_message=str(exc), + ) + + return MetricsBaselineState( + baseline=baseline, + loaded=True, + status=MetricsBaselineStatus.OK, + failure_code=None, + trusted_for_diff=True, + warning_message=None, + ) diff --git 
a/codeclone/surfaces/mcp/_session_finding_mixin.py b/codeclone/surfaces/mcp/_session_finding_mixin.py new file mode 100644 index 0000000..ff8bccb --- /dev/null +++ b/codeclone/surfaces/mcp/_session_finding_mixin.py @@ -0,0 +1,1557 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from types import TracebackType +from typing import Protocol + +from . import _session_helpers as _helpers +from ._session_shared import ( + _CHECK_TO_DIMENSION, + _CONFIDENCE_WEIGHT, + _DESIGN_CHECK_CONTEXT, + _EFFORT_WEIGHT, + _HOTLIST_REPORT_KEYS, + _NOVELTY_WEIGHT, + _RUNTIME_WEIGHT, + _SEVERITY_WEIGHT, + _VALID_ANALYSIS_MODES, + _VALID_CACHE_POLICIES, + _VALID_DETAIL_LEVELS, + _VALID_FINDING_FAMILIES, + _VALID_FINDING_NOVELTY, + _VALID_FINDING_SORT, + _VALID_HOTLIST_KINDS, + _VALID_SEVERITIES, + CATEGORY_COHESION, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CONFIDENCE_MEDIUM, + EFFORT_MODERATE, + FAMILY_CLONES, + FAMILY_DEAD_CODE, + FAMILY_DESIGN, + FAMILY_STRUCTURAL, + SOURCE_KIND_OTHER, + AnalysisMode, + CodeCloneMCPRunStore, + DetailLevel, + FindingFamilyFilter, + FindingNoveltyFilter, + FindingSort, + HotlistKind, + Mapping, + MCPAnalysisRequest, + MCPFindingNotFoundError, + MCPRunNotFoundError, + MCPRunRecord, + MCPServiceContractError, + OrderedDict, + Path, + Sequence, + _as_float, + _as_int, + _git_diff_lines_payload, + paginate, + resolve_finding_id, +) + + +class _StateLock(Protocol): + def __enter__(self) -> object: ... + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> bool | None: ... + + +class _MCPSessionFindingMixin: + _runs: CodeCloneMCPRunStore + _state_lock: _StateLock + _review_state: dict[str, OrderedDict[str, str | None]] + _last_gate_results: dict[str, dict[str, object]] + _spread_max_cache: dict[str, int] + + def _validate_analysis_request(self, request: MCPAnalysisRequest) -> None: + _helpers._validate_choice( + "analysis_mode", + request.analysis_mode, + _VALID_ANALYSIS_MODES, + ) + _helpers._validate_choice( + "cache_policy", + request.cache_policy, + _VALID_CACHE_POLICIES, + ) + if request.cache_policy == "refresh": + raise MCPServiceContractError( + "cache_policy='refresh' is not supported by the read-only " + "CodeClone MCP server. Use 'reuse' or 'off'." + ) + if request.analysis_mode == "clones_only" and request.coverage_xml is not None: + raise MCPServiceContractError( + "coverage_xml requires analysis_mode='full' because coverage join " + "depends on metrics-enabled analysis." + ) + + def _resolve_request_changed_paths( + self, + *, + root_path: Path, + changed_paths: Sequence[str], + git_diff_ref: str | None, + ) -> tuple[str, ...]: + if changed_paths and git_diff_ref is not None: + raise MCPServiceContractError( + "Provide changed_paths or git_diff_ref, not both." 
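+                # A git ref means "ask git for the scope"; explicit paths
+                # mean "trust the caller". Mixing both would be ambiguous.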
+ ) + if git_diff_ref is not None: + return self._git_diff_paths(root_path=root_path, git_diff_ref=git_diff_ref) + if not changed_paths: + return () + return self._normalize_changed_paths(root_path=root_path, paths=changed_paths) + + def _resolve_query_changed_paths( + self, + *, + record: MCPRunRecord, + changed_paths: Sequence[str], + git_diff_ref: str | None, + prefer_record_paths: bool = False, + ) -> tuple[str, ...]: + if changed_paths or git_diff_ref is not None: + return self._resolve_request_changed_paths( + root_path=record.root, + changed_paths=changed_paths, + git_diff_ref=git_diff_ref, + ) + if prefer_record_paths: + return record.changed_paths + return () + + def _normalize_changed_paths( + self, + *, + root_path: Path, + paths: Sequence[str], + ) -> tuple[str, ...]: + normalized: set[str] = set() + for raw_path in paths: + candidate = Path(str(raw_path)).expanduser() + if candidate.is_absolute(): + try: + relative = candidate.resolve().relative_to(root_path) + except (OSError, ValueError) as exc: + raise MCPServiceContractError( + f"Changed path '{raw_path}' is outside root '{root_path}'." + ) from exc + normalized.add(relative.as_posix()) + continue + cleaned = _helpers._normalize_relative_path(candidate.as_posix()) + if cleaned: + normalized.add(cleaned) + return tuple(sorted(normalized)) + + def _git_diff_paths( + self, + *, + root_path: Path, + git_diff_ref: str, + ) -> tuple[str, ...]: + lines = _git_diff_lines_payload( + root_path=root_path, + git_diff_ref=git_diff_ref, + ) + return self._normalize_changed_paths(root_path=root_path, paths=lines) + + def _path_filter_tuple(self, path: str | None) -> tuple[str, ...]: + if not path: + return () + cleaned = _helpers._normalize_relative_path(Path(path).as_posix()) + return (cleaned,) if cleaned else () + + def _previous_run_for_root(self, record: MCPRunRecord) -> MCPRunRecord | None: + previous: MCPRunRecord | None = None + for item in self._runs.records(): + if item.run_id == record.run_id: + return previous + if item.root == record.root: + previous = item + return None + + def _latest_compatible_record( + self, + *, + analysis_mode: AnalysisMode, + root_path: Path | None = None, + ) -> MCPRunRecord | None: + for item in reversed(self._runs.records()): + if root_path is not None and item.root != root_path: + continue + if _helpers._record_supports_analysis_mode( + item, + analysis_mode=analysis_mode, + ): + return item + return None + + def _resolve_granular_record( + self, + *, + run_id: str | None, + root: str | None, + analysis_mode: AnalysisMode, + ) -> MCPRunRecord: + if run_id is not None: + record = self._runs.get(run_id) + if _helpers._record_supports_analysis_mode( + record, + analysis_mode=analysis_mode, + ): + return record + raise MCPServiceContractError( + "Selected MCP run is not compatible with this check. " + f"Call analyze_repository(root='{record.root}', " + "analysis_mode='full') first." + ) + root_path = self._resolve_optional_root(root) + latest_record = self._latest_compatible_record( + analysis_mode=analysis_mode, + root_path=root_path, + ) + if latest_record is not None: + return latest_record + if root_path is not None: + raise MCPRunNotFoundError( + f"No compatible MCP analysis run is available for root: {root_path}. " + f"Call analyze_repository(root='{root_path}') or " + f"analyze_changed_paths(root='{root_path}', changed_paths=[...]) first." + ) + raise MCPRunNotFoundError( + "No compatible MCP analysis run is available. 
" + "Call analyze_repository(root='/path/to/repo') or " + "analyze_changed_paths(root='/path/to/repo', changed_paths=[...]) first." + ) + + def _resolve_optional_root(self, root: str | None) -> Path | None: + cleaned_root = "" if root is None else str(root).strip() + if not cleaned_root: + return None + return _helpers._resolve_root(cleaned_root) + + def _finding_id_maps( + self, + record: MCPRunRecord, + ) -> tuple[dict[str, str], dict[str, str]]: + canonical_ids = sorted( + str(finding.get("id", "")) + for finding in self._base_findings(record) + if str(finding.get("id", "")) + ) + base_ids = { + canonical_id: _helpers._base_short_finding_id(canonical_id) + for canonical_id in canonical_ids + } + grouped: dict[str, list[str]] = {} + for canonical_id, short_name in base_ids.items(): + grouped.setdefault(short_name, []).append(canonical_id) + canonical_to_short: dict[str, str] = {} + short_to_canonical: dict[str, str] = {} + for short_name, group in grouped.items(): + if len(group) == 1: + canonical_id = group[0] + canonical_to_short[canonical_id] = short_name + short_to_canonical[short_name] = canonical_id + continue + disambiguated_ids = _helpers._disambiguated_short_finding_ids(group) + for canonical_id, disambiguated in disambiguated_ids.items(): + canonical_to_short[canonical_id] = disambiguated + short_to_canonical[disambiguated] = canonical_id + return canonical_to_short, short_to_canonical + + def _short_finding_id( + self, + record: MCPRunRecord, + canonical_id: str, + ) -> str: + canonical_to_short, _short_to_canonical = self._finding_id_maps(record) + return canonical_to_short.get(canonical_id, canonical_id) + + def _resolve_canonical_finding_id( + self, + record: MCPRunRecord, + finding_id: str, + ) -> str: + canonical_to_short, short_to_canonical = self._finding_id_maps(record) + canonical = resolve_finding_id( + canonical_to_short=canonical_to_short, + short_to_canonical=short_to_canonical, + finding_id=finding_id, + ) + if canonical is not None: + return canonical + raise MCPFindingNotFoundError( + f"Finding id '{finding_id}' was not found in run " + f"'{_helpers._short_run_id(record.run_id)}'." 
+ ) + + def _base_findings(self, record: MCPRunRecord) -> list[dict[str, object]]: + report_document = record.report_document + findings = _helpers._as_mapping(report_document.get("findings")) + groups = _helpers._as_mapping(findings.get("groups")) + clone_groups = _helpers._as_mapping(groups.get(FAMILY_CLONES)) + return [ + *_helpers._dict_list(clone_groups.get("functions")), + *_helpers._dict_list(clone_groups.get("blocks")), + *_helpers._dict_list(clone_groups.get("segments")), + *_helpers._dict_list( + _helpers._as_mapping(groups.get(FAMILY_STRUCTURAL)).get("groups") + ), + *_helpers._dict_list( + _helpers._as_mapping(groups.get(FAMILY_DEAD_CODE)).get("groups") + ), + *_helpers._dict_list( + _helpers._as_mapping(groups.get(FAMILY_DESIGN)).get("groups") + ), + ] + + def _query_findings( + self, + *, + record: MCPRunRecord, + family: FindingFamilyFilter = "all", + category: str | None = None, + severity: str | None = None, + source_kind: str | None = None, + novelty: FindingNoveltyFilter = "all", + sort_by: FindingSort = "default", + detail_level: DetailLevel = "normal", + changed_paths: Sequence[str] = (), + exclude_reviewed: bool = False, + ) -> list[dict[str, object]]: + findings = self._base_findings(record) + max_spread_value = max( + (self._spread_value(finding) for finding in findings), + default=0, + ) + with self._state_lock: + self._spread_max_cache[record.run_id] = max_spread_value + filtered = [ + finding + for finding in findings + if self._matches_finding_filters( + finding=finding, + family=family, + category=category, + severity=severity, + source_kind=source_kind, + novelty=novelty, + ) + and ( + not changed_paths + or self._finding_touches_paths( + finding=finding, + changed_paths=changed_paths, + ) + ) + and (not exclude_reviewed or not self._finding_is_reviewed(record, finding)) + ] + remediation_map = { + str(finding.get("id", "")): self._remediation_for_finding(record, finding) + for finding in filtered + } + priority_map = { + str(finding.get("id", "")): self._priority_score( + record, + finding, + remediation=remediation_map[str(finding.get("id", ""))], + max_spread_value=max_spread_value, + ) + for finding in filtered + } + ordered = self._sort_findings( + record=record, + findings=filtered, + sort_by=sort_by, + priority_map=priority_map, + ) + return [ + self._decorate_finding( + record, + finding, + detail_level=detail_level, + remediation=remediation_map[str(finding.get("id", ""))], + priority_payload=priority_map[str(finding.get("id", ""))], + max_spread_value=max_spread_value, + ) + for finding in ordered + ] + + def _sort_findings( + self, + *, + record: MCPRunRecord, + findings: Sequence[Mapping[str, object]], + sort_by: FindingSort, + priority_map: Mapping[str, Mapping[str, object]] | None = None, + ) -> list[dict[str, object]]: + finding_rows = [dict(finding) for finding in findings] + if sort_by == "default": + return finding_rows + if sort_by == "severity": + finding_rows.sort( + key=lambda finding: ( + -_helpers._severity_rank(str(finding.get("severity", ""))), + str(finding.get("id", "")), + ) + ) + elif sort_by == "spread": + finding_rows.sort( + key=lambda finding: ( + -self._spread_value(finding), + -_as_float(finding.get("priority", 0.0), 0.0), + str(finding.get("id", "")), + ) + ) + else: + finding_rows.sort( + key=lambda finding: ( + -_as_float( + _helpers._as_mapping( + (priority_map or {}).get(str(finding.get("id", ""))) + ).get("score", 0.0), + 0.0, + ) + if priority_map is not None + else -_as_float( + self._priority_score(record, 
finding)["score"], + 0.0, + ), + -_helpers._severity_rank(str(finding.get("severity", ""))), + str(finding.get("id", "")), + ) + ) + return finding_rows + + def _decorate_finding( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + detail_level: DetailLevel, + remediation: Mapping[str, object] | None = None, + priority_payload: Mapping[str, object] | None = None, + max_spread_value: int | None = None, + ) -> dict[str, object]: + resolved_remediation = ( + remediation + if remediation is not None + else self._remediation_for_finding(record, finding) + ) + resolved_priority_payload = ( + dict(priority_payload) + if priority_payload is not None + else self._priority_score( + record, + finding, + remediation=resolved_remediation, + max_spread_value=max_spread_value, + ) + ) + payload = dict(finding) + payload["priority_score"] = resolved_priority_payload["score"] + payload["priority_factors"] = resolved_priority_payload["factors"] + payload["locations"] = self._locations_for_finding( + record, + finding, + include_uri=detail_level == "full", + ) + payload["html_anchor"] = f"finding-{finding.get('id', '')}" + if resolved_remediation is not None: + payload["remediation"] = resolved_remediation + return self._project_finding_detail( + record, + payload, + detail_level=detail_level, + ) + + def _project_finding_detail( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + detail_level: DetailLevel, + ) -> dict[str, object]: + if detail_level == "full": + full_payload = dict(finding) + full_payload["id"] = self._short_finding_id( + record, + str(finding.get("id", "")), + ) + return full_payload + payload: dict[str, object] = { + "id": self._short_finding_id(record, str(finding.get("id", ""))), + "kind": _helpers._finding_kind_label(finding), + "severity": str(finding.get("severity", "")), + "novelty": str(finding.get("novelty", "")), + "scope": _helpers._finding_source_kind(finding), + "count": _as_int(finding.get("count", 0), 0), + "spread": dict(_helpers._as_mapping(finding.get("spread"))), + "priority": round(_as_float(finding.get("priority_score", 0.0), 0.0), 2), + } + clone_type = str(finding.get("clone_type", "")).strip() + if clone_type: + payload["type"] = clone_type + locations = [ + _helpers._as_mapping(item) + for item in _helpers._as_sequence(finding.get("locations")) + ] + if detail_level == "summary": + remediation = _helpers._as_mapping(finding.get("remediation")) + if remediation: + payload["effort"] = str(remediation.get("effort", "")) + payload["locations"] = [ + summary_location + for summary_location in ( + _helpers._summary_location_string(location) + for location in locations + ) + if summary_location + ] + return payload + remediation = _helpers._as_mapping(finding.get("remediation")) + if remediation: + payload["remediation"] = _helpers._project_remediation( + remediation, + detail_level="normal", + ) + payload["locations"] = [ + projected + for projected in ( + _helpers._normal_location_payload(location) for location in locations + ) + if projected + ] + return payload + + def _finding_summary_card( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object]: + return self._finding_summary_card_payload( + record, + self._decorate_finding(record, finding, detail_level="full"), + ) + + def _finding_summary_card_payload( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object]: + return self._project_finding_detail(record, finding, detail_level="summary") + + def 
_comparison_finding_card( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object]: + summary_card = self._finding_summary_card(record, finding) + return { + "id": summary_card.get("id"), + "kind": summary_card.get("kind"), + "severity": summary_card.get("severity"), + } + + def _matches_finding_filters( + self, + *, + finding: Mapping[str, object], + family: FindingFamilyFilter, + category: str | None = None, + severity: str | None, + source_kind: str | None, + novelty: FindingNoveltyFilter, + ) -> bool: + finding_family = str(finding.get("family", "")).strip() + if family != "all" and finding_family != family: + return False + if ( + category is not None + and str(finding.get("category", "")).strip() != category + ): + return False + if ( + severity is not None + and str(finding.get("severity", "")).strip() != severity + ): + return False + dominant_kind = str( + _helpers._as_mapping(finding.get("source_scope")).get("dominant_kind", "") + ).strip() + if source_kind is not None and dominant_kind != source_kind: + return False + return novelty == "all" or str(finding.get("novelty", "")).strip() == novelty + + def _finding_touches_paths( + self, + *, + finding: Mapping[str, object], + changed_paths: Sequence[str], + ) -> bool: + normalized_paths = tuple(changed_paths) + for item in _helpers._as_sequence(finding.get("items")): + relative_path = str( + _helpers._as_mapping(item).get("relative_path", "") + ).strip() + if relative_path and _helpers._path_matches( + relative_path, + normalized_paths, + ): + return True + return False + + def _finding_is_reviewed( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> bool: + with self._state_lock: + review_map = self._review_state.get(record.run_id, OrderedDict()) + return str(finding.get("id", "")) in review_map + + def _include_hotspot_finding( + self, + *, + record: MCPRunRecord, + finding: Mapping[str, object], + changed_paths: Sequence[str], + exclude_reviewed: bool, + ) -> bool: + if changed_paths and not self._finding_touches_paths( + finding=finding, + changed_paths=changed_paths, + ): + return False + return not exclude_reviewed or not self._finding_is_reviewed(record, finding) + + def _priority_score( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + remediation: Mapping[str, object] | None = None, + max_spread_value: int | None = None, + ) -> dict[str, object]: + spread_weight = self._spread_weight( + record, + finding, + max_spread_value=max_spread_value, + ) + factors = { + "severity_weight": _SEVERITY_WEIGHT.get( + str(finding.get("severity", "")), + 0.2, + ), + "effort_weight": _EFFORT_WEIGHT.get( + ( + str(remediation.get("effort", EFFORT_MODERATE)) + if remediation is not None + else EFFORT_MODERATE + ), + 0.6, + ), + "novelty_weight": _NOVELTY_WEIGHT.get( + str(finding.get("novelty", "")), + 0.7, + ), + "runtime_weight": _RUNTIME_WEIGHT.get( + str( + _helpers._as_mapping(finding.get("source_scope")).get( + "dominant_kind", + "other", + ) + ), + 0.5, + ), + "spread_weight": spread_weight, + "confidence_weight": _CONFIDENCE_WEIGHT.get( + str(finding.get("confidence", CONFIDENCE_MEDIUM)), + 0.7, + ), + } + product = 1.0 + for value in factors.values(): + product *= max(_as_float(value, 0.01), 0.01) + score = product ** (1.0 / max(len(factors), 1)) + return { + "score": round(score, 4), + "factors": { + key: round(_as_float(value, 0.0), 4) for key, value in factors.items() + }, + } + + def _spread_weight( + self, + record: MCPRunRecord, + finding: Mapping[str, 
object], + *, + max_spread_value: int | None = None, + ) -> float: + spread_value = self._spread_value(finding) + if max_spread_value is None: + with self._state_lock: + max_spread_value = self._spread_max_cache.get(record.run_id) + if max_spread_value is None: + max_spread_value = max( + (self._spread_value(item) for item in self._base_findings(record)), + default=0, + ) + with self._state_lock: + self._spread_max_cache[record.run_id] = max_spread_value + max_value = max_spread_value + if max_value <= 0: + return 0.3 + return max(0.2, min(1.0, spread_value / max_value)) + + def _spread_value(self, finding: Mapping[str, object]) -> int: + spread = _helpers._as_mapping(finding.get("spread")) + files = _as_int(spread.get("files", 0), 0) + functions = _as_int(spread.get("functions", 0), 0) + count = _as_int(finding.get("count", 0), 0) + return max(files, functions, count, 1) + + def _locations_for_finding( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + *, + include_uri: bool = True, + ) -> list[dict[str, object]]: + locations: list[dict[str, object]] = [] + for item in _helpers._as_sequence(finding.get("items")): + item_map = _helpers._as_mapping(item) + relative_path = str(item_map.get("relative_path", "")).strip() + if not relative_path: + continue + line = _as_int(item_map.get("start_line", 0) or 0, 0) + end_line = _as_int(item_map.get("end_line", 0) or 0, 0) + symbol = str(item_map.get("qualname", item_map.get("module", ""))).strip() + location: dict[str, object] = { + "file": relative_path, + "line": line, + "end_line": end_line, + "symbol": symbol, + } + if include_uri: + absolute_path = (record.root / relative_path).resolve() + uri = absolute_path.as_uri() + if line > 0: + uri = f"{uri}#L{line}" + location["uri"] = uri + locations.append(location) + deduped: list[dict[str, object]] = [] + seen: set[tuple[str, int, str]] = set() + for location in locations: + key = ( + str(location.get("file", "")), + _as_int(location.get("line", 0), 0), + str(location.get("symbol", "")), + ) + if key not in seen: + seen.add(key) + deduped.append(location) + return deduped + + def _remediation_for_finding( + self, + record: MCPRunRecord, + finding: Mapping[str, object], + ) -> dict[str, object] | None: + suggestion = self._suggestion_for_finding(record, str(finding.get("id", ""))) + if suggestion is None: + return None + source_kind = str(getattr(suggestion, "source_kind", "other")) + spread_files = _as_int(getattr(suggestion, "spread_files", 0), 0) + spread_functions = _as_int(getattr(suggestion, "spread_functions", 0), 0) + title = str(getattr(suggestion, "title", "")).strip() + severity = str(finding.get("severity", "")).strip() + novelty = str(finding.get("novelty", "known")).strip() + count = _as_int( + getattr(suggestion, "fact_count", 0) or finding.get("count", 0) or 0, + 0, + ) + safe_refactor_shape = _helpers._safe_refactor_shape(suggestion) + effort = str(getattr(suggestion, "effort", EFFORT_MODERATE)) + confidence = str(getattr(suggestion, "confidence", CONFIDENCE_MEDIUM)) + risk_level = _helpers._risk_level_for_effort(effort) + return { + "effort": effort, + "priority": _as_float(getattr(suggestion, "priority", 0.0), 0.0), + "confidence": confidence, + "safe_refactor_shape": safe_refactor_shape, + "steps": list(getattr(suggestion, "steps", ())), + "risk_level": risk_level, + "why_now": _helpers._why_now_text( + title=title, + severity=severity, + novelty=novelty, + count=count, + source_kind=source_kind, + spread_files=spread_files, + spread_functions=spread_functions, + 
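+                # why_now condenses severity, novelty, spread and effort
+                # into a single human-readable justification string.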
effort=effort, + ), + "blast_radius": { + "files": spread_files, + "functions": spread_functions, + "is_production": source_kind == "production", + }, + } + + def _suggestion_for_finding( + self, + record: MCPRunRecord, + finding_id: str, + ) -> object | None: + for suggestion in record.suggestions: + if _helpers._suggestion_finding_id(suggestion) == finding_id: + return suggestion + return None + + def _hotspot_rows( + self, + *, + record: MCPRunRecord, + kind: HotlistKind, + detail_level: DetailLevel, + changed_paths: Sequence[str], + exclude_reviewed: bool, + ) -> list[dict[str, object]]: + findings = self._base_findings(record) + finding_index = {str(finding.get("id", "")): finding for finding in findings} + max_spread_value = max( + (self._spread_value(finding) for finding in findings), + default=0, + ) + with self._state_lock: + self._spread_max_cache[record.run_id] = max_spread_value + remediation_map = { + str(finding.get("id", "")): self._remediation_for_finding(record, finding) + for finding in findings + } + priority_map = { + str(finding.get("id", "")): self._priority_score( + record, + finding, + remediation=remediation_map[str(finding.get("id", ""))], + max_spread_value=max_spread_value, + ) + for finding in findings + } + derived = _helpers._as_mapping(record.report_document.get("derived")) + hotlists = _helpers._as_mapping(derived.get("hotlists")) + if kind == "highest_priority": + ordered_ids = [ + str(finding.get("id", "")) + for finding in self._sort_findings( + record=record, + findings=findings, + sort_by="priority", + priority_map=priority_map, + ) + ] + else: + hotlist_key = _HOTLIST_REPORT_KEYS.get(kind) + if hotlist_key is None: + return [] + ordered_ids = [ + str(item) + for item in _helpers._as_sequence(hotlists.get(hotlist_key)) + if str(item) + ] + rows: list[dict[str, object]] = [] + for finding_id in ordered_ids: + finding = finding_index.get(finding_id) + if finding is None or not self._include_hotspot_finding( + record=record, + finding=finding, + changed_paths=changed_paths, + exclude_reviewed=exclude_reviewed, + ): + continue + finding_id_key = str(finding.get("id", "")) + rows.append( + self._decorate_finding( + record, + finding, + detail_level=detail_level, + remediation=remediation_map[finding_id_key], + priority_payload=priority_map[finding_id_key], + max_spread_value=max_spread_value, + ) + ) + return rows + + def _granular_payload( + self, + *, + record: MCPRunRecord, + check: str, + items: Sequence[Mapping[str, object]], + detail_level: DetailLevel, + max_results: int, + path: str | None, + threshold_context: Mapping[str, object] | None = None, + ) -> dict[str, object]: + bounded_items = [dict(item) for item in items[: max(1, max_results)]] + full_health = dict(_helpers._as_mapping(record.summary.get("health"))) + dimensions = _helpers._as_mapping(full_health.get("dimensions")) + relevant_dimension = _CHECK_TO_DIMENSION.get(check) + slim_dimensions = ( + {relevant_dimension: dimensions.get(relevant_dimension)} + if relevant_dimension and relevant_dimension in dimensions + else dict(dimensions) + ) + payload: dict[str, object] = { + "run_id": _helpers._short_run_id(record.run_id), + "check": check, + "detail_level": detail_level, + "path": path, + "returned": len(bounded_items), + "total": len(items), + "health": { + "score": full_health.get("score"), + "grade": full_health.get("grade"), + "dimensions": slim_dimensions, + }, + "items": bounded_items, + } + if threshold_context: + payload["threshold_context"] = dict(threshold_context) + return 
payload + + def _design_threshold_context( + self, + *, + record: MCPRunRecord, + check: str, + path: str | None, + items: Sequence[Mapping[str, object]], + requested_min: int | None = None, + ) -> dict[str, object] | None: + if items: + return None + spec = _DESIGN_CHECK_CONTEXT.get(check) + if spec is None: + return None + category = str(spec["category"]) + metric = str(spec["metric"]) + operator = str(spec["operator"]) + normalized_path = _helpers._normalize_relative_path(path or "") + metrics = _helpers._as_mapping(record.report_document.get("metrics")) + families = _helpers._as_mapping(metrics.get("families")) + family = _helpers._as_mapping(families.get(category)) + metric_items = [ + _helpers._as_mapping(item) + for item in _helpers._as_sequence(family.get("items")) + if not normalized_path + or _helpers._metric_item_matches_path( + _helpers._as_mapping(item), + normalized_path, + ) + ] + if not metric_items: + return None + values = [_as_int(item.get(metric), 0) for item in metric_items] + finding_threshold = self._design_finding_threshold( + record=record, + check=check, + ) + threshold = finding_threshold + threshold_kind = "finding_threshold" + if requested_min is not None and requested_min > finding_threshold: + threshold = requested_min + threshold_kind = "requested_min" + highest_below = _helpers._highest_below_threshold( + values=values, + operator=operator, + threshold=threshold, + ) + payload: dict[str, object] = { + "metric": metric, + "threshold": threshold, + "threshold_kind": threshold_kind, + "measured_units": len(metric_items), + } + if threshold_kind != "finding_threshold": + payload["finding_threshold"] = finding_threshold + if highest_below is not None: + payload["highest_below_threshold"] = highest_below + return payload + + def _design_finding_threshold( + self, + *, + record: MCPRunRecord, + check: str, + ) -> int: + spec = _DESIGN_CHECK_CONTEXT[check] + category = str(spec["category"]) + default_threshold = _as_int(spec["default_threshold"]) + findings = _helpers._as_mapping(record.report_document.get("findings")) + thresholds = _helpers._as_mapping( + _helpers._as_mapping(findings.get("thresholds")).get("design_findings") + ) + threshold_payload = _helpers._as_mapping(thresholds.get(category)) + if threshold_payload: + return _as_int(threshold_payload.get("value"), default_threshold) + request_value = { + "complexity": record.request.complexity_threshold, + "coupling": record.request.coupling_threshold, + "cohesion": record.request.cohesion_threshold, + }.get(check) + return _as_int(request_value, default_threshold) + + def _triage_suggestion_rows(self, record: MCPRunRecord) -> list[dict[str, object]]: + derived = _helpers._as_mapping(record.report_document.get("derived")) + canonical_rows = _helpers._dict_list(derived.get("suggestions")) + suggestion_source_kinds = { + _helpers._suggestion_finding_id( + suggestion + ): _helpers._normalized_source_kind( + getattr(suggestion, "source_kind", SOURCE_KIND_OTHER) + ) + for suggestion in record.suggestions + } + rows: list[dict[str, object]] = [] + for row in canonical_rows: + canonical_finding_id = str(row.get("finding_id", "")) + action = _helpers._as_mapping(row.get("action")) + try: + finding_id = self._short_finding_id( + record, + self._resolve_canonical_finding_id(record, canonical_finding_id), + ) + except MCPFindingNotFoundError: + finding_id = _helpers._base_short_finding_id(canonical_finding_id) + rows.append( + { + "id": f"suggestion:{finding_id}", + "finding_id": finding_id, + "title": 
str(row.get("title", "")), + "summary": str(row.get("summary", "")), + "effort": str(action.get("effort", "")), + "steps": list(_helpers._as_sequence(action.get("steps"))), + "source_kind": suggestion_source_kinds.get( + canonical_finding_id, + SOURCE_KIND_OTHER, + ), + } + ) + return rows + + def list_findings( + self, + *, + run_id: str | None = None, + family: FindingFamilyFilter = "all", + category: str | None = None, + severity: str | None = None, + source_kind: str | None = None, + novelty: FindingNoveltyFilter = "all", + sort_by: FindingSort = "default", + detail_level: DetailLevel = "summary", + changed_paths: Sequence[str] = (), + git_diff_ref: str | None = None, + exclude_reviewed: bool = False, + offset: int = 0, + limit: int = 50, + max_results: int | None = None, + ) -> dict[str, object]: + validated_family = _helpers._validate_choice( + "family", + family, + _VALID_FINDING_FAMILIES, + ) + validated_novelty = _helpers._validate_choice( + "novelty", + novelty, + _VALID_FINDING_NOVELTY, + ) + validated_sort = _helpers._validate_choice( + "sort_by", + sort_by, + _VALID_FINDING_SORT, + ) + validated_detail = _helpers._validate_choice( + "detail_level", + detail_level, + _VALID_DETAIL_LEVELS, + ) + validated_severity = _helpers._validate_optional_choice( + "severity", + severity, + _VALID_SEVERITIES, + ) + record = self._runs.get(run_id) + paths_filter = self._resolve_query_changed_paths( + record=record, + changed_paths=changed_paths, + git_diff_ref=git_diff_ref, + ) + normalized_limit = max( + 1, + min(max_results if max_results is not None else limit, 200), + ) + filtered = self._query_findings( + record=record, + family=validated_family, + category=category, + severity=validated_severity, + source_kind=source_kind, + novelty=validated_novelty, + sort_by=validated_sort, + detail_level=validated_detail, + changed_paths=paths_filter, + exclude_reviewed=exclude_reviewed, + ) + page = paginate( + filtered, + offset=offset, + limit=normalized_limit, + max_limit=200, + ) + return { + "run_id": _helpers._short_run_id(record.run_id), + "detail_level": validated_detail, + "sort_by": validated_sort, + "changed_paths": list(paths_filter), + "offset": page.offset, + "limit": page.limit, + "returned": len(page.items), + "total": page.total, + "next_offset": page.next_offset, + "items": page.items, + } + + def get_finding( + self, + *, + finding_id: str, + run_id: str | None = None, + detail_level: DetailLevel = "normal", + ) -> dict[str, object]: + record = self._runs.get(run_id) + validated_detail = _helpers._validate_choice( + "detail_level", + detail_level, + _VALID_DETAIL_LEVELS, + ) + canonical_id = self._resolve_canonical_finding_id(record, finding_id) + for finding in self._base_findings(record): + if str(finding.get("id")) == canonical_id: + return self._decorate_finding( + record, + finding, + detail_level=validated_detail, + ) + raise MCPFindingNotFoundError( + f"Finding id '{finding_id}' was not found in run " + f"'{_helpers._short_run_id(record.run_id)}'." 
+        )
+
+    def _service_get_finding(
+        self,
+        *,
+        finding_id: str,
+        run_id: str | None = None,
+        detail_level: DetailLevel = "normal",
+    ) -> dict[str, object]:
+        return self.get_finding(
+            finding_id=finding_id,
+            run_id=run_id,
+            detail_level=detail_level,
+        )
+
+    def get_remediation(
+        self,
+        *,
+        finding_id: str,
+        run_id: str | None = None,
+        detail_level: DetailLevel = "normal",
+    ) -> dict[str, object]:
+        validated_detail = _helpers._validate_choice(
+            "detail_level",
+            detail_level,
+            _VALID_DETAIL_LEVELS,
+        )
+        record = self._runs.get(run_id)
+        canonical_id = self._resolve_canonical_finding_id(record, finding_id)
+        finding = self._service_get_finding(
+            finding_id=canonical_id,
+            run_id=record.run_id,
+            detail_level="full",
+        )
+        remediation = _helpers._as_mapping(finding.get("remediation"))
+        if not remediation:
+            raise MCPFindingNotFoundError(
+                f"Finding id '{finding_id}' does not expose remediation guidance."
+            )
+        return {
+            "run_id": _helpers._short_run_id(record.run_id),
+            "finding_id": self._short_finding_id(record, canonical_id),
+            "detail_level": validated_detail,
+            "remediation": _helpers._project_remediation(
+                remediation,
+                detail_level=validated_detail,
+            ),
+        }
+
+    def list_hotspots(
+        self,
+        *,
+        kind: HotlistKind,
+        run_id: str | None = None,
+        detail_level: DetailLevel = "summary",
+        changed_paths: Sequence[str] = (),
+        git_diff_ref: str | None = None,
+        exclude_reviewed: bool = False,
+        limit: int = 10,
+        max_results: int | None = None,
+    ) -> dict[str, object]:
+        validated_kind = _helpers._validate_choice("kind", kind, _VALID_HOTLIST_KINDS)
+        validated_detail = _helpers._validate_choice(
+            "detail_level",
+            detail_level,
+            _VALID_DETAIL_LEVELS,
+        )
+        record = self._runs.get(run_id)
+        paths_filter = self._resolve_query_changed_paths(
+            record=record,
+            changed_paths=changed_paths,
+            git_diff_ref=git_diff_ref,
+        )
+        rows = self._hotspot_rows(
+            record=record,
+            kind=validated_kind,
+            detail_level=validated_detail,
+            changed_paths=paths_filter,
+            exclude_reviewed=exclude_reviewed,
+        )
+        normalized_limit = max(
+            1,
+            min(max_results if max_results is not None else limit, 50),
+        )
+        return {
+            "run_id": _helpers._short_run_id(record.run_id),
+            "kind": validated_kind,
+            "detail_level": validated_detail,
+            "changed_paths": list(paths_filter),
+            "returned": min(len(rows), normalized_limit),
+            "total": len(rows),
+            "items": [
+                dict(_helpers._as_mapping(item)) for item in rows[:normalized_limit]
+            ],
+        }
+
+    def mark_finding_reviewed(
+        self,
+        *,
+        finding_id: str,
+        run_id: str | None = None,
+        note: str | None = None,
+    ) -> dict[str, object]:
+        record = self._runs.get(run_id)
+        canonical_id = self._resolve_canonical_finding_id(record, finding_id)
+        self._service_get_finding(
+            finding_id=canonical_id,
+            run_id=record.run_id,
+            detail_level="normal",
+        )
+        with self._state_lock:
+            review_map = self._review_state.setdefault(record.run_id, OrderedDict())
+            review_map[canonical_id] = (
+                note.strip() if isinstance(note, str) and note.strip() else None
+            )
+            review_map.move_to_end(canonical_id)
+            # Snapshot the stored note and count while still holding the
+            # lock so the response payload cannot race with concurrent
+            # review updates.
+            stored_note = review_map[canonical_id]
+            reviewed_count = len(review_map)
+        return {
+            "run_id": _helpers._short_run_id(record.run_id),
+            "finding_id": self._short_finding_id(record, canonical_id),
+            "reviewed": True,
+            "note": stored_note,
+            "reviewed_count": reviewed_count,
+        }
+
+    def list_reviewed_findings(
+        self,
+        *,
+        run_id: str | None = None,
+    ) -> dict[str, object]:
+        record = self._runs.get(run_id)
+        with self._state_lock:
+            review_items = tuple(
+                self._review_state.get(record.run_id, OrderedDict()).items()
+            )
+        items =
[] + for finding_id, note in review_items: + try: + finding = self._service_get_finding( + finding_id=finding_id, + run_id=record.run_id, + detail_level="full", + ) + except MCPFindingNotFoundError: + continue + items.append( + { + "finding_id": self._short_finding_id(record, finding_id), + "note": note, + "finding": self._project_finding_detail( + record, + finding, + detail_level="summary", + ), + } + ) + return { + "run_id": _helpers._short_run_id(record.run_id), + "reviewed_count": len(items), + "items": items, + } + + def check_complexity( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + min_complexity: int | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + validated_detail = _helpers._validate_choice( + "detail_level", + detail_level, + _VALID_DETAIL_LEVELS, + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="full", + ) + findings = self._query_findings( + record=record, + family="design", + category=CATEGORY_COMPLEXITY, + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + if min_complexity is not None: + findings = [ + finding + for finding in findings + if _as_int( + _helpers._as_mapping(finding.get("facts")).get( + "cyclomatic_complexity", + 0, + ) + ) + >= min_complexity + ] + return self._granular_payload( + record=record, + check="complexity", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + threshold_context=self._design_threshold_context( + record=record, + check="complexity", + path=path, + items=findings, + requested_min=min_complexity, + ), + ) + + def check_clones( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + clone_type: str | None = None, + source_kind: str | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + validated_detail = _helpers._validate_choice( + "detail_level", + detail_level, + _VALID_DETAIL_LEVELS, + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="clones_only", + ) + findings = self._query_findings( + record=record, + family="clone", + source_kind=source_kind, + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + if clone_type is not None: + findings = [ + finding + for finding in findings + if str(finding.get("clone_type", "")).strip() == clone_type + ] + return self._granular_payload( + record=record, + check="clones", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + ) + + def check_coupling( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + return self._check_design_metric( + run_id=run_id, + root=root, + path=path, + max_results=max_results, + detail_level=detail_level, + category=CATEGORY_COUPLING, + check="coupling", + ) + + def check_cohesion( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + return self._check_design_metric( + run_id=run_id, + root=root, + path=path, + max_results=max_results, + detail_level=detail_level, + category=CATEGORY_COHESION, + check="cohesion", + ) + + def 
_check_design_metric( + self, + *, + run_id: str | None, + root: str | None, + path: str | None, + max_results: int, + detail_level: DetailLevel, + category: str, + check: str, + ) -> dict[str, object]: + validated_detail = _helpers._validate_choice( + "detail_level", + detail_level, + _VALID_DETAIL_LEVELS, + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="full", + ) + findings = self._query_findings( + record=record, + family="design", + category=category, + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + return self._granular_payload( + record=record, + check=check, + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + threshold_context=self._design_threshold_context( + record=record, + check=check, + path=path, + items=findings, + ), + ) + + def check_dead_code( + self, + *, + run_id: str | None = None, + root: str | None = None, + path: str | None = None, + min_severity: str | None = None, + max_results: int = 10, + detail_level: DetailLevel = "summary", + ) -> dict[str, object]: + validated_detail = _helpers._validate_choice( + "detail_level", + detail_level, + _VALID_DETAIL_LEVELS, + ) + validated_min_severity = _helpers._validate_optional_choice( + "min_severity", + min_severity, + _VALID_SEVERITIES, + ) + record = self._resolve_granular_record( + run_id=run_id, + root=root, + analysis_mode="full", + ) + findings = self._query_findings( + record=record, + family="dead_code", + detail_level=validated_detail, + changed_paths=self._path_filter_tuple(path), + sort_by="priority", + ) + if validated_min_severity is not None: + findings = [ + finding + for finding in findings + if _helpers._severity_rank(str(finding.get("severity", ""))) + >= _helpers._severity_rank(validated_min_severity) + ] + return self._granular_payload( + record=record, + check="dead_code", + items=findings, + detail_level=validated_detail, + max_results=max_results, + path=path, + ) diff --git a/codeclone/surfaces/mcp/_session_helpers.py b/codeclone/surfaces/mcp/_session_helpers.py new file mode 100644 index 0000000..a9c902d --- /dev/null +++ b/codeclone/surfaces/mcp/_session_helpers.py @@ -0,0 +1,919 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from ...cache.store import Cache +from ...contracts import REPORT_SCHEMA_VERSION +from ...domain.findings import ( + CATEGORY_CLONE, + CATEGORY_COHESION, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_DEAD_CODE, + CATEGORY_DEPENDENCY, + CATEGORY_STRUCTURAL, + FAMILY_CLONE, + FAMILY_DEAD_CODE, +) +from ...domain.quality import ( + EFFORT_EASY, + EFFORT_HARD, + EFFORT_MODERATE, + SEVERITY_CRITICAL, + SEVERITY_INFO, + SEVERITY_WARNING, +) +from ...domain.source_scope import ( + SOURCE_KIND_ORDER, + SOURCE_KIND_OTHER, +) +from ...models import MetricsDiff +from ._session_runtime import resolve_cache_path +from ._session_shared import ( + _COMPACT_ITEM_EMPTY_VALUES, + _COMPACT_ITEM_PATH_KEYS, + _SHORT_RUN_ID_LENGTH, + _SOURCE_KIND_BREAKDOWN_ORDER, + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_MIN_LOC, + DEFAULT_MIN_STMT, + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, + AnalysisMode, + CachePolicy, + ChoiceT, + DetailLevel, + FreshnessKind, + Iterable, + Mapping, + MCPAnalysisRequest, + MCPRunRecord, + MCPServiceContractError, + MCPServiceError, + MetricsDetailFamily, + Namespace, + Path, + Sequence, + _as_int, + _base_short_finding_id_payload, + _disambiguated_clone_short_ids_payload, + _disambiguated_short_finding_id_payload, + _leaf_symbol_name_payload, + _load_report_document_payload, + _suggestion_finding_id_payload, + _summarize_metrics_diff, +) +from .payloads import short_id + + +def _summary_health_payload(summary: Mapping[str, object]) -> dict[str, object]: + if str(summary.get("analysis_mode", "")) == "clones_only": + return {"available": False, "reason": "metrics_skipped"} + health = dict(_as_mapping(summary.get("health"))) + if health: + return health + return {"available": False, "reason": "unavailable"} + + +def _summary_health_score(summary: Mapping[str, object]) -> int | None: + health = _summary_health_payload(summary) + if health.get("available") is False: + return None + return _as_int(health.get("score", 0), 0) + + +def _summary_health_delta(summary: Mapping[str, object]) -> int | None: + if _summary_health_payload(summary).get("available") is False: + return None + metrics_diff = _as_mapping(summary.get("metrics_diff")) + return _as_int(metrics_diff.get("health_delta", 0), 0) + + +def _severity_rank(severity: str) -> int: + return { + SEVERITY_CRITICAL: 3, + SEVERITY_WARNING: 2, + SEVERITY_INFO: 1, + }.get(severity, 0) + + +def _validate_choice( + name: str, + value: ChoiceT, + allowed: Sequence[str] | frozenset[str], +) -> ChoiceT: + if value not in allowed: + allowed_list = ", ".join(sorted(allowed)) + raise MCPServiceContractError( + f"Invalid value for {name}: {value!r}. Expected one of: {allowed_list}." 
+ ) + return value + + +def _validate_optional_choice( + name: str, + value: ChoiceT | None, + allowed: Sequence[str] | frozenset[str], +) -> ChoiceT | None: + if value is None: + return None + return _validate_choice(name, value, allowed) + + +def _metrics_detail_family(value: str | None) -> MetricsDetailFamily | None: + match value: + case "complexity": + return "complexity" + case "coupling": + return "coupling" + case "cohesion": + return "cohesion" + case "coverage_adoption": + return "coverage_adoption" + case "coverage_join": + return "coverage_join" + case "dependencies": + return "dependencies" + case "dead_code": + return "dead_code" + case "api_surface": + return "api_surface" + case "security_surfaces": + return "security_surfaces" + case "god_modules" | "overloaded_modules": + return "overloaded_modules" + case "health": + return "health" + case _: + return None + + +def _dict_rows(value: object) -> list[dict[str, object]]: + if not isinstance(value, Sequence) or isinstance(value, (str, bytes, bytearray)): + return [] + return [dict(item) for item in value if isinstance(item, Mapping)] + + +def _string_rows(value: object) -> list[str]: + if not isinstance(value, Sequence) or isinstance(value, (str, bytes, bytearray)): + return [] + return [str(item) for item in value if isinstance(item, str)] + + +def _dict_list(value: object) -> list[dict[str, object]]: + return [dict(_as_mapping(item)) for item in _as_sequence(value)] + + +def _as_mapping(value: object) -> Mapping[str, object]: + return value if isinstance(value, Mapping) else {} + + +def _as_sequence(value: object) -> Sequence[object]: + if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)): + return value + return () + + +def _short_run_id(run_id: str) -> str: + return short_id(run_id, length=_SHORT_RUN_ID_LENGTH) + + +def _normalize_relative_path(path: str) -> str: + cleaned = path.strip() + if cleaned == ".": + return "" + if cleaned.startswith("./"): + cleaned = cleaned[2:] + cleaned = cleaned.rstrip("/") + if ".." in Path(cleaned).parts: + raise MCPServiceContractError(f"path traversal not allowed: {path}") + return cleaned + + +def _path_matches(relative_path: str, changed_paths: Sequence[str]) -> bool: + return any( + relative_path == candidate or relative_path.startswith(candidate + "/") + for candidate in changed_paths + ) + + +def _record_supports_analysis_mode( + record: MCPRunRecord, + *, + analysis_mode: AnalysisMode, +) -> bool: + record_mode = record.request.analysis_mode + if analysis_mode == "clones_only": + return record_mode in {"clones_only", "full"} + return record_mode == "full" + + +def _resolve_root(root: str | None) -> Path: + if not isinstance(root, str) or not root.strip(): + raise MCPServiceContractError( + "CodeClone MCP analyze_repository requires an absolute repository root." + ) + root_path = Path(root).expanduser() + if not root_path.is_absolute(): + raise MCPServiceContractError( + "CodeClone MCP analyze_repository requires an absolute repository root." + ) + try: + resolved = root_path.resolve() + except OSError as exc: + raise MCPServiceContractError( + f"Unable to resolve repository root '{root}': {exc}" + ) from exc + if not resolved.exists(): + raise MCPServiceContractError(f"Repository root '{resolved}' does not exist.") + if not resolved.is_dir(): + raise MCPServiceContractError( + f"Repository root '{resolved}' is not a directory." 
+ ) + return resolved + + +def _resolve_optional_path(value: str, root_path: Path) -> Path: + candidate = Path(value).expanduser() + resolved = candidate if candidate.is_absolute() else root_path / candidate + try: + return resolved.resolve() + except OSError as exc: + raise MCPServiceContractError( + f"Invalid path '{value}' relative to '{root_path}': {exc}" + ) from exc + + +def _base_short_finding_id(canonical_id: str) -> str: + return _base_short_finding_id_payload(canonical_id) + + +def _disambiguated_short_finding_id(canonical_id: str) -> str: + return _disambiguated_short_finding_id_payload(canonical_id) + + +def _disambiguated_short_finding_ids( + canonical_ids: Sequence[str], +) -> dict[str, str]: + clone_ids = [ + canonical_id + for canonical_id in canonical_ids + if canonical_id.startswith("clone:") + ] + if len(clone_ids) == len(canonical_ids): + clone_short_ids = _disambiguated_clone_short_ids_payload(clone_ids) + if len(set(clone_short_ids.values())) == len(clone_short_ids): + return clone_short_ids + return { + canonical_id: _disambiguated_short_finding_id(canonical_id) + for canonical_id in canonical_ids + } + + +def _leaf_symbol_name(value: object) -> str: + return _leaf_symbol_name_payload(value) + + +def _finding_kind_label(finding: Mapping[str, object]) -> str: + family = str(finding.get("family", "")).strip() + kind = str(finding.get("kind", finding.get("category", ""))).strip() + if family == FAMILY_CLONE: + clone_kind = str( + finding.get("clone_kind", finding.get("category", kind)) + ).strip() + return f"{clone_kind}_clone" if clone_kind else "clone" + if family == FAMILY_DEAD_CODE: + return "dead_code" + return kind or family + + +def _summary_location_string(location: Mapping[str, object]) -> str: + path = str(location.get("file", "")).strip() + line = _as_int(location.get("line", 0), 0) + if not path: + return "" + return f"{path}:{line}" if line > 0 else path + + +def _normal_location_payload(location: Mapping[str, object]) -> dict[str, object]: + path = str(location.get("file", "")).strip() + if not path: + return {} + payload: dict[str, object] = { + "path": path, + "line": _as_int(location.get("line", 0), 0), + "end_line": _as_int(location.get("end_line", 0), 0), + } + symbol = _leaf_symbol_name(location.get("symbol")) + if symbol: + payload["symbol"] = symbol + return payload + + +def _suggestion_finding_id(suggestion: object) -> str: + return _suggestion_finding_id_payload(suggestion) + + +def _project_remediation( + remediation: Mapping[str, object], + *, + detail_level: DetailLevel, +) -> dict[str, object]: + if detail_level == "full": + return dict(remediation) + projected = { + "effort": remediation.get("effort"), + "risk": remediation.get("risk_level"), + "shape": remediation.get("safe_refactor_shape"), + "why_now": remediation.get("why_now"), + } + if detail_level == "summary": + return projected + projected["steps"] = list(_as_sequence(remediation.get("steps"))) + return projected + + +def _safe_refactor_shape(suggestion: object) -> str: + category = str(getattr(suggestion, "category", "")).strip() + clone_type = str(getattr(suggestion, "clone_type", "")).strip() + title = str(getattr(suggestion, "title", "")).strip() + if category == CATEGORY_CLONE and clone_type == "Type-1": + return "Keep one canonical implementation and route callers through it." + if category == CATEGORY_CLONE and clone_type == "Type-2": + return "Extract shared implementation with explicit parameters." 
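+    # Remaining cases below: block-level clone titles, then one safe
+    # refactor shape per design category, with the final return acting as
+    # the generic fallback.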
+    if category == CATEGORY_CLONE and "Block" in title:
+        return "Extract the repeated statement sequence into a helper."
+    if category == CATEGORY_STRUCTURAL:
+        return "Extract the repeated branch family into a named helper."
+    if category == CATEGORY_COMPLEXITY:
+        return "Split the function into smaller named steps."
+    if category == CATEGORY_COUPLING:
+        return "Isolate responsibilities and invert unnecessary dependencies."
+    if category == CATEGORY_COHESION:
+        return "Split the class by responsibility boundary."
+    if category == CATEGORY_DEAD_CODE:
+        return "Delete the unused symbol or document intentional reachability."
+    if category == CATEGORY_DEPENDENCY:
+        return "Break the cycle by moving shared abstractions to a lower layer."
+    return "Extract the repeated logic into a shared, named abstraction."
+
+
+def _risk_level_for_effort(effort: str) -> str:
+    return {
+        EFFORT_EASY: "low",
+        EFFORT_MODERATE: "medium",
+        EFFORT_HARD: "high",
+    }.get(effort, "medium")
+
+
+def _why_now_text(
+    *,
+    title: str,
+    severity: str,
+    novelty: str,
+    count: int,
+    source_kind: str,
+    spread_files: int,
+    spread_functions: int,
+    effort: str,
+) -> str:
+    novelty_text = "new regression" if novelty == "new" else "known debt"
+    context = (
+        "production code"
+        if source_kind == "production"
+        else source_kind or "mixed scope"
+    )
+    spread_text = (
+        f"{spread_files} file{'s' if spread_files != 1 else ''} / "
+        f"{spread_functions} function{'s' if spread_functions != 1 else ''}"
+    )
+    count_text = (
+        f"{count} instance{'s' if count != 1 else ''}"
+        if count > 0
+        else "localized issue"
+    )
+    return (
+        f"{severity.upper()} {title} in {context} — {count_text}, "
+        f"{spread_text}, {effort} fix, {novelty_text}."
+    )
+
+
+def _highest_below_threshold(
+    *,
+    values: Sequence[int],
+    operator: str,
+    threshold: int,
+) -> int | None:
+    # Return the largest measured value that still passes the threshold, so
+    # threshold context can show how close the code sits to the gate.
+    if operator == ">":
+        below = [value for value in values if value <= threshold]
+    elif operator == ">=":
+        below = [value for value in values if value < threshold]
+    else:
+        return None
+    return max(below) if below else None
+
+
+def _normalized_source_kind(value: object) -> str:
+    normalized = str(value).strip().lower()
+    if normalized in SOURCE_KIND_ORDER:
+        return normalized
+    return SOURCE_KIND_OTHER
+
+
+def _finding_source_kind(finding: Mapping[str, object]) -> str:
+    source_scope = _as_mapping(finding.get("source_scope"))
+    return _normalized_source_kind(source_scope.get("dominant_kind"))
+
+
+def _source_kind_breakdown(source_kinds: Iterable[object]) -> dict[str, int]:
+    breakdown = dict.fromkeys(_SOURCE_KIND_BREAKDOWN_ORDER, 0)
+    for value in source_kinds:
+        breakdown[_normalized_source_kind(value)] += 1
+    return breakdown
+
+
+def _metric_item_matches_path(item: Mapping[str, object], normalized_path: str) -> bool:
+    path_value = (
+        str(item.get("relative_path", "")).strip()
+        or str(item.get("path", "")).strip()
+        or str(item.get("filepath", "")).strip()
+        or str(item.get("file", "")).strip()
+    )
+    if not path_value:
+        return False
+    return _path_matches(path_value, (normalized_path,))
+
+
+def _comparison_settings(
+    *,
+    args: Namespace,
+    request: MCPAnalysisRequest,
+) -> tuple[object, ...]:
+    return (
+        request.analysis_mode,
+        _as_int(args.min_loc, DEFAULT_MIN_LOC),
+        _as_int(args.min_stmt, DEFAULT_MIN_STMT),
+        _as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC),
+        _as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT),
+        _as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC),
+        _as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT),
+        _as_int(
+            args.design_complexity_threshold,
+            DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD,
+        ),
+        _as_int(
+            args.design_coupling_threshold,
+
DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + _as_int( + args.design_cohesion_threshold, + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + ) + + +def _comparison_scope( + *, + before: MCPRunRecord, + after: MCPRunRecord, +) -> dict[str, object]: + same_root = before.root == after.root + same_analysis_settings = before.comparison_settings == after.comparison_settings + if same_root and same_analysis_settings: + reason = "comparable" + elif not same_root and not same_analysis_settings: + reason = "different_root_and_analysis_settings" + elif not same_root: + reason = "different_root" + else: + reason = "different_analysis_settings" + return { + "comparable": same_root and same_analysis_settings, + "same_root": same_root, + "same_analysis_settings": same_analysis_settings, + "reason": reason, + } + + +def _changed_verdict( + *, + changed_projection: Mapping[str, object], + health_delta: int | None, +) -> str: + if _as_int(changed_projection.get("new", 0), 0) > 0 or ( + health_delta is not None and health_delta < 0 + ): + return "regressed" + if ( + _as_int(changed_projection.get("total", 0), 0) == 0 + and health_delta is not None + and health_delta > 0 + ): + return "improved" + return "stable" + + +def _comparison_verdict( + *, + regressions: int, + improvements: int, + health_delta: int | None, +) -> str: + has_negative_signal = regressions > 0 or ( + health_delta is not None and health_delta < 0 + ) + has_positive_signal = improvements > 0 or ( + health_delta is not None and health_delta > 0 + ) + if has_negative_signal and has_positive_signal: + return "mixed" + if has_negative_signal: + return "regressed" + if has_positive_signal: + return "improved" + return "stable" + + +def _comparison_summary_text( + *, + comparable: bool, + comparability_reason: str, + regressions: int, + improvements: int, + health_delta: int | None, +) -> str: + if not comparable: + reason_text = { + "different_root": "different roots", + "different_analysis_settings": "different analysis settings", + "different_root_and_analysis_settings": ( + "different roots and analysis settings" + ), + }.get(comparability_reason, "incomparable runs") + return f"Finding and run health deltas omitted ({reason_text})" + if health_delta is None: + return ( + f"{improvements} findings resolved, {regressions} new regressions; " + "run health delta omitted (metrics unavailable)" + ) + return ( + f"{improvements} findings resolved, {regressions} new regressions, " + f"run health delta {health_delta:+d}" + ) + + +def _resolve_cache_path(*, root_path: Path, args: Namespace) -> Path: + return resolve_cache_path(root_path=root_path, args=args) + + +def _build_cache( + *, + root_path: Path, + args: Namespace, + cache_path: Path, + policy: CachePolicy, +) -> Cache: + cache = Cache( + cache_path, + root=root_path, + max_size_bytes=_as_int(args.max_cache_size_mb, 0) * 1024 * 1024, + min_loc=_as_int(args.min_loc, DEFAULT_MIN_LOC), + min_stmt=_as_int(args.min_stmt, DEFAULT_MIN_STMT), + block_min_loc=_as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC), + block_min_stmt=_as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT), + segment_min_loc=_as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC), + segment_min_stmt=_as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT), + collect_api_surface=bool(getattr(args, "api_surface", False)), + ) + if policy != "off": + cache.load() + return cache + + +def _metrics_computed(analysis_mode: AnalysisMode) -> tuple[str, ...]: + return ( + () + if analysis_mode == "clones_only" + else ( + "complexity", + 
"coupling", + "cohesion", + "health", + "dependencies", + "dead_code", + ) + ) + + +def _load_report_document(report_json: str) -> dict[str, object]: + return _load_report_document_payload(report_json) + + +def _report_digest(report_document: Mapping[str, object]) -> str: + integrity = _as_mapping(report_document.get("integrity")) + digest = _as_mapping(integrity.get("digest")) + value = digest.get("value") + if not isinstance(value, str) or not value: + raise MCPServiceError("Canonical report digest is missing.") + return value + + +def _summary_analysis_profile_payload(summary: Mapping[str, object]) -> dict[str, int]: + analysis_profile = _as_mapping(summary.get("analysis_profile")) + if not analysis_profile: + return {} + keys = ( + "min_loc", + "min_stmt", + "block_min_loc", + "block_min_stmt", + "segment_min_loc", + "segment_min_stmt", + ) + payload = {key: _as_int(analysis_profile.get(key), -1) for key in keys} + return {key: value for key, value in payload.items() if value >= 0} + + +def _summary_trusted_state_payload( + summary: Mapping[str, object], + *, + key: str, +) -> dict[str, object]: + baseline = _as_mapping(summary.get(key)) + trusted = bool(baseline.get("trusted_for_diff", False)) + payload: dict[str, object] = { + "loaded": bool(baseline.get("loaded", False)), + "status": str(baseline.get("status", "")), + "trusted": trusted, + } + if key == "baseline": + payload["compared_without_valid_baseline"] = not trusted + baseline_python_tag = baseline.get("python_tag") + runtime_python_tag = summary.get("python_tag") + if isinstance(baseline_python_tag, str) and baseline_python_tag.strip(): + payload["baseline_python_tag"] = baseline_python_tag + if isinstance(runtime_python_tag, str) and runtime_python_tag.strip(): + payload["runtime_python_tag"] = runtime_python_tag + return payload + + +def _summary_cache_payload(summary: Mapping[str, object]) -> dict[str, object]: + cache = dict(_as_mapping(summary.get("cache"))) + if not cache: + return {} + return { + "used": bool(cache.get("used", False)), + "freshness": _effective_freshness(summary), + } + + +def _effective_freshness(summary: Mapping[str, object]) -> FreshnessKind: + inventory = _as_mapping(summary.get("inventory")) + files = _as_mapping(inventory.get("files")) + analyzed = max(0, _as_int(files.get("analyzed", 0), 0)) + cached = max(0, _as_int(files.get("cached", 0), 0)) + cache = _as_mapping(summary.get("cache")) + cache_used = bool(cache.get("used")) + if cache_used and cached > 0 and analyzed == 0: + return "reused" + if cache_used and cached > 0 and analyzed > 0: + return "mixed" + return "fresh" + + +def _summary_inventory_payload(inventory: Mapping[str, object]) -> dict[str, object]: + if not inventory: + return {} + files = _as_mapping(inventory.get("files")) + code = _as_mapping(inventory.get("code")) + total_files = _as_int( + files.get( + "total_found", + files.get( + "analyzed", + len( + _as_sequence( + _as_mapping(inventory.get("file_registry")).get("items") + ) + ), + ), + ), + 0, + ) + functions = _as_int(code.get("functions", 0), 0) + _as_int( + code.get("methods", 0), + 0, + ) + return { + "files": total_files, + "lines": _as_int(code.get("parsed_lines", 0), 0), + "functions": functions, + "classes": _as_int(code.get("classes", 0), 0), + } + + +def _summary_diff_payload(summary: Mapping[str, object]) -> dict[str, object]: + baseline_diff = _as_mapping(summary.get("baseline_diff")) + metrics_diff = _as_mapping(summary.get("metrics_diff")) + return { + "new_clones": 
_as_int(baseline_diff.get("new_clone_groups_total", 0), 0), + "health_delta": ( + _as_int(metrics_diff.get("health_delta", 0), 0) + if ( + metrics_diff + and _summary_health_payload(summary).get("available") is not False + ) + else None + ), + "typing_param_permille_delta": _as_int( + metrics_diff.get("typing_param_permille_delta", 0), + 0, + ), + "typing_return_permille_delta": _as_int( + metrics_diff.get("typing_return_permille_delta", 0), + 0, + ), + "docstring_permille_delta": _as_int( + metrics_diff.get("docstring_permille_delta", 0), + 0, + ), + "api_breaking_changes": _as_int(metrics_diff.get("api_breaking_changes", 0), 0), + "new_api_symbols": _as_int(metrics_diff.get("new_api_symbols", 0), 0), + } + + +def _summary_coverage_join_payload(record: MCPRunRecord) -> dict[str, object]: + metrics = _as_mapping(record.report_document.get("metrics")) + families = _as_mapping(metrics.get("families")) + coverage_join = _as_mapping(families.get("coverage_join")) + summary = _as_mapping(coverage_join.get("summary")) + if not summary: + return {} + payload: dict[str, object] = { + "status": str(summary.get("status", "")).strip(), + "overall_permille": _as_int(summary.get("overall_permille", 0), 0), + "coverage_hotspots": _as_int(summary.get("coverage_hotspots", 0), 0), + "scope_gap_hotspots": _as_int(summary.get("scope_gap_hotspots", 0), 0), + "hotspot_threshold_percent": _as_int( + summary.get("hotspot_threshold_percent", 0), + 0, + ), + } + source_value = summary.get("source") + source = source_value.strip() if isinstance(source_value, str) else "" + if source: + payload["source"] = source + invalid_reason_value = summary.get("invalid_reason") + invalid_reason = ( + invalid_reason_value.strip() if isinstance(invalid_reason_value, str) else "" + ) + if invalid_reason: + payload["invalid_reason"] = invalid_reason + return payload + + +def _summary_security_surfaces_payload(record: MCPRunRecord) -> dict[str, object]: + metrics = _as_mapping(record.report_document.get("metrics")) + families = _as_mapping(metrics.get("families")) + security_surfaces = _as_mapping(families.get("security_surfaces")) + summary = _as_mapping(security_surfaces.get("summary")) + if not summary: + return {} + return { + "items": _as_int(summary.get("items", 0), 0), + "categories": _as_int(summary.get("category_count", 0), 0), + "production": _as_int(summary.get("production", 0), 0), + "tests": _as_int(summary.get("tests", 0), 0), + "report_only": bool(summary.get("report_only", True)), + } + + +def _compact_metrics_item(item: Mapping[str, object]) -> dict[str, object]: + compact: dict[str, object] = {} + path_value = ( + str(item.get("relative_path", "")).strip() + or str(item.get("path", "")).strip() + or str(item.get("filepath", "")).strip() + or str(item.get("file", "")).strip() + ) + if path_value: + compact["path"] = path_value + for key, value in item.items(): + if ( + key not in _COMPACT_ITEM_PATH_KEYS + and value not in _COMPACT_ITEM_EMPTY_VALUES + ): + compact[str(key)] = value + return compact + + +def _metrics_diff_payload(metrics_diff: MetricsDiff | None) -> dict[str, object] | None: + payload = _summarize_metrics_diff(metrics_diff) + return dict(payload) if payload is not None else None + + +def _schema_resource_payload() -> dict[str, object]: + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "CodeCloneCanonicalReport", + "type": "object", + "required": [ + "report_schema_version", + "meta", + "inventory", + "findings", + "derived", + "integrity", + ], + "properties": { + 
"report_schema_version": { + "type": "string", + "const": REPORT_SCHEMA_VERSION, + }, + "meta": {"type": "object"}, + "inventory": {"type": "object"}, + "findings": {"type": "object"}, + "metrics": {"type": "object"}, + "derived": {"type": "object"}, + "integrity": {"type": "object"}, + }, + } + + +def _finding_display_location(finding: Mapping[str, object]) -> str: + locations = _as_sequence(finding.get("locations")) + if not locations: + return "(unknown)" + first = locations[0] + if isinstance(first, str): + return first + location = _as_mapping(first) + path = str(location.get("path", location.get("file", ""))).strip() + if not path: + return "(unknown)" + line = _as_int(location.get("line", 0), 0) + return f"{path}:{line}" if line > 0 else path + + +def _render_pr_summary_markdown(payload: Mapping[str, object]) -> str: + health = _as_mapping(payload.get("health")) + score = health.get("score", "n/a") + grade = health.get("grade", "n/a") + delta = _as_int(payload.get("health_delta", 0), 0) + changed_items = [ + _as_mapping(item) + for item in _as_sequence(payload.get("new_findings_in_changed_files")) + ] + resolved = [_as_mapping(item) for item in _as_sequence(payload.get("resolved"))] + blocking_gates = [ + str(item) for item in _as_sequence(payload.get("blocking_gates")) if str(item) + ] + health_line = ( + "Health: " + f"{score}/100 ({grade}) | Delta: {delta:+d} | " + f"Verdict: {payload.get('verdict', 'stable')}" + if payload.get("health_delta") is not None + else ( + "Health: " + f"{score}/100 ({grade}) | Delta: n/a | " + f"Verdict: {payload.get('verdict', 'stable')}" + ) + ) + lines = [ + "## CodeClone Summary", + "", + health_line, + "", + f"### New findings in changed files ({len(changed_items)})", + ] + if not changed_items: + lines.append("- None") + else: + lines.extend( + [ + ( + f"- **{str(item.get('severity', 'info')).upper()}** " + f"{item.get('kind', 'finding')} in " + f"`{_finding_display_location(item)}`" + ) + for item in changed_items[:10] + ] + ) + lines.extend(["", f"### Resolved ({len(resolved)})"]) + if not resolved: + lines.append("- None") + else: + lines.extend( + [ + f"- {item.get('kind', 'finding')} in " + f"`{_finding_display_location(item)}`" + for item in resolved[:10] + ] + ) + lines.extend(["", "### Blocking gates"]) + if not blocking_gates: + lines.append("- none") + else: + lines.extend([f"- `{reason}`" for reason in blocking_gates]) + return "\n".join(lines) diff --git a/codeclone/surfaces/mcp/_session_runtime.py b/codeclone/surfaces/mcp/_session_runtime.py new file mode 100644 index 0000000..9b5179d --- /dev/null +++ b/codeclone/surfaces/mcp/_session_runtime.py @@ -0,0 +1,41 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def validate_numeric_args(args: object) -> bool:
+    # Size limits must be non-negative, gate thresholds treat -1 as the
+    # "not set" sentinel (only values below -1 are invalid), and coverage
+    # percentages must fall within 0-100.
+    return bool(
+        not (
+            _int_attr(args, "max_baseline_size_mb") < 0
+            or _int_attr(args, "max_cache_size_mb") < 0
+            or _int_attr(args, "fail_threshold", -1) < -1
+            or _int_attr(args, "fail_complexity", -1) < -1
+            or _int_attr(args, "fail_coupling", -1) < -1
+            or _int_attr(args, "fail_cohesion", -1) < -1
+            or _int_attr(args, "fail_health", -1) < -1
+            or _int_attr(args, "min_typing_coverage", -1) < -1
+            or _int_attr(args, "min_typing_coverage", -1) > 100
+            or _int_attr(args, "min_docstring_coverage", -1) < -1
+            or _int_attr(args, "min_docstring_coverage", -1) > 100
+            or _int_attr(args, "coverage_min") < 0
+            or _int_attr(args, "coverage_min") > 100
+        )
+    )
+
+
+def resolve_cache_path(*, root_path: Path, args: object) -> Path:
+    # An explicit cache_path argument wins; otherwise fall back to the
+    # default cache location under the repository root.
+    raw_value = getattr(args, "cache_path", None)
+    if isinstance(raw_value, str) and raw_value.strip():
+        return Path(raw_value).expanduser()
+    return root_path / ".cache" / "codeclone" / "cache.json"
+
+
+def _int_attr(args: object, name: str, default: int = 0) -> int:
+    # Missing or non-integer attributes fall back to the provided default.
+    value = getattr(args, name, default)
+    return value if isinstance(value, int) else default
diff --git a/codeclone/surfaces/mcp/_session_shared.py b/codeclone/surfaces/mcp/_session_shared.py
new file mode 100644
index 0000000..e4ac758
--- /dev/null
+++ b/codeclone/surfaces/mcp/_session_shared.py
@@ -0,0 +1,1202 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import hashlib
+import subprocess
+from argparse import Namespace
+from collections import OrderedDict
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+from json import JSONDecodeError
+from pathlib import Path
+from threading import RLock
+from typing import Final, Literal, TypeVar
+
+import orjson
+
+from ...
import __version__ +from ...baseline import Baseline +from ...cache.store import Cache +from ...cache.versioning import CacheStatus +from ...config.pyproject_loader import ( + ConfigValidationError, + load_pyproject_config, +) +from ...config.spec import ( + DEFAULT_BASELINE_PATH, + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_MAX_BASELINE_SIZE_MB, + DEFAULT_MAX_CACHE_SIZE_MB, + DEFAULT_MIN_LOC, + DEFAULT_MIN_STMT, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, +) +from ...contracts import ( + DEFAULT_COVERAGE_MIN, + DEFAULT_JSON_REPORT_PATH, + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + DOCS_URL, + REPORT_SCHEMA_VERSION, +) +from ...core._types import OutputPaths +from ...core.bootstrap import bootstrap +from ...core.discovery import discover +from ...core.parallelism import process +from ...core.pipeline import analyze +from ...core.reporting import report +from ...domain.findings import ( + CATEGORY_CLONE, + CATEGORY_COHESION, + CATEGORY_COMPLEXITY, + CATEGORY_COUPLING, + CATEGORY_DEAD_CODE, + CATEGORY_DEPENDENCY, + CATEGORY_STRUCTURAL, + CLONE_KIND_SEGMENT, + FAMILY_CLONE, + FAMILY_CLONES, + FAMILY_DEAD_CODE, + FAMILY_DESIGN, + FAMILY_STRUCTURAL, +) +from ...domain.quality import ( + CONFIDENCE_HIGH, + CONFIDENCE_LOW, + CONFIDENCE_MEDIUM, + EFFORT_EASY, + EFFORT_HARD, + EFFORT_MODERATE, + SEVERITY_CRITICAL, + SEVERITY_INFO, + SEVERITY_WARNING, +) +from ...domain.source_scope import ( + SOURCE_KIND_FIXTURES, + SOURCE_KIND_MIXED, + SOURCE_KIND_ORDER, + SOURCE_KIND_OTHER, + SOURCE_KIND_PRODUCTION, + SOURCE_KIND_TESTS, +) +from ...findings.ids import ( + clone_group_id, + dead_code_group_id, + design_group_id, + structural_group_id, +) +from ...models import CoverageJoinResult, MetricsDiff, ProjectMetrics, Suggestion +from ...report.gates.evaluator import GateResult as GatingResult +from ...report.gates.evaluator import MetricGateConfig +from ...report.gates.evaluator import evaluate_gates as _evaluate_report_gates +from ...report.gates.evaluator import summarize_metrics_diff as _summarize_metrics_diff +from ...utils.coerce import as_float as _as_float +from ...utils.coerce import as_int as _as_int +from ...utils.git_diff import validate_git_diff_ref +from .payloads import paginate, resolve_finding_id, short_id + +AnalysisMode = Literal["full", "clones_only"] +CachePolicy = Literal["reuse", "refresh", "off"] +FreshnessKind = Literal["fresh", "mixed", "reused"] +HotlistKind = Literal[ + "most_actionable", + "highest_spread", + "highest_priority", + "production_hotspots", + "test_fixture_hotspots", +] +FindingFamilyFilter = Literal["all", "clone", "structural", "dead_code", "design"] +FindingNoveltyFilter = Literal["all", "new", "known"] +FindingSort = Literal["default", "priority", "severity", "spread"] +DetailLevel = Literal["summary", "normal", "full"] +ComparisonFocus = Literal["all", "clones", "structural", "metrics"] +PRSummaryFormat = Literal["markdown", "json"] +HelpTopic = Literal[ + "workflow", + "analysis_profile", + "suppressions", + "baseline", + "coverage", + "latest_runs", + "review_state", + "changed_scope", +] +HelpDetail = Literal["compact", "normal"] +MetricsDetailFamily = Literal[ + "complexity", + "coupling", + "cohesion", + "coverage_adoption", + "coverage_join", + "dependencies", + "dead_code", + "api_surface", + "security_surfaces", + "god_modules", + "overloaded_modules", + "health", +] +ReportSection = Literal[ + "all", + "meta", + "inventory", + 
"findings", + "metrics", + "metrics_detail", + "derived", + "changed", + "integrity", +] +HealthScope = Literal["repository"] +SummaryFocus = Literal["repository", "production", "changed_paths"] + +_REPORT_DUMMY_PATH = Path(DEFAULT_JSON_REPORT_PATH) +_HEALTH_SCOPE_REPOSITORY: Final[HealthScope] = "repository" +_FOCUS_REPOSITORY: Final[SummaryFocus] = "repository" +_FOCUS_PRODUCTION: Final[SummaryFocus] = "production" +_FOCUS_CHANGED_PATHS: Final[SummaryFocus] = "changed_paths" +_MCP_CONFIG_KEYS = frozenset( + { + "min_loc", + "min_stmt", + "block_min_loc", + "block_min_stmt", + "segment_min_loc", + "segment_min_stmt", + "processes", + "cache_path", + "max_cache_size_mb", + "baseline", + "max_baseline_size_mb", + "metrics_baseline", + "api_surface", + "coverage_xml", + "coverage_min", + "golden_fixture_paths", + } +) +_RESOURCE_SECTION_MAP: Final[dict[str, ReportSection]] = { + "report.json": "all", + "summary": "meta", + "health": "metrics", + "changed": "changed", + "overview": "derived", +} +_SEVERITY_WEIGHT: Final[dict[str, float]] = { + SEVERITY_CRITICAL: 1.0, + SEVERITY_WARNING: 0.6, + SEVERITY_INFO: 0.2, +} +_EFFORT_WEIGHT: Final[dict[str, float]] = { + EFFORT_EASY: 1.0, + EFFORT_MODERATE: 0.6, + EFFORT_HARD: 0.3, +} +_NOVELTY_WEIGHT: Final[dict[str, float]] = {"new": 1.0, "known": 0.5} +_RUNTIME_WEIGHT: Final[dict[str, float]] = { + "production": 1.0, + "mixed": 0.8, + "tests": 0.4, + "fixtures": 0.2, + "other": 0.5, +} +_CONFIDENCE_WEIGHT: Final[dict[str, float]] = { + CONFIDENCE_HIGH: 1.0, + CONFIDENCE_MEDIUM: 0.7, + CONFIDENCE_LOW: 0.3, +} +# Canonical report groups use FAMILY_CLONES ("clones"), while individual finding +# payloads use FAMILY_CLONE ("clone"). +_VALID_ANALYSIS_MODES = frozenset({"full", "clones_only"}) +_VALID_CACHE_POLICIES = frozenset({"reuse", "refresh", "off"}) +_VALID_FINDING_FAMILIES = frozenset( + {"all", "clone", "structural", "dead_code", "design"} +) +_VALID_FINDING_NOVELTY = frozenset({"all", "new", "known"}) +_VALID_FINDING_SORT = frozenset({"default", "priority", "severity", "spread"}) +_VALID_DETAIL_LEVELS = frozenset({"summary", "normal", "full"}) +_VALID_COMPARISON_FOCUS = frozenset({"all", "clones", "structural", "metrics"}) +_VALID_PR_SUMMARY_FORMATS = frozenset({"markdown", "json"}) +_VALID_HELP_TOPICS = frozenset( + { + "workflow", + "analysis_profile", + "suppressions", + "baseline", + "coverage", + "latest_runs", + "review_state", + "changed_scope", + } +) +_VALID_HELP_DETAILS = frozenset({"compact", "normal"}) +DEFAULT_MCP_HISTORY_LIMIT = 4 +MAX_MCP_HISTORY_LIMIT = 10 +_VALID_REPORT_SECTIONS = frozenset( + { + "all", + "meta", + "inventory", + "findings", + "metrics", + "metrics_detail", + "derived", + "changed", + "integrity", + } +) +_VALID_HOTLIST_KINDS = frozenset( + { + "most_actionable", + "highest_spread", + "highest_priority", + "production_hotspots", + "test_fixture_hotspots", + } +) +_VALID_SEVERITIES = frozenset({SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO}) +_SOURCE_KIND_BREAKDOWN_ORDER: Final[tuple[str, ...]] = ( + SOURCE_KIND_PRODUCTION, + SOURCE_KIND_TESTS, + SOURCE_KIND_FIXTURES, + SOURCE_KIND_MIXED, + SOURCE_KIND_OTHER, +) +_COMPACT_ITEM_PATH_KEYS: Final[frozenset[str]] = frozenset( + {"relative_path", "path", "filepath", "file"} +) +_COMPACT_ITEM_EMPTY_VALUES: Final[tuple[object, ...]] = ("", None, [], {}, ()) +_HOTLIST_REPORT_KEYS: Final[dict[str, str]] = { + "most_actionable": "most_actionable_ids", + "highest_spread": "highest_spread_ids", + "production_hotspots": "production_hotspot_ids", + 
"test_fixture_hotspots": "test_fixture_hotspot_ids", +} +_CHECK_TO_DIMENSION: Final[dict[str, str]] = { + "cohesion": "cohesion", + "coupling": "coupling", + "dead_code": "dead_code", + "complexity": "complexity", + "clones": "clones", +} +_DESIGN_CHECK_CONTEXT: Final[dict[str, dict[str, object]]] = { + "complexity": { + "category": CATEGORY_COMPLEXITY, + "metric": "cyclomatic_complexity", + "operator": ">", + "default_threshold": DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + }, + "coupling": { + "category": CATEGORY_COUPLING, + "metric": "cbo", + "operator": ">", + "default_threshold": DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + }, + "cohesion": { + "category": CATEGORY_COHESION, + "metric": "lcom4", + "operator": ">=", + "default_threshold": DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + }, +} +_VALID_METRICS_DETAIL_FAMILIES = frozenset( + { + "complexity", + "coupling", + "cohesion", + "coverage_adoption", + "coverage_join", + "dependencies", + "dead_code", + "api_surface", + "security_surfaces", + "god_modules", + "overloaded_modules", + "health", + } +) +_METRICS_DETAIL_FAMILY_ALIASES: Final[dict[str, str]] = { + "god_modules": "overloaded_modules", +} +_SHORT_RUN_ID_LENGTH = 8 +_SHORT_HASH_ID_LENGTH = 6 +ChoiceT = TypeVar("ChoiceT", bound=str) + + +@dataclass(frozen=True) +class MCPHelpTopicSpec: + summary: str + key_points: tuple[str, ...] + recommended_tools: tuple[str, ...] + doc_links: tuple[tuple[str, str], ...] + warnings: tuple[str, ...] = () + anti_patterns: tuple[str, ...] = () + + +_MCP_BOOK_URL: Final = f"{DOCS_URL}book/" +_MCP_GUIDE_URL: Final = f"{DOCS_URL}mcp/" +_MCP_INTERFACE_DOC_LINK: Final[tuple[str, str]] = ( + "MCP interface contract", + f"{_MCP_BOOK_URL}20-mcp-interface/", +) +_BASELINE_DOC_LINK: Final[tuple[str, str]] = ( + "Baseline contract", + f"{_MCP_BOOK_URL}06-baseline/", +) +_CONFIG_DOC_LINK: Final[tuple[str, str]] = ( + "Config and defaults", + f"{_MCP_BOOK_URL}04-config-and-defaults/", +) +_REPORT_DOC_LINK: Final[tuple[str, str]] = ( + "Report contract", + f"{_MCP_BOOK_URL}08-report/", +) +_CLI_DOC_LINK: Final[tuple[str, str]] = ( + "CLI contract", + f"{_MCP_BOOK_URL}09-cli/", +) +_PIPELINE_DOC_LINK: Final[tuple[str, str]] = ( + "Core pipeline", + f"{_MCP_BOOK_URL}05-core-pipeline/", +) +_SUPPRESSIONS_DOC_LINK: Final[tuple[str, str]] = ( + "Inline suppressions contract", + f"{_MCP_BOOK_URL}19-inline-suppressions/", +) +_MCP_GUIDE_DOC_LINK: Final[tuple[str, str]] = ("MCP usage guide", _MCP_GUIDE_URL) +_HELP_TOPIC_SPECS: Final[dict[str, MCPHelpTopicSpec]] = { + "workflow": MCPHelpTopicSpec( + summary=( + "CodeClone MCP is triage-first and budget-aware. Start with a " + "summary or production triage, then narrow through hotspots or " + "focused checks before opening one finding in detail." + ), + key_points=( + "Recommended first pass: analyze_repository or analyze_changed_paths.", + ( + "Start with default or pyproject-resolved thresholds; lower them " + "only for an explicit higher-sensitivity follow-up pass." + ), + ( + "Use get_run_summary or get_production_triage before broad " + "finding listing." + ), + ( + "Prefer list_hotspots or focused check_* tools over " + "list_findings on noisy repositories." + ), + ("Use get_finding and get_remediation only after selecting an issue."), + ( + "get_report_section(section='all') is an exception path, not " + "a default first step." 
+ ), + ), + recommended_tools=( + "analyze_repository", + "analyze_changed_paths", + "get_run_summary", + "get_production_triage", + "list_hotspots", + "check_clones", + "check_dead_code", + "get_finding", + "get_remediation", + ), + doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK), + warnings=( + ( + "Broad list_findings calls burn context quickly on large or " + "noisy repositories." + ), + ( + "Prefer generate_pr_summary(format='markdown') unless machine " + "JSON is explicitly required." + ), + ), + anti_patterns=( + "Starting exploration with list_findings on a noisy repository.", + "Using get_report_section(section='all') as the default first step.", + ( + "Escalating detail on larger lists instead of opening one " + "finding with get_finding." + ), + ), + ), + "analysis_profile": MCPHelpTopicSpec( + summary=( + "CodeClone default analysis is intentionally conservative: stable " + "first-pass review, baseline-aware governance, and CI-friendly " + "signal over maximum local sensitivity." + ), + key_points=( + ( + "Default thresholds are intentionally conservative and " + "production-friendly." + ), + ( + "A clean default run does not rule out smaller local " + "duplication or repetition." + ), + ( + "Lowering thresholds increases sensitivity and can surface " + "smaller functions, tighter windows, and finer local signals." + ), + ( + "Lower-threshold runs are best for exploratory local review, " + "not as a silent replacement for the default governance profile." + ), + "Interpret results in the context of the active threshold profile.", + ), + recommended_tools=( + "analyze_repository", + "analyze_changed_paths", + "get_run_summary", + "compare_runs", + ), + doc_links=( + _CONFIG_DOC_LINK, + _PIPELINE_DOC_LINK, + _MCP_INTERFACE_DOC_LINK, + ), + warnings=( + ( + "Do not treat a default-threshold run as proof that no smaller " + "local clone or repetition exists." + ), + ( + "Lower-threshold runs usually increase noise and should be read " + "as higher-sensitivity exploratory passes." + ), + "Run comparisons are most meaningful when profiles are aligned.", + ), + anti_patterns=( + ( + "Assuming a clean default pass means no finer-grained " + "duplication exists anywhere in the repository." + ), + ( + "Lowering thresholds for exploration and then interpreting the " + "result as if it had the same meaning as the conservative " + "default pass." + ), + ( + "Mixing low-threshold exploratory output into baseline or CI " + "reasoning without acknowledging the profile change." + ), + ), + ), + "suppressions": MCPHelpTopicSpec( + summary=( + "CodeClone supports explicit inline suppressions for selected " + "findings. They are local policy, not analysis truth, and should " + "stay narrow and declaration-scoped." + ), + key_points=( + "Current syntax uses codeclone: ignore[rule-id,...].", + "Binding is declaration-scoped: def, async def, or class.", + ( + "Supported placement is the previous line or inline on the " + "declaration or header line." + ), + ( + "Suppressions are target-specific and do not imply file-wide " + "or cascading scope." + ), + ( + "Use suppressions for accepted dynamic or runtime false " + "positives, not to hide broad classes of debt." + ), + ), + recommended_tools=("get_finding", "get_remediation"), + doc_links=(_SUPPRESSIONS_DOC_LINK, _MCP_INTERFACE_DOC_LINK), + warnings=( + ( + "MCP explains suppression semantics but never creates or " + "updates suppressions." 
+ ), + ), + anti_patterns=( + "Treating suppressions as file-wide or inherited state.", + ( + "Using suppressions to hide broad structural debt instead of " + "accepted false positives." + ), + ), + ), + "baseline": MCPHelpTopicSpec( + summary=( + "A baseline is CodeClone's accepted comparison snapshot for clones " + "and optional metrics. It separates known debt from new regressions " + "and is trust-checked before use." + ), + key_points=( + ( + "Canonical baseline schema is v2.0 with meta and clone keys; " + "metrics may be embedded for unified flows." + ), + ( + "Compatibility depends on generator identity, supported " + "schema version, fingerprint version, python tag, and payload " + "integrity." + ), + ( + "Known means already present in the trusted baseline; new " + "means not accepted by baseline." + ), + ( + "In CI and gating contexts, untrusted baseline states are " + "contract errors rather than soft warnings." + ), + "MCP is read-only and does not update or rewrite baselines.", + ), + recommended_tools=("get_run_summary", "evaluate_gates", "compare_runs"), + doc_links=(_BASELINE_DOC_LINK,), + warnings=( + "Baseline trust semantics directly affect new-vs-known classification.", + ), + anti_patterns=( + "Treating baseline as mutable MCP session state.", + "Assuming an untrusted baseline is only cosmetic in CI contexts.", + ), + ), + "coverage": MCPHelpTopicSpec( + summary=( + "Coverage join is an external current-run signal: CodeClone reads " + "an existing Cobertura XML report and joins line hits to risky " + "function spans." + ), + key_points=( + "Use Cobertura XML such as `coverage xml` output from coverage.py.", + "Coverage join does not become baseline truth and does not affect health.", + ( + "Coverage hotspot gating is current-run only and focuses on " + "medium/high-risk functions measured below the configured " + "threshold." + ), + ( + "Functions missing from the supplied coverage.xml are surfaced " + "as scope gaps, not labeled as untested." + ), + "Use metrics_detail(family='coverage_join') for bounded drill-down.", + ), + recommended_tools=( + "analyze_repository", + "analyze_changed_paths", + "get_run_summary", + "get_report_section", + "evaluate_gates", + ), + doc_links=( + _MCP_INTERFACE_DOC_LINK, + _CLI_DOC_LINK, + _REPORT_DOC_LINK, + ), + warnings=( + "Coverage join is only as accurate as the external XML path mapping.", + "It does not infer branch coverage and does not execute tests.", + "Use fail-on-untested-hotspots only with a valid joined coverage input.", + ), + anti_patterns=( + "Treating missing coverage XML as zero coverage without stating it.", + "Reading coverage join as a baseline-aware trend signal.", + "Assuming dynamic runtime dispatch is visible through a static line join.", + ), + ), + "latest_runs": MCPHelpTopicSpec( + summary=( + "latest/* resources point to the most recent analysis run in the " + "current MCP session. They are convenience handles, not persistent " + "truth anchors." + ), + key_points=( + "Run history is in-memory only and bounded by history-limit.", + "The latest pointer moves when a newer analyze_* call registers a run.", + "A fresh repository state requires a fresh analyze run.", + ( + "Short run ids are convenience handles derived from canonical " + "run identity." + ), + ( + "Do not assume latest/* is globally current outside the " + "active MCP session." 
+ ), + ), + recommended_tools=( + "analyze_repository", + "analyze_changed_paths", + "get_run_summary", + "compare_runs", + ), + doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK), + warnings=( + ( + "latest/* can point at a different repository after a later " + "analyze call in the same session." + ), + ), + anti_patterns=( + ( + "Assuming latest/* remains tied to one repository across the " + "whole client session." + ), + ( + "Using latest/* as a substitute for starting a fresh run when " + "freshness matters." + ), + ), + ), + "review_state": MCPHelpTopicSpec( + summary=( + "Reviewed state in MCP is session-local workflow state. It helps " + "long sessions track review progress without modifying canonical " + "findings, baseline, or persisted artifacts." + ), + key_points=( + "Review markers are in-memory only.", + "They do not change report truth, finding identity, or CI semantics.", + "They are useful for triage workflows across long sessions.", + ( + "They should not be interpreted as acceptance, suppression, " + "or baseline update." + ), + ), + recommended_tools=( + "list_hotspots", + "get_finding", + "mark_finding_reviewed", + "list_reviewed_findings", + ), + doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK), + warnings=( + "Reviewed markers disappear when the MCP session is cleared or restarted.", + ), + anti_patterns=( + "Treating reviewed state as a persistent acceptance signal.", + "Assuming reviewed findings are removed from canonical report truth.", + ), + ), + "changed_scope": MCPHelpTopicSpec( + summary=( + "Changed-scope analysis narrows review to findings that touch a " + "selected change set. It is for PR and patch review, not a " + "replacement for full canonical analysis." + ), + key_points=( + ( + "Use analyze_changed_paths with explicit changed_paths or " + "git_diff_ref for review-focused runs." + ), + ( + "Start with the same conservative profile as the default " + "review, then lower thresholds only when you explicitly want " + "a higher-sensitivity changed-files pass." + ), + ( + "Changed-scope is best for asking what new issues touch " + "modified files and whether anything should block CI." + ), + "Prefer production triage and hotspot views before broad listing.", + "If repository-wide truth is needed, run full analysis first.", + ), + recommended_tools=( + "analyze_changed_paths", + "get_run_summary", + "get_production_triage", + "evaluate_gates", + "generate_pr_summary", + ), + doc_links=(_MCP_INTERFACE_DOC_LINK, _MCP_GUIDE_DOC_LINK), + warnings=( + ( + "Changed-scope narrows review focus; it does not replace the " + "full canonical report for repository-wide truth." + ), + ), + anti_patterns=( + "Using changed-scope as if it were the only source of repository truth.", + ( + "Starting changed-files review with broad listing instead of " + "compact triage." 
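+ # git_diff_ref inputs are resolved via `git diff --name-only <ref> --`;
+ # see _git_diff_lines_payload below.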
+ ), + ), + ), +} + + +def _suggestion_finding_id_payload(suggestion: object) -> str: + if not hasattr(suggestion, "finding_family"): + return "" + family = str(getattr(suggestion, "finding_family", "")).strip() + if family == FAMILY_CLONES: + kind = str(getattr(suggestion, "finding_kind", "")).strip() + subject_key = str(getattr(suggestion, "subject_key", "")).strip() + return clone_group_id(kind or CLONE_KIND_SEGMENT, subject_key) + if family == FAMILY_STRUCTURAL: + return structural_group_id( + str(getattr(suggestion, "finding_kind", "")).strip() or CATEGORY_STRUCTURAL, + str(getattr(suggestion, "subject_key", "")).strip(), + ) + category = str(getattr(suggestion, "category", "")).strip() + subject_key = str(getattr(suggestion, "subject_key", "")).strip() + if category == CATEGORY_DEAD_CODE: + return dead_code_group_id(subject_key) + return design_group_id( + category, + subject_key or str(getattr(suggestion, "title", "")), + ) + + +@dataclass(frozen=True, slots=True) +class _CloneShortIdEntry: + canonical_id: str + alias: str + token: str + suffix: str + + def render(self, prefix_length: int) -> str: + if prefix_length <= 0: + prefix_length = len(self.token) + return f"{self.alias}:{self.token[:prefix_length]}{self.suffix}" + + +def _partitioned_short_id(alias: str, remainder: str) -> str: + first, _, rest = remainder.partition(":") + return f"{alias}:{first}:{rest}" if rest else f"{alias}:{first}" + + +def _clone_short_id_entry_payload(canonical_id: str) -> _CloneShortIdEntry: + _prefix, _, remainder = canonical_id.partition(":") + clone_kind, _, group_key = remainder.partition(":") + hashes = [part for part in group_key.split("|") if part] + if clone_kind == "function": + fingerprint = hashes[0] if hashes else group_key + bucket = "" + if "|" in group_key: + bucket = "|" + group_key.split("|")[-1] + return _CloneShortIdEntry( + canonical_id=canonical_id, + alias="fn", + token=fingerprint, + suffix=bucket, + ) + alias = {"block": "blk", "segment": "seg"}.get(clone_kind, "clone") + combined = "|".join(hashes) if hashes else group_key + token = hashlib.sha256(combined.encode()).hexdigest() + return _CloneShortIdEntry( + canonical_id=canonical_id, + alias=alias, + token=token, + suffix=f"|x{len(hashes) or 1}", + ) + + +def _disambiguated_clone_short_ids_payload( + canonical_ids: Sequence[str], +) -> dict[str, str]: + clone_entries = [ + _clone_short_id_entry_payload(canonical_id) for canonical_id in canonical_ids + ] + max_token_length = max((len(entry.token) for entry in clone_entries), default=0) + for prefix_length in range(_SHORT_HASH_ID_LENGTH + 2, max_token_length + 1, 2): + candidates = { + entry.canonical_id: entry.render(prefix_length) for entry in clone_entries + } + if len(set(candidates.values())) == len(candidates): + return candidates + return { + entry.canonical_id: entry.render(max_token_length) for entry in clone_entries + } + + +def _leaf_symbol_name_payload(value: object) -> str: + text = str(value).strip() + if not text: + return "" + if ":" in text: + text = text.rsplit(":", maxsplit=1)[-1] + if "." 
in text: + text = text.rsplit(".", maxsplit=1)[-1] + return text + + +def _base_short_finding_id_payload(canonical_id: str) -> str: + prefix, _, remainder = canonical_id.partition(":") + if prefix == "clone": + return _clone_short_id_entry_payload(canonical_id).render(_SHORT_HASH_ID_LENGTH) + if prefix == "structural": + finding_kind, _, finding_key = remainder.partition(":") + return f"struct:{finding_kind}:{finding_key[:_SHORT_HASH_ID_LENGTH]}" + if prefix == "dead_code": + return f"dead:{_leaf_symbol_name_payload(remainder)}" + if prefix == "design": + category, _, subject_key = remainder.partition(":") + return f"design:{category}:{_leaf_symbol_name_payload(subject_key)}" + return canonical_id + + +def _disambiguated_short_finding_id_payload(canonical_id: str) -> str: + prefix, _, remainder = canonical_id.partition(":") + if prefix == "clone": + return _clone_short_id_entry_payload(canonical_id).render(0) + if prefix == "structural": + return _partitioned_short_id("struct", remainder) + if prefix == "dead_code": + return f"dead:{remainder}" + if prefix == "design": + return _partitioned_short_id("design", remainder) + return canonical_id + + +def _json_text_payload( + payload: object, + *, + sort_keys: bool = True, +) -> str: + options = orjson.OPT_INDENT_2 + if sort_keys: + options |= orjson.OPT_SORT_KEYS + return orjson.dumps(payload, option=options).decode("utf-8") + + +def _git_diff_lines_payload( + *, + root_path: Path, + git_diff_ref: str, +) -> tuple[str, ...]: + try: + validated_ref = validate_git_diff_ref(git_diff_ref) + except ValueError as exc: + raise MCPGitDiffError(str(exc)) from exc + try: + completed = subprocess.run( + ["git", "diff", "--name-only", validated_ref, "--"], + cwd=root_path, + check=True, + capture_output=True, + text=True, + timeout=30, + ) + except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc: + raise MCPGitDiffError( + f"Unable to resolve changed paths from git diff ref '{validated_ref}'." + ) from exc + return tuple( + sorted({line.strip() for line in completed.stdout.splitlines() if line.strip()}) + ) + + +def _load_report_document_payload(report_json: str) -> dict[str, object]: + try: + payload = orjson.loads(report_json) + except JSONDecodeError as exc: + raise MCPServiceError( + f"Generated canonical report is not valid JSON: {exc}" + ) from exc + if not isinstance(payload, dict): + raise MCPServiceError("Generated canonical report must be a JSON object.") + return dict(payload) + + +def _validated_history_limit(history_limit: int) -> int: + if not 1 <= history_limit <= MAX_MCP_HISTORY_LIMIT: + raise ValueError( + f"history_limit must be between 1 and {MAX_MCP_HISTORY_LIMIT}." 
+ ) + return history_limit + + +class MCPServiceError(RuntimeError): + """Base class for CodeClone MCP service errors.""" + + +class MCPServiceContractError(MCPServiceError): + """Raised when an MCP request violates the CodeClone service contract.""" + + +class MCPRunNotFoundError(MCPServiceError): + """Raised when a requested MCP run is not available in the in-memory registry.""" + + +class MCPFindingNotFoundError(MCPServiceError): + """Raised when a requested finding id is not present in the selected run.""" + + +class MCPGitDiffError(MCPServiceError): + """Raised when changed paths cannot be resolved from a git ref.""" + + +class _BufferConsole: + def __init__(self) -> None: + self.messages: list[str] = [] + + def print(self, *objects: object, **_kwargs: object) -> None: + text = " ".join(str(obj) for obj in objects).strip() + if text: + self.messages.append(text) + + +@dataclass(frozen=True, slots=True) +class MCPAnalysisRequest: + root: str | None = None + analysis_mode: AnalysisMode = "full" + respect_pyproject: bool = True + changed_paths: tuple[str, ...] = () + git_diff_ref: str | None = None + processes: int | None = None + min_loc: int | None = None + min_stmt: int | None = None + block_min_loc: int | None = None + block_min_stmt: int | None = None + segment_min_loc: int | None = None + segment_min_stmt: int | None = None + api_surface: bool | None = None + coverage_xml: str | None = None + coverage_min: int | None = None + complexity_threshold: int | None = None + coupling_threshold: int | None = None + cohesion_threshold: int | None = None + baseline_path: str | None = None + metrics_baseline_path: str | None = None + max_baseline_size_mb: int | None = None + cache_policy: CachePolicy = "reuse" + cache_path: str | None = None + max_cache_size_mb: int | None = None + + +@dataclass(frozen=True, slots=True) +class MCPGateRequest: + run_id: str | None = None + fail_on_new: bool = False + fail_threshold: int = -1 + fail_complexity: int = -1 + fail_coupling: int = -1 + fail_cohesion: int = -1 + fail_cycles: bool = False + fail_dead_code: bool = False + fail_health: int = -1 + fail_on_new_metrics: bool = False + fail_on_typing_regression: bool = False + fail_on_docstring_regression: bool = False + fail_on_api_break: bool = False + fail_on_untested_hotspots: bool = False + min_typing_coverage: int = -1 + min_docstring_coverage: int = -1 + coverage_min: int = DEFAULT_COVERAGE_MIN + + +@dataclass(frozen=True, slots=True) +class MCPRunRecord: + run_id: str + root: Path + request: MCPAnalysisRequest + comparison_settings: tuple[object, ...] + report_document: dict[str, object] + summary: dict[str, object] + changed_paths: tuple[str, ...] + changed_projection: dict[str, object] | None + warnings: tuple[str, ...] + failures: tuple[str, ...] + func_clones_count: int + block_clones_count: int + project_metrics: ProjectMetrics | None + coverage_join: CoverageJoinResult | None + suggestions: tuple[Suggestion, ...] 
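+ # Baseline-diff state: clone-group ids that are new relative to the
+ # trusted baseline, plus the optional metrics delta for this run.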
+ new_func: frozenset[str] + new_block: frozenset[str] + metrics_diff: MetricsDiff | None + + +class CodeCloneMCPRunStore: + def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None: + self._history_limit = _validated_history_limit(history_limit) + self._lock = RLock() + self._records: OrderedDict[str, MCPRunRecord] = OrderedDict() + self._latest_run_id: str | None = None + + def register(self, record: MCPRunRecord) -> MCPRunRecord: + with self._lock: + self._records.pop(record.run_id, None) + self._records[record.run_id] = record + self._records.move_to_end(record.run_id) + self._latest_run_id = record.run_id + while len(self._records) > self._history_limit: + self._records.popitem(last=False) + return record + + def get(self, run_id: str | None = None) -> MCPRunRecord: + with self._lock: + resolved_run_id = self._resolve_run_id(run_id) + if resolved_run_id is None: + raise MCPRunNotFoundError("No matching MCP analysis run is available.") + return self._records[resolved_run_id] + + def _resolve_run_id(self, run_id: str | None) -> str | None: + if run_id is None: + return self._latest_run_id + if run_id in self._records: + return run_id + matches = [ + candidate for candidate in self._records if candidate.startswith(run_id) + ] + if len(matches) == 1: + return matches[0] + if len(matches) > 1: + raise MCPServiceContractError( + f"Run id '{run_id}' is ambiguous in this MCP session." + ) + return None + + def records(self) -> tuple[MCPRunRecord, ...]: + with self._lock: + return tuple(self._records.values()) + + def clear(self) -> tuple[str, ...]: + with self._lock: + removed_run_ids = tuple(self._records.keys()) + self._records.clear() + self._latest_run_id = None + return removed_run_ids + + +__all__ = [ + "CATEGORY_CLONE", + "CATEGORY_COHESION", + "CATEGORY_COMPLEXITY", + "CATEGORY_COUPLING", + "CATEGORY_DEAD_CODE", + "CATEGORY_DEPENDENCY", + "CATEGORY_STRUCTURAL", + "CONFIDENCE_MEDIUM", + "DEFAULT_BASELINE_PATH", + "DEFAULT_BLOCK_MIN_LOC", + "DEFAULT_BLOCK_MIN_STMT", + "DEFAULT_COVERAGE_MIN", + "DEFAULT_MAX_BASELINE_SIZE_MB", + "DEFAULT_MAX_CACHE_SIZE_MB", + "DEFAULT_MCP_HISTORY_LIMIT", + "DEFAULT_MIN_LOC", + "DEFAULT_MIN_STMT", + "DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD", + "DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD", + "DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD", + "DEFAULT_SEGMENT_MIN_LOC", + "DEFAULT_SEGMENT_MIN_STMT", + "EFFORT_EASY", + "EFFORT_HARD", + "EFFORT_MODERATE", + "FAMILY_CLONE", + "FAMILY_CLONES", + "FAMILY_DEAD_CODE", + "FAMILY_DESIGN", + "FAMILY_STRUCTURAL", + "REPORT_SCHEMA_VERSION", + "SEVERITY_CRITICAL", + "SEVERITY_INFO", + "SEVERITY_WARNING", + "SOURCE_KIND_ORDER", + "SOURCE_KIND_OTHER", + "SOURCE_KIND_PRODUCTION", + "_CHECK_TO_DIMENSION", + "_COMPACT_ITEM_EMPTY_VALUES", + "_COMPACT_ITEM_PATH_KEYS", + "_CONFIDENCE_WEIGHT", + "_DESIGN_CHECK_CONTEXT", + "_EFFORT_WEIGHT", + "_FOCUS_CHANGED_PATHS", + "_FOCUS_PRODUCTION", + "_FOCUS_REPOSITORY", + "_HEALTH_SCOPE_REPOSITORY", + "_HELP_TOPIC_SPECS", + "_HOTLIST_REPORT_KEYS", + "_MCP_CONFIG_KEYS", + "_METRICS_DETAIL_FAMILY_ALIASES", + "_NOVELTY_WEIGHT", + "_REPORT_DUMMY_PATH", + "_RUNTIME_WEIGHT", + "_SEVERITY_WEIGHT", + "_SHORT_RUN_ID_LENGTH", + "_SOURCE_KIND_BREAKDOWN_ORDER", + "_VALID_ANALYSIS_MODES", + "_VALID_CACHE_POLICIES", + "_VALID_COMPARISON_FOCUS", + "_VALID_DETAIL_LEVELS", + "_VALID_FINDING_FAMILIES", + "_VALID_FINDING_NOVELTY", + "_VALID_FINDING_SORT", + "_VALID_HELP_DETAILS", + "_VALID_HELP_TOPICS", + "_VALID_HOTLIST_KINDS", + "_VALID_METRICS_DETAIL_FAMILIES", + 
"_VALID_PR_SUMMARY_FORMATS", + "_VALID_REPORT_SECTIONS", + "_VALID_SEVERITIES", + "AnalysisMode", + "Baseline", + "Cache", + "CachePolicy", + "CacheStatus", + "ChoiceT", + "CodeCloneMCPRunStore", + "ComparisonFocus", + "ConfigValidationError", + "DetailLevel", + "FindingFamilyFilter", + "FindingNoveltyFilter", + "FindingSort", + "FreshnessKind", + "GatingResult", + "HelpDetail", + "HelpTopic", + "HotlistKind", + "Iterable", + "MCPAnalysisRequest", + "MCPFindingNotFoundError", + "MCPGateRequest", + "MCPRunNotFoundError", + "MCPRunRecord", + "MCPServiceContractError", + "MCPServiceError", + "Mapping", + "MetricGateConfig", + "MetricsDetailFamily", + "MetricsDiff", + "Namespace", + "OrderedDict", + "OutputPaths", + "PRSummaryFormat", + "Path", + "RLock", + "ReportSection", + "Sequence", + "_BufferConsole", + "__version__", + "_as_float", + "_as_int", + "_base_short_finding_id_payload", + "_disambiguated_clone_short_ids_payload", + "_disambiguated_short_finding_id_payload", + "_evaluate_report_gates", + "_git_diff_lines_payload", + "_json_text_payload", + "_leaf_symbol_name_payload", + "_load_report_document_payload", + "_suggestion_finding_id_payload", + "_summarize_metrics_diff", + "analyze", + "bootstrap", + "discover", + "load_pyproject_config", + "paginate", + "process", + "report", + "resolve_finding_id", + "short_id", +] diff --git a/codeclone/surfaces/mcp/_session_state_mixin.py b/codeclone/surfaces/mcp/_session_state_mixin.py new file mode 100644 index 0000000..a8d58e5 --- /dev/null +++ b/codeclone/surfaces/mcp/_session_state_mixin.py @@ -0,0 +1,1211 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from ...baseline.metrics_baseline import probe_metrics_baseline_section +from . 
import _session_helpers as _helpers +from ._session_baseline import ( + CloneBaselineState, + MetricsBaselineState, +) +from ._session_finding_mixin import _MCPSessionFindingMixin, _StateLock +from ._session_runtime import validate_numeric_args +from ._session_shared import ( + _FOCUS_PRODUCTION, + _FOCUS_REPOSITORY, + _HEALTH_SCOPE_REPOSITORY, + _HELP_TOPIC_SPECS, + _MCP_CONFIG_KEYS, + _METRICS_DETAIL_FAMILY_ALIASES, + _VALID_COMPARISON_FOCUS, + _VALID_HELP_DETAILS, + _VALID_HELP_TOPICS, + _VALID_METRICS_DETAIL_FAMILIES, + _VALID_PR_SUMMARY_FORMATS, + _VALID_REPORT_SECTIONS, + DEFAULT_BASELINE_PATH, + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_COVERAGE_MIN, + DEFAULT_MAX_BASELINE_SIZE_MB, + DEFAULT_MAX_CACHE_SIZE_MB, + DEFAULT_MIN_LOC, + DEFAULT_MIN_STMT, + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, + FAMILY_CLONE, + REPORT_SCHEMA_VERSION, + SOURCE_KIND_PRODUCTION, + CacheStatus, + CodeCloneMCPRunStore, + ComparisonFocus, + ConfigValidationError, + GatingResult, + HelpDetail, + HelpTopic, + Mapping, + MCPAnalysisRequest, + MCPGateRequest, + MCPRunRecord, + MCPServiceContractError, + MetricGateConfig, + MetricsDetailFamily, + MetricsDiff, + Namespace, + OrderedDict, + Path, + PRSummaryFormat, + ReportSection, + Sequence, + __version__, + _as_int, + _evaluate_report_gates, + _json_text_payload, + load_pyproject_config, + paginate, +) + + +class _MCPSessionChangedProjectionMixin(_MCPSessionFindingMixin): + _runs: CodeCloneMCPRunStore + _state_lock: _StateLock + _review_state: dict[str, OrderedDict[str, str | None]] + _last_gate_results: dict[str, dict[str, object]] + _spread_max_cache: dict[str, int] + + def _build_changed_projection( + self, + record: MCPRunRecord, + ) -> dict[str, object] | None: + if not record.changed_paths: + return None + items = self._query_findings( + record=record, + detail_level="summary", + changed_paths=record.changed_paths, + ) + new_count = sum(1 for item in items if str(item.get("novelty", "")) == "new") + known_count = sum( + 1 for item in items if str(item.get("novelty", "")) == "known" + ) + new_by_source_kind = _helpers._source_kind_breakdown( + item.get("source_kind") + for item in items + if str(item.get("novelty", "")) == "new" + ) + health_delta = _helpers._summary_health_delta(record.summary) + return { + "run_id": _helpers._short_run_id(record.run_id), + "changed_paths": list(record.changed_paths), + "total": len(items), + "new": new_count, + "known": known_count, + "new_by_source_kind": new_by_source_kind, + "items": items, + "health": dict(_helpers._summary_health_payload(record.summary)), + "health_delta": health_delta, + "verdict": _helpers._changed_verdict( + changed_projection={"new": new_count, "total": len(items)}, + health_delta=health_delta, + ), + } + + def _augment_summary_with_changed( + self, + *, + summary: Mapping[str, object], + changed_paths: Sequence[str], + changed_projection: Mapping[str, object] | None, + ) -> dict[str, object]: + payload = dict(summary) + if changed_paths: + payload["changed_paths"] = list(changed_paths) + if changed_projection is not None: + payload["changed_findings"] = { + "total": _as_int(changed_projection.get("total", 0), 0), + "new": _as_int(changed_projection.get("new", 0), 0), + "known": _as_int(changed_projection.get("known", 0), 0), + "items": [ + dict(_helpers._as_mapping(item)) + for item in 
_helpers._as_sequence(changed_projection.get("items"))[ + :10 + ] + ], + } + payload["health_delta"] = ( + _as_int(changed_projection.get("health_delta", 0), 0) + if changed_projection.get("health_delta") is not None + else None + ) + payload["verdict"] = str(changed_projection.get("verdict", "stable")) + return payload + + +class _MCPSessionAnalysisArgsMixin(_MCPSessionChangedProjectionMixin): + _runs: CodeCloneMCPRunStore + _state_lock: _StateLock + _review_state: dict[str, OrderedDict[str, str | None]] + _last_gate_results: dict[str, dict[str, object]] + _spread_max_cache: dict[str, int] + + def _comparison_index( + self, + record: MCPRunRecord, + *, + focus: str, + ) -> dict[str, dict[str, object]]: + findings = self._base_findings(record) + if focus == "clones": + findings = [f for f in findings if str(f.get("family", "")) == "clone"] + elif focus == "structural": + findings = [f for f in findings if str(f.get("family", "")) == "structural"] + elif focus == "metrics": + findings = [ + f + for f in findings + if str(f.get("family", "")) in {"design", "dead_code"} + ] + return {str(finding.get("id", "")): dict(finding) for finding in findings} + + def _build_args(self, *, root_path: Path, request: MCPAnalysisRequest) -> Namespace: + args = Namespace( + root=str(root_path), + min_loc=DEFAULT_MIN_LOC, + min_stmt=DEFAULT_MIN_STMT, + block_min_loc=DEFAULT_BLOCK_MIN_LOC, + block_min_stmt=DEFAULT_BLOCK_MIN_STMT, + segment_min_loc=DEFAULT_SEGMENT_MIN_LOC, + segment_min_stmt=DEFAULT_SEGMENT_MIN_STMT, + processes=None, + cache_path=None, + max_cache_size_mb=DEFAULT_MAX_CACHE_SIZE_MB, + baseline=DEFAULT_BASELINE_PATH, + max_baseline_size_mb=DEFAULT_MAX_BASELINE_SIZE_MB, + update_baseline=False, + fail_on_new=False, + fail_threshold=-1, + ci=False, + fail_complexity=-1, + fail_coupling=-1, + fail_cohesion=-1, + fail_cycles=False, + fail_dead_code=False, + fail_health=-1, + fail_on_new_metrics=False, + fail_on_typing_regression=False, + fail_on_docstring_regression=False, + fail_on_api_break=False, + min_typing_coverage=-1, + min_docstring_coverage=-1, + api_surface=False, + coverage_xml=None, + fail_on_untested_hotspots=False, + coverage_min=DEFAULT_COVERAGE_MIN, + design_complexity_threshold=DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + design_coupling_threshold=DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + design_cohesion_threshold=DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + update_metrics_baseline=False, + metrics_baseline=DEFAULT_BASELINE_PATH, + skip_metrics=False, + skip_dead_code=False, + skip_dependencies=False, + golden_fixture_paths=(), + html_out=None, + json_out=None, + md_out=None, + sarif_out=None, + text_out=None, + no_progress=True, + no_color=True, + quiet=True, + verbose=False, + debug=False, + open_html_report=False, + timestamped_report_paths=False, + ) + if request.respect_pyproject: + try: + config_values = load_pyproject_config(root_path) + except ConfigValidationError as exc: + raise MCPServiceContractError(str(exc)) from exc + for key in sorted(_MCP_CONFIG_KEYS.intersection(config_values)): + setattr(args, key, config_values[key]) + + self._apply_request_overrides(args=args, root_path=root_path, request=request) + + if request.analysis_mode == "clones_only": + args.skip_metrics = True + args.skip_dead_code = True + args.skip_dependencies = True + else: + args.skip_metrics = False + args.skip_dead_code = False + args.skip_dependencies = False + + if not validate_numeric_args(args): + raise MCPServiceContractError( + "Numeric analysis settings must be non-negative and 
thresholds " + "must be >= -1. Coverage thresholds must be between 0 and 100." + ) + + return args + + def _apply_request_overrides( + self, + *, + args: Namespace, + root_path: Path, + request: MCPAnalysisRequest, + ) -> None: + override_map: dict[str, object | None] = { + "processes": request.processes, + "min_loc": request.min_loc, + "min_stmt": request.min_stmt, + "block_min_loc": request.block_min_loc, + "block_min_stmt": request.block_min_stmt, + "segment_min_loc": request.segment_min_loc, + "segment_min_stmt": request.segment_min_stmt, + "api_surface": request.api_surface, + "coverage_min": request.coverage_min, + "max_baseline_size_mb": request.max_baseline_size_mb, + "max_cache_size_mb": request.max_cache_size_mb, + "design_complexity_threshold": request.complexity_threshold, + "design_coupling_threshold": request.coupling_threshold, + "design_cohesion_threshold": request.cohesion_threshold, + } + for key, value in override_map.items(): + if value is not None: + setattr(args, key, value) + + if request.baseline_path is not None: + args.baseline = str( + _helpers._resolve_optional_path(request.baseline_path, root_path) + ) + if request.metrics_baseline_path is not None: + args.metrics_baseline = str( + _helpers._resolve_optional_path( + request.metrics_baseline_path, + root_path, + ) + ) + if request.cache_path is not None: + args.cache_path = str( + _helpers._resolve_optional_path(request.cache_path, root_path) + ) + if request.coverage_xml is not None: + args.coverage_xml = str( + _helpers._resolve_optional_path(request.coverage_xml, root_path) + ) + + def _resolve_baseline_inputs( + self, + *, + root_path: Path, + args: Namespace, + ) -> tuple[Path, bool, Path, bool, dict[str, object] | None]: + baseline_path = _helpers._resolve_optional_path(str(args.baseline), root_path) + baseline_exists = baseline_path.exists() + + metrics_baseline_arg_path = _helpers._resolve_optional_path( + str(args.metrics_baseline), + root_path, + ) + shared_baseline_payload: dict[str, object] | None = None + if metrics_baseline_arg_path == baseline_path: + probe = probe_metrics_baseline_section(metrics_baseline_arg_path) + metrics_baseline_exists = probe.has_metrics_section + shared_baseline_payload = probe.payload + else: + metrics_baseline_exists = metrics_baseline_arg_path.exists() + + return ( + baseline_path, + baseline_exists, + metrics_baseline_arg_path, + metrics_baseline_exists, + shared_baseline_payload, + ) + + +class _MCPSessionRunSummaryBuilderMixin(_MCPSessionAnalysisArgsMixin): + _runs: CodeCloneMCPRunStore + _state_lock: _StateLock + _review_state: dict[str, OrderedDict[str, str | None]] + _last_gate_results: dict[str, dict[str, object]] + _spread_max_cache: dict[str, int] + + def _changed_analysis_payload( + self, + record: MCPRunRecord, + ) -> dict[str, object]: + changed_projection = _helpers._as_mapping(record.changed_projection) + health = _helpers._summary_health_payload(record.summary) + health_payload = ( + { + "score": health.get("score"), + "grade": health.get("grade"), + } + if health.get("available") is not False + else dict(health) + ) + return { + "run_id": _helpers._short_run_id(record.run_id), + "focus": "changed_paths", + "health_scope": "repository", + "baseline": dict( + _helpers._summary_trusted_state_payload( + record.summary, + key="baseline", + ) + ), + "changed_files": len(record.changed_paths), + "health": health_payload, + "analysis_profile": _helpers._summary_analysis_profile_payload( + record.summary + ), + "health_delta": ( + 
_as_int(changed_projection.get("health_delta", 0), 0) + if changed_projection.get("health_delta") is not None + else None + ), + "verdict": str(changed_projection.get("verdict", "stable")), + "new_findings": _as_int(changed_projection.get("new", 0), 0), + "new_by_source_kind": dict( + _helpers._as_mapping(changed_projection.get("new_by_source_kind")) + ), + "resolved_findings": 0, + "changed_findings": [], + "coverage_join": _helpers._summary_coverage_join_payload(record), + } + + def _build_run_summary_payload( + self, + *, + run_id: str, + root_path: Path, + request: MCPAnalysisRequest, + report_document: Mapping[str, object], + baseline_state: CloneBaselineState, + metrics_baseline_state: MetricsBaselineState, + cache_status: CacheStatus, + new_func: Sequence[str] | set[str], + new_block: Sequence[str] | set[str], + metrics_diff: MetricsDiff | None, + warnings: Sequence[str], + failures: Sequence[str], + ) -> dict[str, object]: + meta = _helpers._as_mapping(report_document.get("meta")) + meta_baseline = _helpers._as_mapping(meta.get("baseline")) + meta_metrics_baseline = _helpers._as_mapping(meta.get("metrics_baseline")) + meta_cache = _helpers._as_mapping(meta.get("cache")) + inventory = _helpers._as_mapping(report_document.get("inventory")) + findings = _helpers._as_mapping(report_document.get("findings")) + metrics = _helpers._as_mapping(report_document.get("metrics")) + metrics_summary = _helpers._as_mapping(metrics.get("summary")) + summary = _helpers._as_mapping(findings.get("summary")) + analysis_profile = _helpers._summary_analysis_profile_payload(meta) + payload = { + "run_id": run_id, + "root": str(root_path), + "analysis_mode": request.analysis_mode, + "codeclone_version": meta.get("codeclone_version", __version__), + "python_tag": str(meta.get("python_tag", "")), + "report_schema_version": report_document.get( + "report_schema_version", + REPORT_SCHEMA_VERSION, + ), + "baseline": { + "path": meta_baseline.get( + "path", + str(root_path / DEFAULT_BASELINE_PATH), + ), + "loaded": bool(meta_baseline.get("loaded", baseline_state.loaded)), + "status": str(meta_baseline.get("status", baseline_state.status.value)), + "trusted_for_diff": baseline_state.trusted_for_diff, + "python_tag": meta_baseline.get("python_tag"), + }, + "metrics_baseline": { + "path": meta_metrics_baseline.get( + "path", + str(root_path / DEFAULT_BASELINE_PATH), + ), + "loaded": bool( + meta_metrics_baseline.get( + "loaded", + metrics_baseline_state.loaded, + ) + ), + "status": str( + meta_metrics_baseline.get( + "status", + metrics_baseline_state.status.value, + ) + ), + "trusted_for_diff": metrics_baseline_state.trusted_for_diff, + }, + "cache": { + "path": meta_cache.get("path"), + "status": str(meta_cache.get("status", cache_status.value)), + "used": bool(meta_cache.get("used", False)), + "schema_version": meta_cache.get("schema_version"), + }, + "inventory": dict(inventory), + "findings_summary": dict(summary), + "health": dict(_helpers._as_mapping(metrics_summary.get("health"))), + "baseline_diff": { + "new_function_clone_groups": len(new_func), + "new_block_clone_groups": len(new_block), + "new_clone_groups_total": len(new_func) + len(new_block), + }, + "metrics_diff": _helpers._metrics_diff_payload(metrics_diff), + "warnings": list(warnings), + "failures": list(failures), + } + if analysis_profile: + payload["analysis_profile"] = analysis_profile + payload["cache"] = _helpers._summary_cache_payload(payload) + payload["health"] = _helpers._summary_health_payload(payload) + return payload + + +class 
_MCPSessionSummaryMixin(_MCPSessionRunSummaryBuilderMixin): + _runs: CodeCloneMCPRunStore + _state_lock: _StateLock + _review_state: dict[str, OrderedDict[str, str | None]] + _last_gate_results: dict[str, dict[str, object]] + _spread_max_cache: dict[str, int] + + def _summary_payload( + self, + summary: Mapping[str, object], + *, + record: MCPRunRecord | None = None, + ) -> dict[str, object]: + inventory = _helpers._as_mapping(summary.get("inventory")) + if ( + not summary.get("run_id") + and not record + and "inventory" in summary + and not summary.get("baseline") + ): + return { + "focus": _FOCUS_REPOSITORY, + "health_scope": _HEALTH_SCOPE_REPOSITORY, + "inventory": _helpers._summary_inventory_payload(inventory), + "health": _helpers._summary_health_payload(summary), + } + resolved_run_id = ( + record.run_id if record is not None else str(summary.get("run_id", "")) + ) + payload: dict[str, object] = { + "run_id": ( + _helpers._short_run_id(resolved_run_id) if resolved_run_id else "" + ), + "focus": _FOCUS_REPOSITORY, + "health_scope": _HEALTH_SCOPE_REPOSITORY, + "version": str(summary.get("codeclone_version", __version__)), + "schema": str(summary.get("report_schema_version", "")), + "mode": str(summary.get("analysis_mode", "")), + "baseline": self._summary_baseline_payload(summary), + "metrics_baseline": self._summary_metrics_baseline_payload(summary), + "cache": _helpers._summary_cache_payload(summary), + "inventory": _helpers._summary_inventory_payload(inventory), + "health": _helpers._summary_health_payload(summary), + "findings": self._summary_findings_payload(summary, record=record), + "diff": _helpers._summary_diff_payload(summary), + "warnings": list(_helpers._as_sequence(summary.get("warnings"))), + "failures": list(_helpers._as_sequence(summary.get("failures"))), + } + analysis_profile = _helpers._summary_analysis_profile_payload(summary) + if analysis_profile: + payload["analysis_profile"] = analysis_profile + if record is not None: + coverage_join = _helpers._summary_coverage_join_payload(record) + if coverage_join: + payload["coverage_join"] = coverage_join + security_surfaces = _helpers._summary_security_surfaces_payload(record) + if security_surfaces: + payload["security_surfaces"] = security_surfaces + return payload + + def _summary_baseline_payload( + self, + summary: Mapping[str, object], + ) -> dict[str, object]: + return _helpers._summary_trusted_state_payload(summary, key="baseline") + + def _summary_metrics_baseline_payload( + self, + summary: Mapping[str, object], + ) -> dict[str, object]: + return _helpers._summary_trusted_state_payload(summary, key="metrics_baseline") + + def _summary_findings_payload( + self, + summary: Mapping[str, object], + *, + record: MCPRunRecord | None, + ) -> dict[str, object]: + findings_summary = _helpers._as_mapping(summary.get("findings_summary")) + if record is None: + return { + "total": _as_int(findings_summary.get("total", 0), 0), + "new": 0, + "known": 0, + "by_family": {}, + "production": 0, + "new_by_source_kind": _helpers._source_kind_breakdown(()), + } + findings = self._base_findings(record) + by_family: dict[str, int] = { + "clones": 0, + "structural": 0, + "dead_code": 0, + "design": 0, + } + new_count = 0 + known_count = 0 + production_count = 0 + new_by_source_kind = _helpers._source_kind_breakdown( + _helpers._finding_source_kind(finding) + for finding in findings + if str(finding.get("novelty", "")).strip() == "new" + ) + for finding in findings: + family = str(finding.get("family", "")).strip() + family_key = 
"clones" if family == FAMILY_CLONE else family + if family_key in by_family: + by_family[family_key] += 1 + if str(finding.get("novelty", "")).strip() == "new": + new_count += 1 + else: + known_count += 1 + if _helpers._finding_source_kind(finding) == SOURCE_KIND_PRODUCTION: + production_count += 1 + return { + "total": len(findings), + "new": new_count, + "known": known_count, + "by_family": {key: value for key, value in by_family.items() if value > 0}, + "production": production_count, + "new_by_source_kind": new_by_source_kind, + } + + def _metrics_detail_payload( + self, + *, + metrics: Mapping[str, object], + family: MetricsDetailFamily | None, + path: str | None, + offset: int, + limit: int, + ) -> dict[str, object]: + summary = dict(_helpers._as_mapping(metrics.get("summary"))) + families = _helpers._as_mapping(metrics.get("families")) + normalized_path = _helpers._normalize_relative_path(path or "") + if family is None and not normalized_path: + return { + "summary": summary, + "_hint": "Use family and/or path parameters to access per-item detail.", + } + family_names = (family,) if family is not None else tuple(sorted(families)) + items: list[dict[str, object]] = [] + for family_name in family_names: + family_payload = _helpers._as_mapping(families.get(family_name)) + for item in _helpers._as_sequence(family_payload.get("items")): + item_map = _helpers._as_mapping(item) + if normalized_path and not _helpers._metric_item_matches_path( + item_map, + normalized_path, + ): + continue + compact_item = _helpers._compact_metrics_item(item_map) + if family is None: + compact_item = {"family": family_name, **compact_item} + items.append(compact_item) + if family is None: + items.sort( + key=lambda item: ( + str(item.get("family", "")), + str(item.get("path", "")), + str(item.get("qualname", "")), + _as_int(item.get("start_line", 0), 0), + ) + ) + page = paginate(items, offset=offset, limit=limit, max_limit=200) + return { + "family": family, + "path": normalized_path or None, + "offset": page.offset, + "limit": page.limit, + "returned": len(page.items), + "total": page.total, + "has_more": page.next_offset is not None, + "items": page.items, + } + + def _derived_section_payload(self, record: MCPRunRecord) -> dict[str, object]: + derived = _helpers._as_mapping(record.report_document.get("derived")) + if not derived: + raise MCPServiceContractError( + "Report section 'derived' is not available in this run." 
+ ) + suggestions = self._triage_suggestion_rows(record) + canonical_to_short, _ = self._finding_id_maps(record) + hotlists = _helpers._as_mapping(derived.get("hotlists")) + projected_hotlists: dict[str, list[str]] = {} + for hotlist_key, hotlist_ids in hotlists.items(): + projected_hotlists[hotlist_key] = [ + canonical_to_short.get( + str(finding_id), + _helpers._base_short_finding_id(str(finding_id)), + ) + for finding_id in _helpers._as_sequence(hotlist_ids) + if str(finding_id) + ] + return { + "suggestions": suggestions, + "hotlists": projected_hotlists, + } + + +class _MCPSessionReportMixin(_MCPSessionSummaryMixin): + _runs: CodeCloneMCPRunStore + _state_lock: _StateLock + _review_state: dict[str, OrderedDict[str, str | None]] + _last_gate_results: dict[str, dict[str, object]] + _spread_max_cache: dict[str, int] + + def get_run_summary(self, run_id: str | None = None) -> dict[str, object]: + record = self._runs.get(run_id) + return self._summary_payload(record.summary, record=record) + + def compare_runs( + self, + *, + run_id_before: str, + run_id_after: str | None = None, + focus: ComparisonFocus = "all", + ) -> dict[str, object]: + validated_focus = _helpers._validate_choice( + "focus", + focus, + _VALID_COMPARISON_FOCUS, + ) + before = self._runs.get(run_id_before) + after = self._runs.get(run_id_after) + before_findings = self._comparison_index(before, focus=validated_focus) + after_findings = self._comparison_index(after, focus=validated_focus) + before_ids = set(before_findings) + after_ids = set(after_findings) + regressions = sorted(after_ids - before_ids) + improvements = sorted(before_ids - after_ids) + common = before_ids & after_ids + health_before = _helpers._summary_health_score(before.summary) + health_after = _helpers._summary_health_score(after.summary) + comparability = _helpers._comparison_scope(before=before, after=after) + comparable = bool(comparability["comparable"]) + health_delta = ( + health_after - health_before + if comparable and health_before is not None and health_after is not None + else None + ) + verdict = ( + _helpers._comparison_verdict( + regressions=len(regressions), + improvements=len(improvements), + health_delta=health_delta, + ) + if comparable + else "incomparable" + ) + regressions_payload = ( + [ + self._comparison_finding_card( + after, + after_findings[finding_id], + ) + for finding_id in regressions + ] + if comparable + else [] + ) + improvements_payload = ( + [ + self._comparison_finding_card( + before, + before_findings[finding_id], + ) + for finding_id in improvements + ] + if comparable + else [] + ) + payload: dict[str, object] = { + "before": { + "run_id": _helpers._short_run_id(before.run_id), + "health": health_before, + }, + "after": { + "run_id": _helpers._short_run_id(after.run_id), + "health": health_after, + }, + "comparable": comparable, + "health_delta": health_delta, + "verdict": verdict, + "regressions": regressions_payload, + "improvements": improvements_payload, + "unchanged": len(common) if comparable else None, + "summary": _helpers._comparison_summary_text( + comparable=comparable, + comparability_reason=str(comparability["reason"]), + regressions=len(regressions), + improvements=len(improvements), + health_delta=health_delta, + ), + } + if not comparable: + payload["reason"] = comparability["reason"] + return payload + + +class _MCPSessionStateMixin(_MCPSessionReportMixin): + _runs: CodeCloneMCPRunStore + _state_lock: _StateLock + _review_state: dict[str, OrderedDict[str, str | None]] + _last_gate_results: 
dict[str, dict[str, object]] + _spread_max_cache: dict[str, int] + + def evaluate_gates(self, request: MCPGateRequest) -> dict[str, object]: + record = self._runs.get(request.run_id) + gate_result = self._evaluate_gate_snapshot(record=record, request=request) + result = { + "run_id": _helpers._short_run_id(record.run_id), + "would_fail": gate_result.exit_code != 0, + "exit_code": gate_result.exit_code, + "reasons": list(gate_result.reasons), + "config": { + "fail_on_new": request.fail_on_new, + "fail_threshold": request.fail_threshold, + "fail_complexity": request.fail_complexity, + "fail_coupling": request.fail_coupling, + "fail_cohesion": request.fail_cohesion, + "fail_cycles": request.fail_cycles, + "fail_dead_code": request.fail_dead_code, + "fail_health": request.fail_health, + "fail_on_new_metrics": request.fail_on_new_metrics, + "fail_on_typing_regression": request.fail_on_typing_regression, + "fail_on_docstring_regression": request.fail_on_docstring_regression, + "fail_on_api_break": request.fail_on_api_break, + "fail_on_untested_hotspots": request.fail_on_untested_hotspots, + "min_typing_coverage": request.min_typing_coverage, + "min_docstring_coverage": request.min_docstring_coverage, + "coverage_min": request.coverage_min, + }, + } + with self._state_lock: + self._last_gate_results[record.run_id] = dict(result) + return result + + def _evaluate_gate_snapshot( + self, + *, + record: MCPRunRecord, + request: MCPGateRequest, + ) -> GatingResult: + if request.fail_on_untested_hotspots: + if record.coverage_join is None: + raise MCPServiceContractError( + "Coverage gating requires a run created with coverage_xml." + ) + if record.coverage_join.status != "ok": + detail = record.coverage_join.invalid_reason or "invalid coverage input" + raise MCPServiceContractError( + "Coverage gating requires a valid Cobertura XML input. 
" + f"Reason: {detail}" + ) + return _evaluate_report_gates( + report_document=record.report_document, + config=MetricGateConfig( + fail_complexity=request.fail_complexity, + fail_coupling=request.fail_coupling, + fail_cohesion=request.fail_cohesion, + fail_cycles=request.fail_cycles, + fail_dead_code=request.fail_dead_code, + fail_health=request.fail_health, + fail_on_new_metrics=request.fail_on_new_metrics, + fail_on_typing_regression=request.fail_on_typing_regression, + fail_on_docstring_regression=request.fail_on_docstring_regression, + fail_on_api_break=request.fail_on_api_break, + fail_on_untested_hotspots=request.fail_on_untested_hotspots, + min_typing_coverage=request.min_typing_coverage, + min_docstring_coverage=request.min_docstring_coverage, + coverage_min=request.coverage_min, + fail_on_new=request.fail_on_new, + fail_threshold=request.fail_threshold, + ), + baseline_status=str( + _helpers._as_mapping( + _helpers._as_mapping(record.report_document.get("meta")).get( + "baseline" + ) + ).get("status", "") + ), + metrics_diff=record.metrics_diff, + clone_new_count=len(record.new_func) + len(record.new_block), + clone_total=record.func_clones_count + record.block_clones_count, + ) + + def get_report_section( + self, + *, + run_id: str | None = None, + section: ReportSection = "all", + family: MetricsDetailFamily | None = None, + path: str | None = None, + offset: int = 0, + limit: int = 50, + ) -> dict[str, object]: + validated_section = _helpers._validate_choice( + "section", + section, + _VALID_REPORT_SECTIONS, + ) + record = self._runs.get(run_id) + report_document = record.report_document + if validated_section == "all": + return dict(report_document) + if validated_section == "changed": + if record.changed_projection is None: + raise MCPServiceContractError( + "Report section 'changed' is not available in this run." + ) + return dict(record.changed_projection) + if validated_section == "metrics": + metrics = _helpers._as_mapping(report_document.get("metrics")) + return {"summary": dict(_helpers._as_mapping(metrics.get("summary")))} + if validated_section == "metrics_detail": + metrics = _helpers._as_mapping(report_document.get("metrics")) + if not metrics: + raise MCPServiceContractError( + "Report section 'metrics_detail' is not available in this run." + ) + validated_family_input = _helpers._validate_optional_choice( + "family", + family, + _VALID_METRICS_DETAIL_FAMILIES, + ) + normalized_family = ( + _METRICS_DETAIL_FAMILY_ALIASES.get( + str(validated_family_input), + str(validated_family_input), + ) + if validated_family_input is not None + else None + ) + validated_family = _helpers._metrics_detail_family(normalized_family) + return self._metrics_detail_payload( + metrics=metrics, + family=validated_family, + path=path, + offset=offset, + limit=limit, + ) + if validated_section == "derived": + return self._derived_section_payload(record) + payload = report_document.get(validated_section) + if not isinstance(payload, Mapping): + raise MCPServiceContractError( + f"Report section '{validated_section}' is not available in this run." 
+ ) + return dict(payload) + + def get_production_triage( + self, + *, + run_id: str | None = None, + max_hotspots: int = 3, + max_suggestions: int = 3, + ) -> dict[str, object]: + record = self._runs.get(run_id) + summary = self._summary_payload(record.summary, record=record) + findings = self._base_findings(record) + findings_breakdown = _helpers._source_kind_breakdown( + _helpers._finding_source_kind(finding) for finding in findings + ) + suggestion_rows = self._triage_suggestion_rows(record) + suggestion_breakdown = _helpers._source_kind_breakdown( + row.get("source_kind") for row in suggestion_rows + ) + hotspot_limit = max(1, min(max_hotspots, 10)) + suggestion_limit = max(1, min(max_suggestions, 10)) + production_hotspots = self._hotspot_rows( + record=record, + kind="production_hotspots", + detail_level="summary", + changed_paths=(), + exclude_reviewed=False, + ) + production_suggestions = [ + dict(row) + for row in suggestion_rows + if str(row.get("source_kind", "")) == SOURCE_KIND_PRODUCTION + ] + payload: dict[str, object] = { + "run_id": _helpers._short_run_id(record.run_id), + "focus": _FOCUS_PRODUCTION, + "health_scope": _HEALTH_SCOPE_REPOSITORY, + "baseline": dict(_helpers._as_mapping(summary.get("baseline"))), + "health": dict(_helpers._summary_health_payload(summary)), + "cache": dict(_helpers._as_mapping(summary.get("cache"))), + "findings": { + "total": len(findings), + "by_source_kind": findings_breakdown, + "new_by_source_kind": dict( + _helpers._as_mapping( + _helpers._as_mapping(summary.get("findings")).get( + "new_by_source_kind" + ) + ) + ), + "outside_focus": len(findings) + - findings_breakdown[SOURCE_KIND_PRODUCTION], + }, + "top_hotspots": { + "kind": "production_hotspots", + "available": len(production_hotspots), + "returned": min(len(production_hotspots), hotspot_limit), + "items": [ + dict(_helpers._as_mapping(item)) + for item in production_hotspots[:hotspot_limit] + ], + }, + "suggestions": { + "total": len(suggestion_rows), + "by_source_kind": suggestion_breakdown, + "outside_focus": len(suggestion_rows) + - suggestion_breakdown[SOURCE_KIND_PRODUCTION], + }, + "top_suggestions": { + "available": len(production_suggestions), + "returned": min(len(production_suggestions), suggestion_limit), + "items": production_suggestions[:suggestion_limit], + }, + } + analysis_profile = _helpers._summary_analysis_profile_payload(summary) + if analysis_profile: + payload["analysis_profile"] = analysis_profile + coverage_join = _helpers._summary_coverage_join_payload(record) + if coverage_join: + payload["coverage_join"] = coverage_join + security_surfaces = _helpers._summary_security_surfaces_payload(record) + if security_surfaces: + payload["security_surfaces"] = security_surfaces + return payload + + def get_help( + self, + *, + topic: HelpTopic, + detail: HelpDetail = "compact", + ) -> dict[str, object]: + validated_topic = _helpers._validate_choice("topic", topic, _VALID_HELP_TOPICS) + validated_detail = _helpers._validate_choice( + "detail", + detail, + _VALID_HELP_DETAILS, + ) + spec = _HELP_TOPIC_SPECS[validated_topic] + payload: dict[str, object] = { + "topic": validated_topic, + "detail": validated_detail, + "summary": spec.summary, + "key_points": list(spec.key_points), + "recommended_tools": list(spec.recommended_tools), + "doc_links": [ + {"title": title, "url": url} for title, url in spec.doc_links + ], + } + if validated_detail == "normal": + if spec.warnings: + payload["warnings"] = list(spec.warnings) + if spec.anti_patterns: + payload["anti_patterns"] = 
list(spec.anti_patterns) + return payload + + def generate_pr_summary( + self, + *, + run_id: str | None = None, + changed_paths: tuple[str, ...] = (), + git_diff_ref: str | None = None, + format: PRSummaryFormat = "markdown", + ) -> dict[str, object]: + output_format = _helpers._validate_choice( + "format", + format, + _VALID_PR_SUMMARY_FORMATS, + ) + record = self._runs.get(run_id) + paths_filter = self._resolve_query_changed_paths( + record=record, + changed_paths=changed_paths, + git_diff_ref=git_diff_ref, + prefer_record_paths=True, + ) + changed_items = self._query_findings( + record=record, + detail_level="summary", + changed_paths=paths_filter, + ) + previous = self._previous_run_for_root(record) + resolved: list[dict[str, object]] = [] + if previous is not None: + compare_payload = self.compare_runs( + run_id_before=previous.run_id, + run_id_after=record.run_id, + focus="all", + ) + resolved = _helpers._dict_rows(compare_payload.get("improvements")) + with self._state_lock: + gate_result = dict( + self._last_gate_results.get( + record.run_id, + {"would_fail": False, "reasons": []}, + ) + ) + verdict = _helpers._changed_verdict( + changed_projection={ + "total": len(changed_items), + "new": sum( + 1 for item in changed_items if str(item.get("novelty", "")) == "new" + ), + }, + health_delta=_helpers._summary_health_delta(record.summary), + ) + payload: dict[str, object] = { + "run_id": _helpers._short_run_id(record.run_id), + "changed_files": len(paths_filter), + "health": _helpers._summary_health_payload(record.summary), + "health_delta": _helpers._summary_health_delta(record.summary), + "verdict": verdict, + "new_findings_in_changed_files": changed_items, + "resolved": resolved, + "blocking_gates": _helpers._string_rows(gate_result.get("reasons")), + } + if output_format == "json": + return payload + return { + "run_id": _helpers._short_run_id(record.run_id), + "format": output_format, + "content": _helpers._render_pr_summary_markdown(payload), + } + + def clear_session_runs(self) -> dict[str, object]: + removed_run_ids = self._runs.clear() + with self._state_lock: + cleared_review_entries = sum( + len(entries) for entries in self._review_state.values() + ) + cleared_gate_results = len(self._last_gate_results) + cleared_spread_cache_entries = len(self._spread_max_cache) + self._review_state.clear() + self._last_gate_results.clear() + self._spread_max_cache.clear() + return { + "cleared_runs": len(removed_run_ids), + "cleared_run_ids": [ + _helpers._short_run_id(run_id) for run_id in removed_run_ids + ], + "cleared_review_entries": cleared_review_entries, + "cleared_gate_results": cleared_gate_results, + "cleared_spread_cache_entries": cleared_spread_cache_entries, + } + + def read_resource(self, uri: str) -> str: + if uri == "codeclone://schema": + return _json_text_payload(_helpers._schema_resource_payload()) + if uri == "codeclone://latest/triage": + latest = self._runs.get() + return _json_text_payload(self.get_production_triage(run_id=latest.run_id)) + latest_prefix = "codeclone://latest/" + run_prefix = "codeclone://runs/" + if uri.startswith(latest_prefix): + latest = self._runs.get() + suffix = uri[len(latest_prefix) :] + return self._render_resource(latest, suffix) + if not uri.startswith(run_prefix): + raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}") + remainder = uri[len(run_prefix) :] + run_id, sep, suffix = remainder.partition("/") + if not sep: + raise MCPServiceContractError(f"Unsupported CodeClone resource URI: {uri}") + record = 
self._runs.get(run_id) + return self._render_resource(record, suffix) + + def _render_resource(self, record: MCPRunRecord, suffix: str) -> str: + if suffix == "summary": + return _json_text_payload( + self._summary_payload(record.summary, record=record) + ) + if suffix == "triage": + raise MCPServiceContractError( + "Production triage is exposed only as codeclone://latest/triage." + ) + if suffix == "health": + return _json_text_payload(_helpers._summary_health_payload(record.summary)) + if suffix == "gates": + with self._state_lock: + gate_result = self._last_gate_results.get(record.run_id) + if gate_result is None: + raise MCPServiceContractError( + "No gate evaluation result is available in this MCP session." + ) + return _json_text_payload(gate_result) + if suffix == "changed": + if record.changed_projection is None: + raise MCPServiceContractError( + "Changed-findings projection is not available in this run." + ) + return _json_text_payload(record.changed_projection) + if suffix == "schema": + return _json_text_payload(_helpers._schema_resource_payload()) + if suffix == "report.json": + return _json_text_payload(record.report_document, sort_keys=False) + if suffix == "overview": + return _json_text_payload( + self.list_hotspots(kind="highest_spread", run_id=record.run_id) + ) + finding_prefix = "findings/" + if suffix.startswith(finding_prefix): + finding_id = suffix[len(finding_prefix) :] + return _json_text_payload( + self._service_get_finding( + run_id=record.run_id, + finding_id=finding_id, + ) + ) + raise MCPServiceContractError( + f"Unsupported CodeClone resource suffix '{suffix}'." + ) + + def _prune_session_state(self) -> None: + active_run_ids = {record.run_id for record in self._runs.records()} + with self._state_lock: + for state_map in ( + self._review_state, + self._last_gate_results, + self._spread_max_cache, + ): + stale_run_ids = [ + run_id for run_id in state_map if run_id not in active_run_ids + ] + for run_id in stale_run_ids: + state_map.pop(run_id, None) diff --git a/codeclone/surfaces/mcp/payloads.py b/codeclone/surfaces/mcp/payloads.py new file mode 100644 index 0000000..a984206 --- /dev/null +++ b/codeclone/surfaces/mcp/payloads.py @@ -0,0 +1,56 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from dataclasses import dataclass +from typing import Generic, TypeVar + +T = TypeVar("T") + + +@dataclass(frozen=True, slots=True) +class PageWindow(Generic[T]): + items: list[T] + offset: int + limit: int + total: int + next_offset: int | None + + +def paginate( + items: Sequence[T], + *, + offset: int, + limit: int, + max_limit: int, +) -> PageWindow[T]: + normalized_offset = max(0, offset) + normalized_limit = max(1, min(limit, max_limit)) + page = list(items[normalized_offset : normalized_offset + normalized_limit]) + next_offset = normalized_offset + len(page) + return PageWindow( + items=page, + offset=normalized_offset, + limit=normalized_limit, + total=len(items), + next_offset=(next_offset if next_offset < len(items) else None), + ) + + +def resolve_finding_id( + *, + canonical_to_short: Mapping[str, str], + short_to_canonical: Mapping[str, str], + finding_id: str, +) -> str | None: + if finding_id in canonical_to_short: + return finding_id + return short_to_canonical.get(finding_id) + + +def short_id(value: str, *, length: int = 8) -> str: + return value[:length] diff --git a/codeclone/mcp_server.py b/codeclone/surfaces/mcp/server.py similarity index 90% rename from codeclone/mcp_server.py rename to codeclone/surfaces/mcp/server.py index ee7a6fc..3599bac 100644 --- a/codeclone/mcp_server.py +++ b/codeclone/surfaces/mcp/server.py @@ -10,16 +10,19 @@ import ipaddress import sys from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast +from typing import TYPE_CHECKING, Literal, TypeVar -from . import __version__ -from .contracts import DOCS_URL -from .mcp_service import ( +from ... import __version__ +from ...contracts import DEFAULT_COVERAGE_MIN, DOCS_URL +from .service import CodeCloneMCPService +from .session import ( DEFAULT_MCP_HISTORY_LIMIT, MAX_MCP_HISTORY_LIMIT, - CodeCloneMCPService, + AnalysisMode, + CachePolicy, MCPAnalysisRequest, MCPGateRequest, + MCPServiceContractError, _validated_history_limit, ) @@ -48,6 +51,12 @@ "CodeClone MCP support requires the optional 'mcp' extra. " "Install it with: pip install 'codeclone[mcp]'" ) +DEFAULT_MCP_HOST = "127.0.0.1" +DEFAULT_MCP_PORT = 8000 +DEFAULT_MCP_JSON_RESPONSE = True +DEFAULT_MCP_STATELESS_HTTP = True +DEFAULT_MCP_DEBUG = False +DEFAULT_MCP_LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO" class MCPDependencyError(RuntimeError): @@ -64,12 +73,13 @@ def _load_mcp_runtime() -> tuple[ ToolAnnotations, ]: try: - from mcp.server.fastmcp import FastMCP as runtime_fastmcp + from mcp.server.fastmcp import FastMCP as imported_fastmcp from mcp.types import ToolAnnotations as runtime_tool_annotations except ImportError as exc: raise MCPDependencyError(_MCP_INSTALL_HINT) from exc + runtime_fastmcp: type[FastMCP] = imported_fastmcp return ( - cast("type[FastMCP]", runtime_fastmcp), + runtime_fastmcp, runtime_tool_annotations( readOnlyHint=True, destructiveHint=False, @@ -91,16 +101,44 @@ def _load_mcp_runtime() -> tuple[ ) +def _validated_analysis_mode(value: str) -> AnalysisMode: + if value == "full": + return "full" + if value == "clones_only": + return "clones_only" + raise MCPServiceContractError( + f"Invalid value for analysis_mode: {value!r}. " + "Expected one of: clones_only, full." 
+ ) + + +def _validated_cache_policy(value: str) -> CachePolicy: + if value == "reuse": + return "reuse" + if value == "refresh": + return "refresh" + if value == "off": + return "off" + raise MCPServiceContractError( + f"Invalid value for cache_policy: {value!r}. " + "Expected one of: off, refresh, reuse." + ) + + def build_mcp_server( *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT, - host: str = "127.0.0.1", - port: int = 8000, - json_response: bool = False, - stateless_http: bool = False, - debug: bool = False, - log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", + host: str = DEFAULT_MCP_HOST, + port: int = DEFAULT_MCP_PORT, + json_response: bool = DEFAULT_MCP_JSON_RESPONSE, + stateless_http: bool = DEFAULT_MCP_STATELESS_HTTP, + debug: bool = DEFAULT_MCP_DEBUG, + log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = ( + DEFAULT_MCP_LOG_LEVEL + ), ) -> FastMCP: + """Build and register the local read-only CodeClone FastMCP server.""" + runtime_fastmcp, read_only_tool, analysis_tool, session_tool = _load_mcp_runtime() service = CodeCloneMCPService(history_limit=_validated_history_limit(history_limit)) mcp = runtime_fastmcp( @@ -118,20 +156,26 @@ def build_mcp_server( # FastMCP otherwise reports the `mcp` package version in initialize/serverInfo. mcp._mcp_server.version = __version__ - def tool(*args: Any, **kwargs: Any) -> Callable[[MCPCallable], MCPCallable]: - return cast( - "Callable[[MCPCallable], MCPCallable]", - mcp.tool(*args, **kwargs), - ) + def tool(*args: object, **kwargs: object) -> Callable[[MCPCallable], MCPCallable]: + decorator = mcp.tool(*args, **kwargs) # type: ignore[arg-type] + + def register(func: MCPCallable) -> MCPCallable: + decorator(func) + return func + + return register def resource( - *args: Any, - **kwargs: Any, + *args: object, + **kwargs: object, ) -> Callable[[MCPCallable], MCPCallable]: - return cast( - "Callable[[MCPCallable], MCPCallable]", - mcp.resource(*args, **kwargs), - ) + decorator = mcp.resource(*args, **kwargs) # type: ignore[arg-type] + + def register(func: MCPCallable) -> MCPCallable: + decorator(func) + return func + + return register @tool( title="Analyze Repository", @@ -175,7 +219,7 @@ def analyze_repository( return service.analyze_repository( MCPAnalysisRequest( root=root, - analysis_mode=analysis_mode, # type: ignore[arg-type] + analysis_mode=_validated_analysis_mode(analysis_mode), respect_pyproject=respect_pyproject, changed_paths=tuple(changed_paths or ()), git_diff_ref=git_diff_ref, @@ -195,7 +239,7 @@ def analyze_repository( baseline_path=baseline_path, metrics_baseline_path=metrics_baseline_path, max_baseline_size_mb=max_baseline_size_mb, - cache_policy=cache_policy, # type: ignore[arg-type] + cache_policy=_validated_cache_policy(cache_policy), cache_path=cache_path, max_cache_size_mb=max_cache_size_mb, ) @@ -247,7 +291,7 @@ def analyze_changed_paths( root=root, changed_paths=tuple(changed_paths or ()), git_diff_ref=git_diff_ref, - analysis_mode=analysis_mode, # type: ignore[arg-type] + analysis_mode=_validated_analysis_mode(analysis_mode), respect_pyproject=respect_pyproject, processes=processes, min_loc=min_loc, @@ -265,7 +309,7 @@ def analyze_changed_paths( baseline_path=baseline_path, metrics_baseline_path=metrics_baseline_path, max_baseline_size_mb=max_baseline_size_mb, - cache_policy=cache_policy, # type: ignore[arg-type] + cache_policy=_validated_cache_policy(cache_policy), cache_path=cache_path, max_cache_size_mb=max_cache_size_mb, ) @@ -323,8 +367,8 @@ def help( detail: str = 
"compact", ) -> dict[str, object]: return service.get_help( - topic=topic, # type: ignore[arg-type] - detail=detail, # type: ignore[arg-type] + topic=topic, + detail=detail, ) @tool( @@ -353,7 +397,7 @@ def evaluate_gates( fail_on_untested_hotspots: bool = False, min_typing_coverage: int = -1, min_docstring_coverage: int = -1, - coverage_min: int = 50, + coverage_min: int = DEFAULT_COVERAGE_MIN, ) -> dict[str, object]: return service.evaluate_gates( MCPGateRequest( @@ -399,8 +443,8 @@ def get_report_section( ) -> dict[str, object]: return service.get_report_section( run_id=run_id, - section=section, # type: ignore[arg-type] - family=family, # type: ignore[arg-type] + section=section, + family=family, path=path, offset=offset, limit=limit, @@ -435,13 +479,13 @@ def list_findings( ) -> dict[str, object]: return service.list_findings( run_id=run_id, - family=family, # type: ignore[arg-type] + family=family, category=category, severity=severity, source_kind=source_kind, - novelty=novelty, # type: ignore[arg-type] - sort_by=sort_by, # type: ignore[arg-type] - detail_level=detail_level, # type: ignore[arg-type] + novelty=novelty, + sort_by=sort_by, + detail_level=detail_level, changed_paths=tuple(changed_paths or ()), git_diff_ref=git_diff_ref, exclude_reviewed=exclude_reviewed, @@ -469,7 +513,7 @@ def get_finding( return service.get_finding( finding_id=finding_id, run_id=run_id, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, ) @tool( @@ -490,7 +534,7 @@ def get_remediation( return service.get_remediation( finding_id=finding_id, run_id=run_id, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, ) @tool( @@ -514,9 +558,9 @@ def list_hotspots( max_results: int | None = None, ) -> dict[str, object]: return service.list_hotspots( - kind=kind, # type: ignore[arg-type] + kind=kind, run_id=run_id, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, changed_paths=tuple(changed_paths or ()), git_diff_ref=git_diff_ref, exclude_reviewed=exclude_reviewed, @@ -542,7 +586,7 @@ def compare_runs( return service.compare_runs( run_id_before=run_id_before, run_id_after=run_id_after, - focus=focus, # type: ignore[arg-type] + focus=focus, ) @tool( @@ -571,7 +615,7 @@ def check_complexity( path=path, min_complexity=min_complexity, max_results=max_results, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, ) @tool( @@ -602,7 +646,7 @@ def check_clones( clone_type=clone_type, source_kind=source_kind, max_results=max_results, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, ) @tool( @@ -629,7 +673,7 @@ def check_coupling( root=root, path=path, max_results=max_results, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, ) @tool( @@ -656,7 +700,7 @@ def check_cohesion( root=root, path=path, max_results=max_results, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, ) @tool( @@ -685,7 +729,7 @@ def check_dead_code( path=path, min_severity=min_severity, max_results=max_results, - detail_level=detail_level, # type: ignore[arg-type] + detail_level=detail_level, ) @tool( @@ -708,7 +752,7 @@ def generate_pr_summary( run_id=run_id, changed_paths=tuple(changed_paths or ()), git_diff_ref=git_diff_ref, - format=format, # type: ignore[arg-type] + format=format, ) @tool( @@ -875,7 +919,7 @@ def build_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--host", - default="127.0.0.1", + 
default=DEFAULT_MCP_HOST, help="Host to bind when using streamable-http.", ) parser.add_argument( @@ -890,7 +934,7 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument( "--port", type=int, - default=8000, + default=DEFAULT_MCP_PORT, help="Port to bind when using streamable-http.", ) parser.add_argument( @@ -905,25 +949,25 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument( "--json-response", action=argparse.BooleanOptionalAction, - default=True, + default=DEFAULT_MCP_JSON_RESPONSE, help="Use JSON responses for streamable-http transport.", ) parser.add_argument( "--stateless-http", action=argparse.BooleanOptionalAction, - default=True, + default=DEFAULT_MCP_STATELESS_HTTP, help="Use stateless Streamable HTTP mode when transport is streamable-http.", ) parser.add_argument( "--debug", action=argparse.BooleanOptionalAction, - default=False, + default=DEFAULT_MCP_DEBUG, help="Enable FastMCP debug mode.", ) parser.add_argument( "--log-level", choices=("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), - default="INFO", + default=DEFAULT_MCP_LOG_LEVEL, help="FastMCP server log level.", ) return parser diff --git a/codeclone/surfaces/mcp/service.py b/codeclone/surfaces/mcp/service.py new file mode 100644 index 0000000..adca1d4 --- /dev/null +++ b/codeclone/surfaces/mcp/service.py @@ -0,0 +1,263 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +import inspect +from typing import Protocol + +from .session import ( + DEFAULT_MCP_HISTORY_LIMIT, + MCPAnalysisRequest, + MCPGateRequest, + MCPSession, +) +from .tools._base import run_kw + + +class _RunDictService(Protocol): + def _run_dict(self, method_name: str, **params: object) -> dict[str, object]: ... 
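The `_RunDictService` protocol above exists so the query mixin that follows can annotate `self` with only the capability it needs: a `_run_dict` dispatcher. Here is a minimal, self-contained sketch of that pattern; all names in it (`Backend`, `QueryMixin`, `Store`) are illustrative stand-ins, not taken from this diff.

```python
# Sketch of the Protocol-typed-`self` mixin pattern used by _QueryServiceMixin.
from __future__ import annotations

from typing import Protocol


class Backend(Protocol):
    def _run_dict(self, method_name: str, **params: object) -> dict[str, object]: ...


class QueryMixin:
    # Annotating `self` with the Protocol lets a type checker verify the mixin
    # is only ever combined with a class providing _run_dict, without making
    # the mixin inherit from that class.
    def list_items(self: Backend, **params: object) -> dict[str, object]:
        return self._run_dict("list_items", **params)


class Store(QueryMixin):
    def _run_dict(self, method_name: str, **params: object) -> dict[str, object]:
        return {"method": method_name, "params": dict(params)}


print(Store().list_items(limit=5))
# {'method': 'list_items', 'params': {'limit': 5}}
```

The payoff is that every read-only query method stays a one-line delegation while the dispatcher centralizes the dict-return contract check.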
+ + +class _QueryServiceMixin: + def compare_runs(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("compare_runs", **params) + + def get_report_section( + self: _RunDictService, + **params: object, + ) -> dict[str, object]: + return self._run_dict("get_report_section", **params) + + def list_findings(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("list_findings", **params) + + def get_finding(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("get_finding", **params) + + def get_remediation(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("get_remediation", **params) + + def list_hotspots(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("list_hotspots", **params) + + def get_production_triage( + self: _RunDictService, + **params: object, + ) -> dict[str, object]: + return self._run_dict("get_production_triage", **params) + + def get_help(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("get_help", **params) + + def generate_pr_summary( + self: _RunDictService, + **params: object, + ) -> dict[str, object]: + return self._run_dict("generate_pr_summary", **params) + + def mark_finding_reviewed( + self: _RunDictService, + **params: object, + ) -> dict[str, object]: + return self._run_dict("mark_finding_reviewed", **params) + + def list_reviewed_findings( + self: _RunDictService, + **params: object, + ) -> dict[str, object]: + return self._run_dict("list_reviewed_findings", **params) + + def check_complexity( + self: _RunDictService, + **params: object, + ) -> dict[str, object]: + return self._run_dict("check_complexity", **params) + + def check_clones(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("check_clones", **params) + + def check_coupling(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("check_coupling", **params) + + def check_cohesion(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("check_cohesion", **params) + + def check_dead_code(self: _RunDictService, **params: object) -> dict[str, object]: + return self._run_dict("check_dead_code", **params) + + +class CodeCloneMCPService(_QueryServiceMixin, MCPSession): + def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None: + super().__init__(history_limit=history_limit) + self._session_cls = MCPSession + # Keep a stable seam for tests and monkeypatch-based callers while the + # service itself now owns the real MCP session state. 
+ self.session = self + + def _run_dict(self, method_name: str, **params: object) -> dict[str, object]: + bound = getattr(self._session_cls, method_name).__get__(self, type(self)) + result = run_kw(bound, params) + if not isinstance(result, dict): + raise TypeError(f"MCP session method '{method_name}' must return a dict.") + return result + + def analyze_repository(self, request: MCPAnalysisRequest) -> dict[str, object]: + return self._session_cls.analyze_repository(self, request) + + def analyze_changed_paths(self, request: MCPAnalysisRequest) -> dict[str, object]: + return self._session_cls.analyze_changed_paths(self, request) + + def get_run_summary(self, run_id: str | None = None) -> dict[str, object]: + return self._session_cls.get_run_summary(self, run_id) + + def evaluate_gates(self, request: MCPGateRequest) -> dict[str, object]: + return self._session_cls.evaluate_gates(self, request) + + def clear_session_runs(self) -> dict[str, object]: + return self._session_cls.clear_session_runs(self) + + def read_resource(self, uri: str) -> str: + return self._session_cls.read_resource(self, uri) + + +_EMPTY = inspect.Signature.empty + + +def _kwonly( + name: str, + annotation: str, + default: object = _EMPTY, +) -> inspect.Parameter: + return inspect.Parameter( + name, + inspect.Parameter.KEYWORD_ONLY, + default=default, + annotation=annotation, + ) + + +def _apply_public_method_signatures() -> None: + signature_specs: dict[str, tuple[inspect.Parameter, ...]] = { + "check_clones": ( + _kwonly("run_id", "str | None", None), + _kwonly("root", "str | None", None), + _kwonly("path", "str | None", None), + _kwonly("clone_type", "str | None", None), + _kwonly("source_kind", "str | None", None), + _kwonly("max_results", "int", 10), + _kwonly("detail_level", "DetailLevel", "summary"), + ), + "check_cohesion": ( + _kwonly("run_id", "str | None", None), + _kwonly("root", "str | None", None), + _kwonly("path", "str | None", None), + _kwonly("max_results", "int", 10), + _kwonly("detail_level", "DetailLevel", "summary"), + ), + "check_complexity": ( + _kwonly("run_id", "str | None", None), + _kwonly("root", "str | None", None), + _kwonly("path", "str | None", None), + _kwonly("min_complexity", "int | None", None), + _kwonly("max_results", "int", 10), + _kwonly("detail_level", "DetailLevel", "summary"), + ), + "check_coupling": ( + _kwonly("run_id", "str | None", None), + _kwonly("root", "str | None", None), + _kwonly("path", "str | None", None), + _kwonly("max_results", "int", 10), + _kwonly("detail_level", "DetailLevel", "summary"), + ), + "check_dead_code": ( + _kwonly("run_id", "str | None", None), + _kwonly("root", "str | None", None), + _kwonly("path", "str | None", None), + _kwonly("min_severity", "str | None", None), + _kwonly("max_results", "int", 10), + _kwonly("detail_level", "DetailLevel", "summary"), + ), + "compare_runs": ( + _kwonly("run_id_before", "str"), + _kwonly("run_id_after", "str | None", None), + _kwonly("focus", "ComparisonFocus", "all"), + ), + "generate_pr_summary": ( + _kwonly("run_id", "str | None", None), + _kwonly("changed_paths", "Sequence[str]", ()), + _kwonly("git_diff_ref", "str | None", None), + _kwonly("format", "PRSummaryFormat", "markdown"), + ), + "get_finding": ( + _kwonly("finding_id", "str"), + _kwonly("run_id", "str | None", None), + _kwonly("detail_level", "DetailLevel", "normal"), + ), + "get_help": ( + _kwonly("topic", "HelpTopic"), + _kwonly("detail", "HelpDetail", "compact"), + ), + "get_production_triage": ( + _kwonly("run_id", "str | None", None), + 
_kwonly("max_hotspots", "int", 3), + _kwonly("max_suggestions", "int", 3), + ), + "get_remediation": ( + _kwonly("finding_id", "str"), + _kwonly("run_id", "str | None", None), + _kwonly("detail_level", "DetailLevel", "normal"), + ), + "get_report_section": ( + _kwonly("run_id", "str | None", None), + _kwonly("section", "ReportSection", "all"), + _kwonly("family", "MetricsDetailFamily | None", None), + _kwonly("path", "str | None", None), + _kwonly("offset", "int", 0), + _kwonly("limit", "int", 50), + ), + "list_findings": ( + _kwonly("run_id", "str | None", None), + _kwonly("family", "FindingFamilyFilter", "all"), + _kwonly("category", "str | None", None), + _kwonly("severity", "str | None", None), + _kwonly("source_kind", "str | None", None), + _kwonly("novelty", "FindingNoveltyFilter", "all"), + _kwonly("sort_by", "FindingSort", "default"), + _kwonly("detail_level", "DetailLevel", "summary"), + _kwonly("changed_paths", "Sequence[str]", ()), + _kwonly("git_diff_ref", "str | None", None), + _kwonly("exclude_reviewed", "bool", False), + _kwonly("offset", "int", 0), + _kwonly("limit", "int", 50), + _kwonly("max_results", "int | None", None), + ), + "list_hotspots": ( + _kwonly("kind", "HotlistKind"), + _kwonly("run_id", "str | None", None), + _kwonly("detail_level", "DetailLevel", "summary"), + _kwonly("changed_paths", "Sequence[str]", ()), + _kwonly("git_diff_ref", "str | None", None), + _kwonly("exclude_reviewed", "bool", False), + _kwonly("limit", "int", 10), + _kwonly("max_results", "int | None", None), + ), + "list_reviewed_findings": (_kwonly("run_id", "str | None", None),), + "mark_finding_reviewed": ( + _kwonly("finding_id", "str"), + _kwonly("run_id", "str | None", None), + _kwonly("note", "str | None", None), + ), + } + self_param = inspect.Parameter("self", inspect.Parameter.POSITIONAL_OR_KEYWORD) + for name, params in signature_specs.items(): + method = getattr(CodeCloneMCPService, name) + method.__signature__ = inspect.Signature( + parameters=(self_param, *params), + return_annotation="dict[str, object]", + ) + + +_apply_public_method_signatures() diff --git a/codeclone/surfaces/mcp/session.py b/codeclone/surfaces/mcp/session.py new file mode 100644 index 0000000..3de4ce2 --- /dev/null +++ b/codeclone/surfaces/mcp/session.py @@ -0,0 +1,342 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from ...cache.store import resolve_cache_status +from ...report.meta import build_report_meta as _build_report_meta +from ...report.meta import current_report_timestamp_utc as _current_report_timestamp_utc +from . 
import _session_helpers as _helpers +from ._session_baseline import ( + resolve_clone_baseline_state, + resolve_metrics_baseline_state, +) +from ._session_shared import ( + _REPORT_DUMMY_PATH, + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_MCP_HISTORY_LIMIT, + DEFAULT_MIN_LOC, + DEFAULT_MIN_STMT, + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, + MAX_MCP_HISTORY_LIMIT, + AnalysisMode, + Baseline, + CachePolicy, + CacheStatus, + CodeCloneMCPRunStore, + DetailLevel, + MCPAnalysisRequest, + MCPFindingNotFoundError, + MCPGateRequest, + MCPGitDiffError, + MCPRunNotFoundError, + MCPRunRecord, + MCPServiceContractError, + MCPServiceError, + OrderedDict, + OutputPaths, + RLock, + __version__, + _as_int, + _BufferConsole, + _validated_history_limit, + analyze, + bootstrap, + discover, + process, + report, +) +from ._session_state_mixin import _MCPSessionStateMixin + +__all__ = [ + "DEFAULT_MCP_HISTORY_LIMIT", + "MAX_MCP_HISTORY_LIMIT", + "AnalysisMode", + "CachePolicy", + "DetailLevel", + "MCPAnalysisRequest", + "MCPFindingNotFoundError", + "MCPGateRequest", + "MCPGitDiffError", + "MCPRunNotFoundError", + "MCPRunRecord", + "MCPServiceContractError", + "MCPServiceError", + "MCPSession", + "_validated_history_limit", +] + + +class MCPSession(_MCPSessionStateMixin): + def __init__(self, *, history_limit: int = DEFAULT_MCP_HISTORY_LIMIT) -> None: + self._runs = CodeCloneMCPRunStore(history_limit=history_limit) + self._state_lock = RLock() + self._review_state: dict[str, OrderedDict[str, str | None]] = {} + self._last_gate_results: dict[str, dict[str, object]] = {} + self._spread_max_cache: dict[str, int] = {} + + def analyze_repository(self, request: MCPAnalysisRequest) -> dict[str, object]: + self._validate_analysis_request(request) + root_path = _helpers._resolve_root(request.root) + analysis_started_at_utc = _current_report_timestamp_utc() + changed_paths = self._resolve_request_changed_paths( + root_path=root_path, + changed_paths=request.changed_paths, + git_diff_ref=request.git_diff_ref, + ) + args = self._build_args(root_path=root_path, request=request) + ( + baseline_path, + baseline_exists, + metrics_baseline_path, + metrics_baseline_exists, + shared_baseline_payload, + ) = self._resolve_baseline_inputs(root_path=root_path, args=args) + cache_path = _helpers._resolve_cache_path(root_path=root_path, args=args) + cache = _helpers._build_cache( + root_path=root_path, + args=args, + cache_path=cache_path, + policy=request.cache_policy, + ) + console = _BufferConsole() + + boot = bootstrap( + args=args, + root=root_path, + output_paths=OutputPaths(json=_REPORT_DUMMY_PATH), + cache_path=cache_path, + ) + discovery_result = discover(boot=boot, cache=cache) + processing_result = process(boot=boot, discovery=discovery_result, cache=cache) + analysis_result = analyze( + boot=boot, + discovery=discovery_result, + processing=processing_result, + ) + + clone_baseline_state = resolve_clone_baseline_state( + baseline_path=baseline_path, + baseline_exists=baseline_exists, + max_baseline_size_mb=_as_int(args.max_baseline_size_mb, 0), + shared_baseline_payload=( + shared_baseline_payload + if metrics_baseline_path == baseline_path + else None + ), + ) + metrics_baseline_state = resolve_metrics_baseline_state( + metrics_baseline_path=metrics_baseline_path, + metrics_baseline_exists=metrics_baseline_exists, + 
max_baseline_size_mb=_as_int(args.max_baseline_size_mb, 0), + skip_metrics=bool(args.skip_metrics), + shared_baseline_payload=( + shared_baseline_payload + if metrics_baseline_path == baseline_path + else None + ), + ) + + cache_status, cache_schema_version = resolve_cache_status(cache) + report_meta = _build_report_meta( + codeclone_version=__version__, + scan_root=root_path, + baseline_path=baseline_path, + baseline=clone_baseline_state.baseline, + baseline_loaded=clone_baseline_state.loaded, + baseline_status=clone_baseline_state.status.value, + cache_path=cache_path, + cache_used=cache_status == CacheStatus.OK, + cache_status=cache_status.value, + cache_schema_version=cache_schema_version, + files_skipped_source_io=len(processing_result.source_read_failures), + metrics_baseline_path=metrics_baseline_path, + metrics_baseline=metrics_baseline_state.baseline, + metrics_baseline_loaded=metrics_baseline_state.loaded, + metrics_baseline_status=metrics_baseline_state.status.value, + health_score=( + analysis_result.project_metrics.health.total + if analysis_result.project_metrics is not None + else None + ), + health_grade=( + analysis_result.project_metrics.health.grade + if analysis_result.project_metrics is not None + else None + ), + analysis_mode=request.analysis_mode, + metrics_computed=_helpers._metrics_computed(request.analysis_mode), + min_loc=_as_int(args.min_loc, DEFAULT_MIN_LOC), + min_stmt=_as_int(args.min_stmt, DEFAULT_MIN_STMT), + block_min_loc=_as_int(args.block_min_loc, DEFAULT_BLOCK_MIN_LOC), + block_min_stmt=_as_int(args.block_min_stmt, DEFAULT_BLOCK_MIN_STMT), + segment_min_loc=_as_int(args.segment_min_loc, DEFAULT_SEGMENT_MIN_LOC), + segment_min_stmt=_as_int(args.segment_min_stmt, DEFAULT_SEGMENT_MIN_STMT), + design_complexity_threshold=_as_int( + getattr( + args, + "design_complexity_threshold", + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ), + DEFAULT_REPORT_DESIGN_COMPLEXITY_THRESHOLD, + ), + design_coupling_threshold=_as_int( + getattr( + args, + "design_coupling_threshold", + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + DEFAULT_REPORT_DESIGN_COUPLING_THRESHOLD, + ), + design_cohesion_threshold=_as_int( + getattr( + args, + "design_cohesion_threshold", + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + DEFAULT_REPORT_DESIGN_COHESION_THRESHOLD, + ), + analysis_started_at_utc=analysis_started_at_utc, + report_generated_at_utc=_current_report_timestamp_utc(), + ) + + baseline_for_diff = ( + clone_baseline_state.baseline + if clone_baseline_state.trusted_for_diff + else Baseline(baseline_path) + ) + new_func, new_block = baseline_for_diff.diff( + analysis_result.func_groups, + analysis_result.block_groups, + ) + metrics_diff = None + if ( + analysis_result.project_metrics is not None + and metrics_baseline_state.trusted_for_diff + ): + metrics_diff = metrics_baseline_state.baseline.diff( + analysis_result.project_metrics + ) + + report_artifacts = report( + boot=boot, + discovery=discovery_result, + processing=processing_result, + analysis=analysis_result, + report_meta=report_meta, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + ) + report_json = report_artifacts.json + if report_json is None: + raise MCPServiceError("CodeClone MCP expected a canonical JSON report.") + report_document = _helpers._load_report_document(report_json) + run_id = _helpers._report_digest(report_document) + + warning_items = set(console.messages) + baseline_warning = getattr(clone_baseline_state, "warning_message", None) + if isinstance(baseline_warning, str) 
and baseline_warning: + warning_items.add(baseline_warning) + metrics_warning = getattr(metrics_baseline_state, "warning_message", None) + if isinstance(metrics_warning, str) and metrics_warning: + warning_items.add(metrics_warning) + if cache.load_warning: + warning_items.add(cache.load_warning) + warning_items.update(discovery_result.skipped_warnings) + warnings = tuple(sorted(warning_items)) + failures = tuple( + sorted( + { + *processing_result.failed_files, + *processing_result.source_read_failures, + } + ) + ) + + base_summary = self._build_run_summary_payload( + run_id=run_id, + root_path=root_path, + request=request, + report_document=report_document, + baseline_state=clone_baseline_state, + metrics_baseline_state=metrics_baseline_state, + cache_status=cache_status, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + warnings=warnings, + failures=failures, + ) + provisional_record = MCPRunRecord( + run_id=run_id, + root=root_path, + request=request, + comparison_settings=_helpers._comparison_settings( + args=args, + request=request, + ), + report_document=report_document, + summary=base_summary, + changed_paths=changed_paths, + changed_projection=None, + warnings=warnings, + failures=failures, + func_clones_count=analysis_result.func_clones_count, + block_clones_count=analysis_result.block_clones_count, + project_metrics=analysis_result.project_metrics, + coverage_join=analysis_result.coverage_join, + suggestions=analysis_result.suggestions, + new_func=frozenset(new_func), + new_block=frozenset(new_block), + metrics_diff=metrics_diff, + ) + changed_projection = self._build_changed_projection(provisional_record) + summary = self._augment_summary_with_changed( + summary=base_summary, + changed_paths=changed_paths, + changed_projection=changed_projection, + ) + record = MCPRunRecord( + run_id=run_id, + root=root_path, + request=request, + comparison_settings=_helpers._comparison_settings( + args=args, + request=request, + ), + report_document=report_document, + summary=summary, + changed_paths=changed_paths, + changed_projection=changed_projection, + warnings=warnings, + failures=failures, + func_clones_count=analysis_result.func_clones_count, + block_clones_count=analysis_result.block_clones_count, + project_metrics=analysis_result.project_metrics, + coverage_join=analysis_result.coverage_join, + suggestions=analysis_result.suggestions, + new_func=frozenset(new_func), + new_block=frozenset(new_block), + metrics_diff=metrics_diff, + ) + self._runs.register(record) + self._prune_session_state() + return self._summary_payload(record.summary, record=record) + + def analyze_changed_paths(self, request: MCPAnalysisRequest) -> dict[str, object]: + if not request.changed_paths and request.git_diff_ref is None: + raise MCPServiceContractError( + "analyze_changed_paths requires changed_paths or git_diff_ref." + ) + analysis_summary = self.analyze_repository(request) + record = self._runs.get(str(analysis_summary.get("run_id", "")) or None) + return self._changed_analysis_payload(record) diff --git a/codeclone/surfaces/mcp/tools/__init__.py b/codeclone/surfaces/mcp/tools/__init__.py new file mode 100644 index 0000000..fa68bb9 --- /dev/null +++ b/codeclone/surfaces/mcp/tools/__init__.py @@ -0,0 +1,33 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPTool +from .analyze import TOOLS as ANALYZE_TOOLS +from .checks import TOOLS as CHECK_TOOLS +from .compare import TOOLS as COMPARE_TOOLS +from .findings import TOOLS as FINDING_TOOLS +from .gates import TOOLS as GATE_TOOLS +from .help import TOOLS as HELP_TOOLS +from .hotspots import TOOLS as HOTSPOT_TOOLS +from .pr import TOOLS as PR_TOOLS +from .report_section import TOOLS as REPORT_SECTION_TOOLS +from .runs import TOOLS as RUN_TOOLS + +MCP_TOOLS: tuple[MCPTool, ...] = ( + *ANALYZE_TOOLS, + *RUN_TOOLS, + *FINDING_TOOLS, + *CHECK_TOOLS, + *HOTSPOT_TOOLS, + *REPORT_SECTION_TOOLS, + *COMPARE_TOOLS, + *GATE_TOOLS, + *PR_TOOLS, + *HELP_TOOLS, +) + +MCP_TOOLS_BY_NAME: dict[str, MCPTool] = {tool.name: tool for tool in MCP_TOOLS} diff --git a/codeclone/surfaces/mcp/tools/_base.py b/codeclone/surfaces/mcp/tools/_base.py new file mode 100644 index 0000000..313a67b --- /dev/null +++ b/codeclone/surfaces/mcp/tools/_base.py @@ -0,0 +1,45 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from collections.abc import Callable, Mapping +from dataclasses import dataclass +from typing import Protocol + + +@dataclass(frozen=True, slots=True) +class MCPToolSchema: + title: str + description: str = "" + + +class MCPToolSession(Protocol): + def __getattr__(self, name: str) -> Callable[..., object]: ... + + +class MCPTool(Protocol): + @property + def name(self) -> str: ... + + @property + def schema(self) -> MCPToolSchema: ... + + def run(self, session: MCPToolSession, params: Mapping[str, object]) -> object: ... + + +@dataclass(frozen=True, slots=True) +class SimpleMCPTool: + name: str + schema: MCPToolSchema + runner: Callable[[MCPToolSession, Mapping[str, object]], object] + + def run(self, session: MCPToolSession, params: Mapping[str, object]) -> object: + return self.runner(session, params) + + +def run_kw(bound: Callable[..., object], params: Mapping[str, object]) -> object: + return bound(**dict(params)) diff --git a/codeclone/surfaces/mcp/tools/analyze.py b/codeclone/surfaces/mcp/tools/analyze.py new file mode 100644 index 0000000..2c7548c --- /dev/null +++ b/codeclone/surfaces/mcp/tools/analyze.py @@ -0,0 +1,36 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from collections.abc import Mapping + +from ..session import MCPAnalysisRequest, MCPServiceContractError +from ._base import MCPToolSchema, SimpleMCPTool + + +def _analysis_request(params: Mapping[str, object]) -> MCPAnalysisRequest: + request = params.get("request") + if not isinstance(request, MCPAnalysisRequest): + raise MCPServiceContractError("Tool requires a valid MCPAnalysisRequest.") + return request + + +TOOLS = ( + SimpleMCPTool( + name="analyze_repository", + schema=MCPToolSchema(title="Analyze Repository"), + runner=lambda session, params: session.analyze_repository( + _analysis_request(params) + ), + ), + SimpleMCPTool( + name="analyze_changed_paths", + schema=MCPToolSchema(title="Analyze Changed Paths"), + runner=lambda session, params: session.analyze_changed_paths( + _analysis_request(params) + ), + ), +) diff --git a/codeclone/surfaces/mcp/tools/checks.py b/codeclone/surfaces/mcp/tools/checks.py new file mode 100644 index 0000000..cfbc772 --- /dev/null +++ b/codeclone/surfaces/mcp/tools/checks.py @@ -0,0 +1,36 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool, run_kw + +TOOLS = ( + SimpleMCPTool( + name="check_complexity", + schema=MCPToolSchema(title="Check Complexity"), + runner=lambda session, params: run_kw(session.check_complexity, params), + ), + SimpleMCPTool( + name="check_clones", + schema=MCPToolSchema(title="Check Clones"), + runner=lambda session, params: run_kw(session.check_clones, params), + ), + SimpleMCPTool( + name="check_coupling", + schema=MCPToolSchema(title="Check Coupling"), + runner=lambda session, params: run_kw(session.check_coupling, params), + ), + SimpleMCPTool( + name="check_cohesion", + schema=MCPToolSchema(title="Check Cohesion"), + runner=lambda session, params: run_kw(session.check_cohesion, params), + ), + SimpleMCPTool( + name="check_dead_code", + schema=MCPToolSchema(title="Check Dead Code"), + runner=lambda session, params: run_kw(session.check_dead_code, params), + ), +) diff --git a/codeclone/surfaces/mcp/tools/compare.py b/codeclone/surfaces/mcp/tools/compare.py new file mode 100644 index 0000000..7967800 --- /dev/null +++ b/codeclone/surfaces/mcp/tools/compare.py @@ -0,0 +1,16 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool, run_kw + +TOOLS = ( + SimpleMCPTool( + name="compare_runs", + schema=MCPToolSchema(title="Compare Runs"), + runner=lambda session, params: run_kw(session.compare_runs, params), + ), +) diff --git a/codeclone/surfaces/mcp/tools/findings.py b/codeclone/surfaces/mcp/tools/findings.py new file mode 100644 index 0000000..0897afd --- /dev/null +++ b/codeclone/surfaces/mcp/tools/findings.py @@ -0,0 +1,36 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool, run_kw + +TOOLS = ( + SimpleMCPTool( + name="list_findings", + schema=MCPToolSchema(title="List Findings"), + runner=lambda session, params: run_kw(session.list_findings, params), + ), + SimpleMCPTool( + name="get_finding", + schema=MCPToolSchema(title="Get Finding"), + runner=lambda session, params: run_kw(session.get_finding, params), + ), + SimpleMCPTool( + name="get_remediation", + schema=MCPToolSchema(title="Get Remediation"), + runner=lambda session, params: run_kw(session.get_remediation, params), + ), + SimpleMCPTool( + name="mark_finding_reviewed", + schema=MCPToolSchema(title="Mark Finding Reviewed"), + runner=lambda session, params: run_kw(session.mark_finding_reviewed, params), + ), + SimpleMCPTool( + name="list_reviewed_findings", + schema=MCPToolSchema(title="List Reviewed Findings"), + runner=lambda session, params: run_kw(session.list_reviewed_findings, params), + ), +) diff --git a/codeclone/surfaces/mcp/tools/gates.py b/codeclone/surfaces/mcp/tools/gates.py new file mode 100644 index 0000000..16b44f4 --- /dev/null +++ b/codeclone/surfaces/mcp/tools/gates.py @@ -0,0 +1,27 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from collections.abc import Mapping + +from ..session import MCPGateRequest, MCPServiceContractError +from ._base import MCPToolSchema, SimpleMCPTool + + +def _gate_request(params: Mapping[str, object]) -> MCPGateRequest: + request = params.get("request") + if not isinstance(request, MCPGateRequest): + raise MCPServiceContractError("Tool requires a valid MCPGateRequest.") + return request + + +TOOLS = ( + SimpleMCPTool( + name="evaluate_gates", + schema=MCPToolSchema(title="Evaluate Gates"), + runner=lambda session, params: session.evaluate_gates(_gate_request(params)), + ), +) diff --git a/codeclone/surfaces/mcp/tools/help.py b/codeclone/surfaces/mcp/tools/help.py new file mode 100644 index 0000000..2b64784 --- /dev/null +++ b/codeclone/surfaces/mcp/tools/help.py @@ -0,0 +1,16 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool, run_kw + +TOOLS = ( + SimpleMCPTool( + name="help", + schema=MCPToolSchema(title="Help"), + runner=lambda session, params: run_kw(session.get_help, params), + ), +) diff --git a/codeclone/surfaces/mcp/tools/hotspots.py b/codeclone/surfaces/mcp/tools/hotspots.py new file mode 100644 index 0000000..2b95dfa --- /dev/null +++ b/codeclone/surfaces/mcp/tools/hotspots.py @@ -0,0 +1,21 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool, run_kw + +TOOLS = ( + SimpleMCPTool( + name="list_hotspots", + schema=MCPToolSchema(title="List Hotspots"), + runner=lambda session, params: run_kw(session.list_hotspots, params), + ), + SimpleMCPTool( + name="get_production_triage", + schema=MCPToolSchema(title="Get Production Triage"), + runner=lambda session, params: run_kw(session.get_production_triage, params), + ), +) diff --git a/codeclone/surfaces/mcp/tools/pr.py b/codeclone/surfaces/mcp/tools/pr.py new file mode 100644 index 0000000..8561072 --- /dev/null +++ b/codeclone/surfaces/mcp/tools/pr.py @@ -0,0 +1,16 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool, run_kw + +TOOLS = ( + SimpleMCPTool( + name="generate_pr_summary", + schema=MCPToolSchema(title="Generate PR Summary"), + runner=lambda session, params: run_kw(session.generate_pr_summary, params), + ), +) diff --git a/codeclone/surfaces/mcp/tools/report_section.py b/codeclone/surfaces/mcp/tools/report_section.py new file mode 100644 index 0000000..e53f50b --- /dev/null +++ b/codeclone/surfaces/mcp/tools/report_section.py @@ -0,0 +1,16 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool, run_kw + +TOOLS = ( + SimpleMCPTool( + name="get_report_section", + schema=MCPToolSchema(title="Get Report Section"), + runner=lambda session, params: run_kw(session.get_report_section, params), + ), +) diff --git a/codeclone/surfaces/mcp/tools/runs.py b/codeclone/surfaces/mcp/tools/runs.py new file mode 100644 index 0000000..58189b8 --- /dev/null +++ b/codeclone/surfaces/mcp/tools/runs.py @@ -0,0 +1,27 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 + +from __future__ import annotations + +from ._base import MCPToolSchema, SimpleMCPTool + + +def _run_id(params: dict[str, object]) -> str | None: + value = params.get("run_id") + return value if isinstance(value, str) else None + + +TOOLS = ( + SimpleMCPTool( + name="get_run_summary", + schema=MCPToolSchema(title="Get Run Summary"), + runner=lambda session, params: session.get_run_summary(_run_id(dict(params))), + ), + SimpleMCPTool( + name="clear_session_runs", + schema=MCPToolSchema(title="Clear Session Runs"), + runner=lambda session, _params: session.clear_session_runs(), + ), +) diff --git a/codeclone/ui_messages.py b/codeclone/ui_messages/__init__.py similarity index 81% rename from codeclone/ui_messages.py rename to codeclone/ui_messages/__init__.py index 998fb52..edec8eb 100644 --- a/codeclone/ui_messages.py +++ b/codeclone/ui_messages/__init__.py @@ -7,14 +7,30 @@ from __future__ import annotations import platform +import re import shlex import sys +import textwrap import traceback from pathlib import Path -from . 
import __version__ -from .contracts import ISSUES_URL -from .domain.quality import ( +from .. import __version__ +from ..contracts import ( + DEFAULT_BASELINE_PATH, + DEFAULT_COVERAGE_MIN, + DEFAULT_HTML_REPORT_PATH, + DEFAULT_JSON_REPORT_PATH, + DEFAULT_MARKDOWN_REPORT_PATH, + DEFAULT_MAX_BASELINE_SIZE_MB, + DEFAULT_MAX_CACHE_SIZE_MB, + DEFAULT_MIN_LOC, + DEFAULT_MIN_STMT, + DEFAULT_PROCESSES, + DEFAULT_SARIF_REPORT_PATH, + DEFAULT_TEXT_REPORT_PATH, + ISSUES_URL, +) +from ..domain.quality import ( HEALTH_GRADE_A, HEALTH_GRADE_B, HEALTH_GRADE_C, @@ -31,9 +47,15 @@ HELP_VERSION = "Print the CodeClone version and exit." HELP_ROOT = "Project root directory to scan.\nDefaults to the current directory." -HELP_MIN_LOC = "Minimum Lines of Code (LOC) required for clone analysis.\nDefault: 10." -HELP_MIN_STMT = "Minimum AST statement count required for clone analysis.\nDefault: 6." -HELP_PROCESSES = "Number of parallel worker processes.\nDefault: 4." +HELP_MIN_LOC = ( + "Minimum Lines of Code (LOC) required for clone analysis.\n" + f"Default: {DEFAULT_MIN_LOC}." +) +HELP_MIN_STMT = ( + "Minimum AST statement count required for clone analysis.\n" + f"Default: {DEFAULT_MIN_STMT}." +) +HELP_PROCESSES = f"Number of parallel worker processes.\nDefault: {DEFAULT_PROCESSES}." HELP_CHANGED_ONLY = ( "Limit clone gating and changed-scope summaries to findings that touch\n" "files from a git diff selection." @@ -53,11 +75,15 @@ HELP_CACHE_DIR_LEGACY = ( "Legacy alias for --cache-path.\nPrefer --cache-path in new configurations." ) -HELP_MAX_BASELINE_SIZE_MB = "Maximum allowed baseline size in MB.\nDefault: 5." -HELP_MAX_CACHE_SIZE_MB = "Maximum cache file size in MB.\nDefault: 50." +HELP_MAX_BASELINE_SIZE_MB = ( + f"Maximum allowed baseline size in MB.\nDefault: {DEFAULT_MAX_BASELINE_SIZE_MB}." +) +HELP_MAX_CACHE_SIZE_MB = ( + f"Maximum cache file size in MB.\nDefault: {DEFAULT_MAX_CACHE_SIZE_MB}." +) HELP_BASELINE = ( "Path to the clone baseline.\n" - f"If FILE is omitted, uses {Path('codeclone.baseline.json')}." + f"If FILE is omitted, uses {Path(DEFAULT_BASELINE_PATH)}." ) HELP_UPDATE_BASELINE = ( "Overwrite the clone baseline with current results.\nDisabled by default." @@ -127,7 +153,8 @@ ) HELP_COVERAGE_MIN = ( "Coverage threshold for untested hotspot detection.\n" - "Threshold is a whole percent from 0 to 100.\nDefault: 50." + "Threshold is a whole percent from 0 to 100.\n" + f"Default: {DEFAULT_COVERAGE_MIN}." ) HELP_CI = ( "Enable CI preset.\n" @@ -140,30 +167,30 @@ ) HELP_METRICS_BASELINE = ( "Path to the metrics baseline.\n" - f"If FILE is omitted, uses {Path('codeclone.baseline.json')}." + f"If FILE is omitted, uses {Path(DEFAULT_BASELINE_PATH)}." ) HELP_SKIP_METRICS = "Skip full metrics analysis and run in clone-only mode." HELP_SKIP_DEAD_CODE = "Skip dead code detection." HELP_SKIP_DEPENDENCIES = "Skip dependency graph analysis." HELP_HTML = ( "Generate an HTML report.\n" - "If FILE is omitted, writes to .cache/codeclone/report.html." + f"If FILE is omitted, writes to {DEFAULT_HTML_REPORT_PATH}." ) HELP_JSON = ( "Generate the canonical JSON report.\n" - "If FILE is omitted, writes to .cache/codeclone/report.json." + f"If FILE is omitted, writes to {DEFAULT_JSON_REPORT_PATH}." ) HELP_MD = ( "Generate a Markdown report.\n" - "If FILE is omitted, writes to .cache/codeclone/report.md." + f"If FILE is omitted, writes to {DEFAULT_MARKDOWN_REPORT_PATH}." ) HELP_SARIF = ( "Generate a SARIF 2.1.0 report.\n" - "If FILE is omitted, writes to .cache/codeclone/report.sarif." 
+ f"If FILE is omitted, writes to {DEFAULT_SARIF_REPORT_PATH}." ) HELP_TEXT = ( "Generate a plain-text report.\n" - "If FILE is omitted, writes to .cache/codeclone/report.txt." + f"If FILE is omitted, writes to {DEFAULT_TEXT_REPORT_PATH}." ) HELP_OPEN_HTML_REPORT = ( "Open the generated HTML report in the default browser.\nRequires --html." @@ -216,6 +243,13 @@ " lcom4={lcom_avg}/{lcom_max} cycles={cycles} dead_code={dead}" " health={health}({grade}) overloaded_modules={overloaded_modules}" ) +SUMMARY_COMPACT_DEPENDENCIES = ( + "Dependencies avg={avg_depth} p95={p95_depth} max={max_depth}" +) +SUMMARY_COMPACT_SECURITY_SURFACES = ( + "Security items={items} categories={categories}" + " production={production} tests={tests}" +) SUMMARY_COMPACT_CHANGED_SCOPE = ( "Changed paths={paths} findings={findings} new={new} known={known}" ) @@ -296,9 +330,13 @@ "[dim]Comparison will proceed against an empty baseline.[/dim]\n" f"[dim]{ACTION_UPDATE_BASELINE}[/dim]" ) -ERR_BASELINE_GATING_REQUIRES_TRUSTED = ( +ERR_BASELINE_CI_REQUIRES_TRUSTED = ( f"[error]CI requires a trusted baseline.[/error]\n{ACTION_UPDATE_BASELINE}" ) +ERR_BASELINE_GATING_REQUIRES_TRUSTED = ( + "[error]Baseline-aware gates require a trusted baseline.[/error]\n" + f"{ACTION_UPDATE_BASELINE}" +) SUCCESS_BASELINE_UPDATED = "✔ Baseline updated: {path}" FAIL_NEW_TITLE = "[error]FAILED: New code clones detected.[/error]" @@ -316,6 +354,14 @@ "\n[warning]New clones detected but --fail-on-new not set.[/warning]\n" "Run with --update-baseline to accept them as technical debt." ) +TIP_VSCODE_EXTENSION = ( + "\n[dim]Tip:[/dim] VS Code detected. " + "CodeClone has a native extension for triage-first review and hotspot " + "navigation.\n" + "[dim]{url}[/dim]" +) + +_RICH_MARKUP_TAG_RE = re.compile(r"\[/?[a-zA-Z][a-zA-Z0-9_ .#:-]*]") def version_output(version: str) -> str: @@ -389,6 +435,10 @@ def fmt_cache_save_failed(error: object) -> str: return WARN_CACHE_SAVE_FAILED.format(error=error) +def fmt_vscode_extension_tip(*, url: str) -> str: + return TIP_VSCODE_EXTENSION.format(url=url) + + def fmt_legacy_cache_warning(*, legacy_path: Path, new_path: Path) -> str: return WARN_LEGACY_CACHE.format(legacy_path=legacy_path, new_path=new_path) @@ -397,6 +447,60 @@ def fmt_invalid_baseline(error: object) -> str: return ERR_INVALID_BASELINE.format(error=error) +def fmt_baseline_gating_requires_trusted(*, ci: bool) -> str: + return ( + ERR_BASELINE_CI_REQUIRES_TRUSTED if ci else ERR_BASELINE_GATING_REQUIRES_TRUSTED + ) + + +def fmt_cli_runtime_warning(message: object) -> str: + source = _RICH_MARKUP_TAG_RE.sub("", str(message)).strip() + paragraphs = [ + line.strip() for raw_line in source.splitlines() if (line := raw_line.strip()) + ] + rendered: list[str] = [] + for index, paragraph in enumerate(paragraphs): + label = "Warning" + body = paragraph.rstrip() + lowered = body.lower() + if lowered.startswith("cache "): + label = "Cache" + body = body[6:] + elif lowered.startswith("baseline "): + label = "Baseline" + body = body[9:] + elif lowered.startswith("legacy cache "): + label = "Cache" + + segments = [segment.strip() for segment in body.split("; ") if segment.strip()] + head = segments[0].rstrip(".)") if segments else body.rstrip(".)") + details: list[str] = [] + if " (" in head: + head, extra = head.split(" (", 1) + details.append(extra.rstrip(".)")) + if not details and ": " in head: + head, extra = head.split(": ", 1) + details.append(extra.rstrip(".)")) + details.extend(segment.rstrip(".)") for segment in segments[1:]) + + rendered.append(f" 
[warning]{label}[/warning] {head}") + for detail in details: + rendered.extend( + [ + f" [dim]{wrapped}[/dim]" + for wrapped in textwrap.wrap( + detail, + width=max(40, CLI_LAYOUT_MAX_WIDTH - 8), + break_long_words=False, + break_on_hyphens=False, + ) + ] + ) + if index != len(paragraphs) - 1: + rendered.append("") + return "\n".join(rendered) + + def fmt_path(template: str, path: Path) -> str: return template.format(path=path) @@ -459,6 +563,34 @@ def fmt_summary_compact_metrics( ) +def fmt_summary_compact_dependencies( + *, + avg_depth: float, + p95_depth: int, + max_depth: int, +) -> str: + return SUMMARY_COMPACT_DEPENDENCIES.format( + avg_depth=f"{avg_depth:.1f}", + p95_depth=p95_depth, + max_depth=max_depth, + ) + + +def fmt_summary_compact_security_surfaces( + *, + items: int, + categories: int, + production: int, + tests: int, +) -> str: + return SUMMARY_COMPACT_SECURITY_SURFACES.format( + items=items, + categories=categories, + production=production, + tests=tests, + ) + + def fmt_summary_compact_adoption( *, param_permille: int, @@ -524,7 +656,7 @@ def fmt_summary_compact_coverage_join( HEALTH_GRADE_F: "bold red", } -_L = 12 # label column width (after 2-space indent) +_L = 13 # label column width (after 2-space indent) def _v(n: int, style: str = "") -> str: @@ -630,6 +762,31 @@ def fmt_metrics_cycles(count: int) -> str: return f" {'Cycles':<{_L}}[bold red]{count} detected[/bold red]" +def fmt_metrics_dependencies( + *, avg_depth: float, p95_depth: int, max_depth: int +) -> str: + return ( + f" {'Dependencies':<{_L}}" + f"avg {avg_depth:.1f} · p95 {p95_depth} · max {max_depth}" + ) + + +def fmt_metrics_security_surfaces( + *, + items: int, + categories: int, + production: int, + tests: int, +) -> str: + return ( + f" {'Security':<{_L}}" + f"{_v(items, 'bold cyan')} surfaces" + f" · {_v(categories, 'bold cyan')} categories" + f" · production {_v(production)}" + f" · tests {_v(tests)}" + ) + + def fmt_metrics_dead_code(count: int, *, suppressed: int = 0) -> str: suppressed_suffix = ( f" [dim]({suppressed} suppressed)[/dim]" if suppressed > 0 else "" diff --git a/codeclone/utils/__init__.py b/codeclone/utils/__init__.py new file mode 100644 index 0000000..b7eef7e --- /dev/null +++ b/codeclone/utils/__init__.py @@ -0,0 +1,11 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Shared internal utility helpers.""" + +from __future__ import annotations + +__all__ = ["coerce", "git_diff", "json_io", "schema_validation"] diff --git a/codeclone/_coerce.py b/codeclone/utils/coerce.py similarity index 100% rename from codeclone/_coerce.py rename to codeclone/utils/coerce.py diff --git a/codeclone/_git_diff.py b/codeclone/utils/git_diff.py similarity index 100% rename from codeclone/_git_diff.py rename to codeclone/utils/git_diff.py diff --git a/codeclone/_json_io.py b/codeclone/utils/json_io.py similarity index 96% rename from codeclone/_json_io.py rename to codeclone/utils/json_io.py index c7be355..69dd93c 100644 --- a/codeclone/_json_io.py +++ b/codeclone/utils/json_io.py @@ -10,7 +10,6 @@ import tempfile from json import JSONDecodeError from pathlib import Path -from typing import Any import orjson @@ -40,7 +39,7 @@ def read_json_document(path: Path) -> object: return orjson.loads(path.read_bytes()) -def read_json_object(path: Path) -> dict[str, Any]: +def read_json_object(path: Path) -> dict[str, object]: payload = read_json_document(path) if not isinstance(payload, dict): raise TypeError("JSON payload must be an object") diff --git a/codeclone/_schema_validation.py b/codeclone/utils/schema_validation.py similarity index 95% rename from codeclone/_schema_validation.py rename to codeclone/utils/schema_validation.py index e90404f..8233eeb 100644 --- a/codeclone/_schema_validation.py +++ b/codeclone/utils/schema_validation.py @@ -8,7 +8,7 @@ from typing import TYPE_CHECKING -from .errors import BaselineValidationError +from ..contracts.errors import BaselineValidationError if TYPE_CHECKING: from collections.abc import Mapping, Set diff --git a/docs/README.md b/docs/README.md index 2d04be9..47fc996 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,11 +4,15 @@ This site is built with MkDocs and published to [orenlab.github.io/codeclone](https://orenlab.github.io/codeclone/). !!! note "Version Notice" -This site currently documents the in-development `v2.0.x` line from `main`. -For the latest stable CodeClone documentation (`v1.4.4`), see the -[`v1.4.4` README](https://github.com/orenlab/codeclone/blob/v1.4.4/README.md) -and the -[`v1.4.4` docs tree](https://github.com/orenlab/codeclone/tree/v1.4.4/docs). + This site currently documents the in-development `v2.0.x` line from `main`. + For the latest stable CodeClone documentation (`v1.4.4`), see the + [`v1.4.4` README](https://github.com/orenlab/codeclone/blob/v1.4.4/README.md) + and the + [`v1.4.4` docs tree](https://github.com/orenlab/codeclone/tree/v1.4.4/docs). + +!!! note "Repository licensing" + CodeClone source code is licensed under MPL-2.0. Documentation content + under `docs/` and the published docs site is licensed under MIT. It has two documentation layers: @@ -38,8 +42,8 @@ repository build: - [Config and defaults](book/04-config-and-defaults.md) - [Core pipeline and invariants](book/05-core-pipeline.md) - [Baseline contract (schema v2.1)](book/06-baseline.md) -- [Cache contract (schema v2.5)](book/07-cache.md) -- [Report contract (schema v2.8)](book/08-report.md) +- [Cache contract (schema v2.6)](book/07-cache.md) +- [Report contract (schema v2.10)](book/08-report.md) ## Interfaces @@ -91,13 +95,22 @@ help topics when the connected server exposes them. 
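For readers wiring this up programmatically: the same help topics are reachable in-process through the service layer introduced in this change. A hedged sketch follows, assuming `"overview"` is a valid `HelpTopic` value; the topic enum itself is not shown in this diff.

```python
# Illustrative only: query the MCP help surface without a running server.
# The import path matches codeclone/surfaces/mcp/service.py from this diff;
# the topic value "overview" is an assumption, not confirmed by the diff.
from codeclone.surfaces.mcp.service import CodeCloneMCPService

service = CodeCloneMCPService()
payload = service.get_help(topic="overview", detail="compact")
print(sorted(payload.keys()))
```

Note the keyword-only call shape, matching the signatures applied in `_apply_public_method_signatures()`.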
## Local Preview -Build the docs site with MkDocs, then generate the sample report into the built -site: +=== "Build the site" + + ```bash title="Validate the docs site" + uv run --with mkdocs --with mkdocs-material mkdocs build --strict + ``` + +=== "Build the site and sample report" + + ```bash title="Generate the live sample report into the built site" + uv run --with mkdocs --with mkdocs-material mkdocs build --strict + uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live + ``` -```bash -uv run --with mkdocs --with mkdocs-material mkdocs build --strict -uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live -``` +!!! note "Generated output" + `site/` is generated output. It is used for local preview and GitHub Pages + publishing, but it is not committed to git. GitHub Pages publishing is handled by [`docs.yml`](https://github.com/orenlab/codeclone/blob/main/.github/workflows/docs.yml) diff --git a/docs/architecture.md b/docs/architecture.md index 43fab28..d4ae3e8 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -144,7 +144,7 @@ gating decisions. Detected findings can be rendered as: - interactive HTML (`--html`), -- canonical JSON (`--json`, schema `2.8`), +- canonical JSON (`--json`, schema `2.10`), - deterministic text projection (`--text`), - deterministic Markdown projection (`--md`), - deterministic SARIF projection (`--sarif`). @@ -193,10 +193,10 @@ Current shape: Operational note: -- `codeclone/mcp_server.py` is only a thin launcher/registration layer. +- `codeclone/surfaces/mcp/server.py` is only a thin launcher/registration layer. - The optional MCP runtime is imported lazily so the base `codeclone` install and normal CI paths do not require MCP packages. -- `codeclone/mcp_service.py` is the in-process adapter over the existing +- `codeclone/surfaces/mcp/service.py` is the in-process adapter over the existing pipeline/report contracts. The MCP layer is intentionally thin. It does not add a separate analysis engine; @@ -274,7 +274,7 @@ baseline/options contract violations. ## Python Tag Consistency for Baseline Checks Due to inherent AST differences across interpreter builds, baseline compatibility -is pinned to `python_tag` (for example `cp313`). +is pinned to `python_tag` (for example `cp314`). This preserves deterministic and reproducible clone detection results while allowing patch updates within the same interpreter tag. diff --git a/docs/book/00-intro.md b/docs/book/00-intro.md index 58f04e2..5d53853 100644 --- a/docs/book/00-intro.md +++ b/docs/book/00-intro.md @@ -7,9 +7,9 @@ describes only behavior that is present in code and/or locked by tests. 
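On the interpreter-tag pinning mentioned in the architecture notes above: a tag such as `cp314` can be derived from the running interpreter roughly as follows. This is a sketch of the idea only, not necessarily the helper CodeClone itself uses.

```python
# Sketch: derive a CPython-style interpreter tag (e.g. "cp314") of the kind
# baseline compatibility checks pin against. Hypothetical helper, not taken
# from the CodeClone codebase.
import platform
import sys


def python_tag() -> str:
    impl = "cp" if platform.python_implementation() == "CPython" else "py"
    return f"{impl}{sys.version_info.major}{sys.version_info.minor}"


print(python_tag())  # "cp314" on CPython 3.14
```

Pinning on the tag rather than the full version string is what lets patch releases within one interpreter line reuse a baseline.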
## Public surface -- CLI entrypoint: `codeclone/cli.py:main` +- CLI entrypoint: `codeclone/main.py:main` - Package version: `codeclone/__init__.py:__version__` -- Global contract constants: `codeclone/contracts.py` +- Global contract constants: `codeclone/contracts/__init__.py` ## Contracts @@ -22,10 +22,10 @@ version, same baseline/cache/report schemas): Refs: -- `codeclone/report/json_contract.py:build_report_document` -- `codeclone/baseline.py:Baseline.verify_compatibility` -- `codeclone/cache.py:Cache.load` -- `codeclone/contracts.py:ExitCode` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/baseline/clone_baseline.py:Baseline.verify_compatibility` +- `codeclone/cache/store.py:Cache.load` +- `codeclone/contracts/__init__.py:ExitCode` ## Invariants (MUST) @@ -35,9 +35,9 @@ Refs: Refs: -- `codeclone/cli.py:_main_impl` -- `codeclone/baseline.py:BASELINE_UNTRUSTED_STATUSES` -- `codeclone/cache.py:Cache._ignore_cache` +- `codeclone/surfaces/cli/workflow.py:_main_impl` +- `codeclone/baseline/trust.py:BASELINE_UNTRUSTED_STATUSES` +- `codeclone/cache/store.py:Cache._ignore_cache` ## Failure modes @@ -50,8 +50,8 @@ Refs: Refs: -- `codeclone/cli.py:_main_impl` -- `codeclone/cli.py:main` +- `codeclone/surfaces/cli/workflow.py:_main_impl` +- `codeclone/main.py:main` ## Determinism / canonicalization @@ -62,9 +62,9 @@ Refs: Refs: - `codeclone/scanner.py:iter_py_files` -- `codeclone/report/json_contract.py:build_report_document` -- `codeclone/baseline.py:_compute_payload_sha256` -- `codeclone/cache.py:_canonical_json` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/baseline/trust.py:_compute_payload_sha256` +- `codeclone/cache/integrity.py:canonical_json` ## Locked by tests diff --git a/docs/book/01-architecture-map.md b/docs/book/01-architecture-map.md index 51b7980..61d66a7 100644 --- a/docs/book/01-architecture-map.md +++ b/docs/book/01-architecture-map.md @@ -2,135 +2,125 @@ ## Purpose -Document current module boundaries and ownership in CodeClone v2.x. +Document the current module boundaries and ownership in CodeClone `2.0.x`. ## Public surface Main ownership layers: -- Core detection pipeline: `scanner` -> `extractor` -> `cfg/normalize/blocks` -> `grouping`. -- Quality metrics pipeline: complexity/coupling/cohesion/dependencies/dead-code/health. -- Contracts and persistence: baseline, metrics baseline, cache, exit semantics. -- Report model and projections: canonical JSON + deterministic TXT/Markdown/SARIF + explainability facts. -- MCP agent surface: read-only server layer over the same pipeline/report contracts. -- VS Code extension surface: native IDE client over the MCP layer and the same canonical report semantics, with - limited Restricted Mode, source-first review flow, and factual overview surfaces such as `Coverage Join` when MCP - exposes them. -- Claude Desktop bundle surface: installable local `.mcpb` wrapper that launches the same `codeclone-mcp` server for - Claude Desktop without introducing a second MCP or analysis layer. -- Codex plugin surface: repo-local Codex plugin under `plugins/` and `.agents/plugins/marketplace.json` that adds - native plugin discovery, a local MCP definition, and a CodeClone review skill over the same server. -- Render layer: HTML rendering and template assets. 
+- CLI entry and UX orchestration +- Config parsing and pyproject resolution +- Core runtime pipeline +- Analysis and clone grouping +- Metrics and findings +- Baseline/cache persistence contracts +- Canonical report document and deterministic projections +- HTML render-only surface +- Read-only MCP surface +- IDE/client surfaces over MCP ## Data model -| Layer | Modules | Responsibility | -|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------| -| Contracts | `codeclone/contracts.py`, `codeclone/errors.py` | Shared schema versions, URLs, exit-code enum, typed exceptions | -| Domain models | `codeclone/models.py`, `codeclone/domain/*.py` | Typed dataclasses/enums plus centralized finding/scope/severity taxonomies | -| Discovery + parsing | `codeclone/scanner.py`, `codeclone/extractor.py` | Enumerate files, parse AST, extract function/block/segment units | -| Structural analysis | `codeclone/cfg.py`, `codeclone/normalize.py`, `codeclone/fingerprint.py`, `codeclone/blocks.py` | CFG, normalization, statement hashes, block/segment windows | -| Grouping | `codeclone/grouping.py` | Build function/block/segment groups | -| Metrics | `codeclone/metrics/*` | Compute complexity/coupling/cohesion/dependency/dead-code/health signals | -| Report core | `codeclone/report/*`, `codeclone/_cli_meta.py` | Canonical report building, deterministic projections, explainability facts, and shared metadata | -| Persistence | `codeclone/baseline.py`, `codeclone/metrics_baseline.py`, `codeclone/cache.py` | Baseline/cache trust/compat/integrity and atomic persistence | -| Runtime orchestration | `codeclone/pipeline.py`, `codeclone/cli.py`, `codeclone/_cli_args.py`, `codeclone/_cli_paths.py`, `codeclone/_cli_summary.py`, `codeclone/_cli_config.py`, `codeclone/ui_messages.py` | CLI UX, stage orchestration, status handling, outputs, error markers | -| MCP agent interface | `codeclone/mcp_service.py`, `codeclone/mcp_server.py` | Read-only MCP tools/resources over canonical analysis and report layers | -| VS Code extension | `extensions/vscode-codeclone/*` | Native VS Code control surface over MCP, with limited Restricted Mode, triage-first review, and source-first drill-down | -| Claude Desktop bundle | `extensions/claude-desktop-codeclone/*` | Installable local MCPB wrapper over `codeclone-mcp`, keeping Claude Desktop on the canonical read-only MCP surface | -| Codex plugin | `plugins/codeclone/*`, `.agents/plugins/marketplace.json` | Native Codex plugin surface over `codeclone-mcp`, with repo-local discovery metadata and CodeClone skill guidance | -| Rendering | `codeclone/html_report.py`, `codeclone/_html_report/*`, `codeclone/_html_badges.py`, `codeclone/_html_js.py`, `codeclone/_html_escape.py`, `codeclone/_html_snippets.py`, `codeclone/templates.py` | HTML-only view layer over report data | +| Layer | Modules | Responsibility | +|-------------------------|-------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------| +| Entry | `codeclone/main.py` | Public CLI entrypoint only | +| CLI surface | `codeclone/surfaces/cli/*`, `codeclone/ui_messages/*` | Parse args, resolve runtime mode, print 
summaries, write outputs, route exits | +| Config | `codeclone/config/*` | Option specs, parser construction, pyproject loading, CLI > pyproject > defaults merge | +| Core runtime | `codeclone/core/*` | Bootstrap, discovery, worker processing, project metrics, report/gate integration | +| Analysis | `codeclone/analysis/*`, `codeclone/blocks/*`, `codeclone/paths/*`, `codeclone/qualnames/*` | Parse source, normalize AST/CFG facts, extract units, prepare deterministic analysis inputs | +| Findings | `codeclone/findings/clones/*`, `codeclone/findings/structural/*` | Clone grouping and structural finding derivation | +| Metrics | `codeclone/metrics/*` | Complexity, coupling, cohesion, dependencies, dead code, health, adoption, coverage join, API surface | +| Contracts/domain | `codeclone/contracts/*`, `codeclone/models.py`, `codeclone/domain/*` | Version constants, enums, typed exceptions, shared models, domain taxonomies | +| Persistence | `codeclone/baseline/*`, `codeclone/cache/*` | Trusted comparison state and optimization-only cache contracts | +| Canonical report | `codeclone/report/document/*`, `codeclone/report/gates/*`, `codeclone/report/*.py` | Canonical report payload, derived projections, explainability, suggestions, gate reasons | +| Deterministic renderers | `codeclone/report/renderers/*` | Text/Markdown/SARIF/JSON projections over the canonical report | +| HTML render layer | `codeclone/report/html/*` | Render-only HTML view over canonical report/meta facts | +| MCP surface | `codeclone/surfaces/mcp/*` | Read-only MCP tools/resources over the same pipeline/report contracts | +| Client surfaces | `extensions/vscode-codeclone/*`, `extensions/claude-desktop-codeclone/*`, `plugins/codeclone/*` | Native clients/install surfaces over `codeclone-mcp` | Refs: -- `codeclone/pipeline.py` -- `codeclone/cli.py:_main_impl` +- `codeclone/main.py:main` +- `codeclone/surfaces/cli/workflow.py:_main_impl` +- `codeclone/core/pipeline.py:analyze` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/report/html/assemble.py:build_html_report` +- `codeclone/surfaces/mcp/server.py:build_mcp_server` ## Contracts -- Core analysis modules do not depend on render/UI modules. -- HTML renderer receives already-computed report data/facts and does not - recompute detection semantics. -- MCP layer reuses current pipeline/report semantics and must not introduce a - separate analysis truth path. -- The VS Code extension follows the same rule through MCP: it is a client - integration surface over canonical report semantics, not a separate analyzer. -- The Claude Desktop bundle follows the same rule: it is a local installation - and launcher surface over `codeclone-mcp`, not a second server. -- The Codex plugin follows the same rule: it is a local discovery and skills - surface over `codeclone-mcp`, not a second analyzer or report model. -- MCP may ship task-specific slim projections (for example, summary-only metrics - or inventory counts) as long as canonical report data remains the source of - truth and richer detail stays reachable through dedicated tools/sections. -- The same rule applies to bounded semantic routing tools such as - `help(topic=...)`: they explain contract meaning and route agents to the - safest next step, but they do not introduce a second documentation or truth - model. -- The same rule applies to summary cache convenience fields such as - `freshness` and to production-first triage projections built from - canonical hotlists/suggestions. 
-- The same rule also applies to compact interpretation hints such as - `health_scope`, `focus`, and `new_by_source_kind`: they clarify projection - meaning without introducing a second report truth. -- MCP finding lists may also expose short run/finding ids and slimmer relative - location projections, while keeping `get_finding(detail_level="full")` as the - richer per-finding inspection path. -- Baseline, metrics baseline, and cache are validated before being trusted. +- Core produces facts; renderers present facts. +- `codeclone/report/document/*` is the canonical report source of truth. +- HTML, Markdown, SARIF, text, and MCP are projections over the same canonical report semantics. +- Baseline and cache are persistence contracts, not analysis truth. +- Cache is optimization-only and fail-open. +- MCP is read-only and must not create a second analysis truth path. +- VS Code, Claude Desktop, and Codex plugin surfaces are clients over MCP, not second analyzers. Refs: -- `codeclone/report/json_contract.py:build_report_document` -- `codeclone/html_report.py:build_html_report` -- `codeclone/baseline.py:Baseline.load` -- `codeclone/metrics_baseline.py:MetricsBaseline.load` -- `codeclone/cache.py:Cache.load` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/report/renderers/text.py:render_text_report_document` +- `codeclone/report/renderers/markdown.py:render_markdown_report_document` +- `codeclone/report/renderers/sarif.py:render_sarif_report_document` +- `codeclone/report/html/assemble.py:build_html_report` +- `codeclone/baseline/clone_baseline.py:Baseline.load` +- `codeclone/baseline/metrics_baseline.py:MetricsBaseline.load` +- `codeclone/cache/store.py:Cache.load` ## Invariants (MUST) - Report serialization is deterministic and schema-versioned. -- UI is render-only and must not change gating semantics. -- Status enums remain domain-owned in baseline/metrics-baseline/cache modules. +- UI is render-only and must not invent gating semantics. +- Status enums remain domain-owned in baseline/metrics-baseline/cache/contracts modules. +- `codeclone/main.py` stays thin; orchestration lives in `codeclone/surfaces/cli/*`. 
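+
+A minimal sketch of the render-only rule above; the function name and the
+document shape are illustrative (the shipped renderers live in
+`codeclone/report/renderers/*`):
+
+```python title="Render-only projection sketch (illustrative names and shape)"
+def render_text(document: dict) -> str:
+    """Project an already-built canonical report document to plain text.
+
+    Render-only: reads facts the canonical builder produced and never
+    recomputes groups, scores, or gating decisions.
+    """
+    findings = document.get("findings", {})  # assumed family -> groups mapping
+    lines = [f"report schema: {document['report_schema_version']}"]
+    for family in sorted(findings):  # deterministic: explicit key sort
+        lines.append(f"{family}: {len(findings[family])} group(s)")
+    return "\n".join(lines)
+```
+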
Refs: -- `codeclone/report/json_contract.py:build_report_document` -- `codeclone/report/explain.py:build_block_group_facts` -- `codeclone/baseline.py:BaselineStatus` -- `codeclone/metrics_baseline.py:MetricsBaselineStatus` -- `codeclone/cache.py:CacheStatus` +- `codeclone/report/document/integrity.py:_build_integrity_payload` +- `codeclone/report/document/inventory.py:_build_inventory_payload` +- `codeclone/baseline/trust.py:BaselineStatus` +- `codeclone/baseline/_metrics_baseline_contract.py:MetricsBaselineStatus` +- `codeclone/cache/versioning.py:CacheStatus` +- `codeclone/contracts/__init__.py:ExitCode` ## Failure modes -| Condition | Layer | -|--------------------------------------------|---------------------------------------------------| -| Invalid CLI args / invalid output path | Runtime orchestration (`_cli_args`, `_cli_paths`) | -| Baseline schema/integrity mismatch | Baseline contract layer | -| Metrics baseline schema/integrity mismatch | Metrics baseline contract layer | -| Cache corruption/version mismatch | Cache contract layer (fail-open) | -| HTML snippet read failure | Render layer fallback snippet | +| Condition | Layer | +|--------------------------------------------------|----------------------------------------------------------------| +| Invalid CLI args / invalid output path | CLI surface (`codeclone/config/*`, `codeclone/surfaces/cli/*`) | +| Baseline schema/integrity mismatch | Baseline contract layer | +| Metrics baseline schema/integrity mismatch | Metrics-baseline contract layer | +| Cache corruption/version mismatch | Cache contract layer (fail-open) | +| HTML snippet read failure | HTML render layer fallback snippet | +| MCP invalid request / invalid root / unknown run | MCP surface | ## Determinism / canonicalization -- File iteration and group key ordering are explicit sorts. -- Report serializer uses fixed record layouts and sorted keys. +- File iteration and grouping order are explicit sorts. +- Canonical report integrity excludes non-canonical `derived` payload. +- Baseline and cache hashes/signatures use canonical JSON. Refs: - `codeclone/scanner.py:iter_py_files` -- `codeclone/report/json_contract.py:build_report_document` +- `codeclone/report/document/integrity.py:_build_integrity_payload` +- `codeclone/baseline/trust.py:_compute_payload_sha256` +- `codeclone/cache/integrity.py:canonical_json` ## Locked by tests +- `tests/test_architecture.py::test_architecture_layer_violations` - `tests/test_report.py::test_report_json_compact_v21_contract` +- `tests/test_report_contract_coverage.py::test_report_document_rich_invariants_and_renderers` - `tests/test_html_report.py::test_html_report_uses_core_block_group_facts` - `tests/test_cache.py::test_cache_v13_uses_relpaths_when_root_set` -- `tests/test_cli_unit.py::test_argument_parser_contract_error_marker_for_invalid_args` -- `tests/test_architecture.py::test_architecture_layer_violations` +- `tests/test_mcp_service.py::test_mcp_service_analyze_repository_registers_latest_run` ## Non-guarantees -- Internal module split may evolve in v2.x if public contracts are preserved. -- Import tree acyclicity is policy and test-enforced where explicitly asserted. +- Internal file splits may evolve in `2.0.x` if public contracts are preserved. +- Package markers and internal helper placement are not contract by themselves. 
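+
+As a concrete illustration of the explicit-sort rule above, a grouping pass
+over the documented `fingerprint|loc_bucket` key might look like this; the
+unit field names are assumptions borrowed from the cache wire rows, and the
+shipped implementation lives in `codeclone/findings/clones/grouping.py`:
+
+```python title="Deterministic grouping sketch (assumed unit fields)"
+from collections import defaultdict
+
+
+def build_function_groups(units: list[dict]) -> dict[str, list[str]]:
+    """Group function units by the documented "fingerprint|loc_bucket" key.
+
+    Both the key order and each member list use explicit sorts, so repeated
+    runs over the same inputs serialize identically.
+    """
+    groups: dict[str, list[str]] = defaultdict(list)
+    for unit in units:
+        key = f"{unit['fingerprint']}|{unit['loc_bucket']}"
+        groups[key].append(unit["qualname"])
+    return {key: sorted(members) for key, members in sorted(groups.items())}
+```
+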
## Chapter map @@ -143,11 +133,7 @@ Refs: | Cache trust and fail-open behavior | [07-cache.md](07-cache.md) | | Report schema and provenance | [08-report.md](08-report.md), [10-html-render.md](10-html-render.md) | | MCP agent surface | [20-mcp-interface.md](20-mcp-interface.md) | -| VS Code IDE surface | [21-vscode-extension.md](21-vscode-extension.md) | -| Claude Desktop install surface | [22-claude-desktop-bundle.md](22-claude-desktop-bundle.md) | -| Codex plugin surface | [23-codex-plugin.md](23-codex-plugin.md) | | Health score model | [15-health-score.md](15-health-score.md) | | Metrics gates and metrics baseline | [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) | | Dead-code liveness policy | [16-dead-code-contract.md](16-dead-code-contract.md) | -| Suggestions and clone typing | [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) | | Determinism and versioning policy | [12-determinism.md](12-determinism.md), [14-compatibility-and-versioning.md](14-compatibility-and-versioning.md) | diff --git a/docs/book/02-terminology.md b/docs/book/02-terminology.md index 316f483..fe3f22b 100644 --- a/docs/book/02-terminology.md +++ b/docs/book/02-terminology.md @@ -6,88 +6,65 @@ Define terms exactly as used by code and tests. ## Public surface -- Baseline identifiers and statuses: `codeclone/baseline.py` -- Cache statuses and compact layout: `codeclone/cache.py` -- Report schema and group layouts: `codeclone/report/json_contract.py` +- Baseline identifiers and statuses: `codeclone/baseline/*` +- Cache statuses and compact layout: `codeclone/cache/*` +- Report schema and group layouts: `codeclone/report/document/*` ## Data model -- **fingerprint**: function-level CFG fingerprint (`sha1`) + LOC bucket key. -- **block_hash**: ordered sequence of normalized statement hashes in a fixed window. -- **segment_hash**: hash of ordered segment window. -- **segment_sig**: hash of sorted segment window (candidate grouping signature). -- **stable structure facts**: per-function deterministic structure profile fields - (`entry_guard_*`, `terminal_kind`, `try_finally_profile`, - `side_effect_order_profile`) reused by report families. -- **cohort structural findings**: report-only structural families derived from - existing function-clone groups (`clone_guard_exit_divergence`, - `clone_cohort_drift`). -- **python_tag**: runtime compatibility tag like `cp313`. +- **fingerprint**: function-level CFG fingerprint (`sha1`) plus LOC bucket +- **block_hash**: ordered sequence of normalized statement hashes in a fixed window +- **segment_hash**: hash of an ordered segment window +- **segment_sig**: hash of a sorted segment window used for candidate grouping +- **python_tag**: runtime compatibility tag like `cp314` - **schema_version**: - - baseline schema (`meta.schema_version`) for baseline compatibility. - - cache schema (`v`) for cache compatibility. - - report schema (`report_schema_version`) for report format compatibility. -- **payload_sha256**: canonical baseline semantic hash. -- **trusted baseline**: baseline loaded + status `ok`. -- **source_kind**: file classification — `production`, `tests`, `fixtures`, `other` — - determined by scanner path rules. Drives source-scope breakdown and - hotspot attribution. -- **health score**: weighted blend of seven dimension scores (0–100). - Dimensions: clones 25%, complexity 20%, cohesion 15%, coupling 10%, - dead code 10%, dependencies 10%, coverage 10%. 
- Report-only layers such as `Overloaded Modules` do not currently affect the score. - Grade bands: A ≥90, B ≥75, C ≥60, D ≥40, F <40. -- **design finding**: metric-driven finding (complexity/coupling/cohesion) - emitted by the canonical report builder when a class or function exceeds - the report-level design threshold. Thresholds are stored in - `meta.analysis_thresholds.design_findings`. -- **suggestion**: advisory recommendation card derived from clones, structural - findings, or metric violations. Advisory only — never gates CI. -- **production_hotspot**: finding group whose items are concentrated in - production source scope (`source_kind=production`). -- **freshness**: MCP cache indicator (`fresh` / `mixed` / `reused`) - reflecting how much of the analysis was recomputed vs cache-served. -- **directory_hotspot**: derived aggregation in `derived.overview` showing - which directories concentrate the most findings by category. + - baseline schema in `meta.schema_version` + - cache schema in top-level `v` + - report schema in `report_schema_version` +- **payload_sha256**: canonical baseline semantic hash +- **trusted baseline**: baseline loaded with status `ok` +- **source_kind**: file classification `production | tests | fixtures | other` +- **design finding**: metric-driven finding emitted by the canonical report builder using + `meta.analysis_thresholds.design_findings` +- **suggestion**: advisory recommendation card derived from findings/metrics; never gates CI +- **directory_hotspot**: derived aggregation showing where findings cluster by category Refs: -- `codeclone/grouping.py:build_groups` -- `codeclone/blocks.py:extract_blocks` -- `codeclone/blocks.py:extract_segments` -- `codeclone/baseline.py:current_python_tag` -- `codeclone/baseline.py:Baseline.verify_compatibility` +- `codeclone/findings/clones/grouping.py:build_groups` +- `codeclone/blocks/__init__.py` +- `codeclone/baseline/trust.py:current_python_tag` +- `codeclone/baseline/clone_baseline.py:Baseline.verify_compatibility` - `codeclone/scanner.py:classify_source_kind` - `codeclone/metrics/health.py:compute_health` -- `codeclone/report/json_contract.py:_design_findings_thresholds_payload` +- `codeclone/report/document/_common.py:_design_findings_thresholds_payload` - `codeclone/report/suggestions.py:generate_suggestions` - `codeclone/report/overview.py:build_directory_hotspots` ## Contracts -- New/known classification is key-based, not item-heuristic-based. +- New/known classification is key-based, not heuristic-based. - Baseline trust is status-driven. - Cache trust is status-driven and independent from baseline trust. -- Design finding universe is determined solely by the canonical report builder; - MCP and HTML read, never resynthesize. +- Design finding universe is determined by the canonical report builder; MCP and HTML read it, never resynthesize it. - Suggestions are advisory and never affect exit code. Refs: -- `codeclone/report/json_contract.py:build_report_document` -- `codeclone/cli.py:_main_impl` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Invariants (MUST) -- Function group key format: `fingerprint|loc_bucket`. -- Block group key format: `block_hash`. -- Segment group key format: `segment_hash|qualname` (internal/report-only grouping path). 
+- Function group key format: `fingerprint|loc_bucket` +- Block group key format: `block_hash` +- Segment group key format: `segment_hash|qualname` Refs: -- `codeclone/grouping.py:build_groups` -- `codeclone/grouping.py:build_block_groups` -- `codeclone/grouping.py:build_segment_groups` +- `codeclone/findings/clones/grouping.py:build_groups` +- `codeclone/findings/clones/grouping.py:build_block_groups` +- `codeclone/findings/clones/grouping.py:build_segment_groups` ## Failure modes @@ -99,8 +76,8 @@ Refs: Refs: -- `codeclone/baseline.py:Baseline.verify_compatibility` -- `codeclone/cache.py:Cache.load` +- `codeclone/baseline/clone_baseline.py:Baseline.verify_compatibility` +- `codeclone/cache/store.py:Cache.load` ## Determinism / canonicalization @@ -109,8 +86,8 @@ Refs: Refs: -- `codeclone/baseline.py:_require_sorted_unique_ids` -- `codeclone/cache.py:_encode_wire_file_entry` +- `codeclone/baseline/trust.py:_require_sorted_unique_ids` +- `codeclone/cache/_wire_encode.py:_encode_wire_file_entry` ## Locked by tests diff --git a/docs/book/03-contracts-exit-codes.md b/docs/book/03-contracts-exit-codes.md index 89a1747..3a7345c 100644 --- a/docs/book/03-contracts-exit-codes.md +++ b/docs/book/03-contracts-exit-codes.md @@ -6,59 +6,61 @@ Define stable process exit semantics and category boundaries. ## Public surface -- Exit enum: `codeclone/contracts.py:ExitCode` -- CLI categorization and exits: `codeclone/cli.py:_main_impl`, `codeclone/cli.py:main` -- Error markers: `codeclone/ui_messages.py` +- Exit enum: `codeclone/contracts/__init__.py:ExitCode` +- CLI entry: `codeclone/main.py:main` +- CLI orchestration: `codeclone/surfaces/cli/workflow.py:_main_impl` +- Error markers/formatters: `codeclone/ui_messages/__init__.py` ## Data model -| Exit code | Category | Meaning | -|-----------|----------------|------------------------------------------------------------------------------------------------------------------------| -| 0 | success | Run completed without gating failures | -| 2 | contract error | Input/contract violation (baseline trust, output path/ext, invalid CLI flag combinations, unreadable source in gating) | -| 3 | gating failure | Analysis succeeded but policy failed (`--fail-on-new`, `--fail-threshold`, metrics gates) | -| 5 | internal error | Unexpected exception escaped `_main_impl` | +| Exit code | Category | Meaning | +|-----------|----------------|-----------------------------------------------------| +| `0` | success | Run completed without gating failures | +| `2` | contract error | Input or contract violation | +| `3` | gating failure | Analysis succeeded but policy failed | +| `5` | internal error | Unexpected exception escaped top-level CLI handling | Refs: -- `codeclone/contracts.py:ExitCode` -- `codeclone/_cli_args.py:_ArgumentParser.error` +- `codeclone/contracts/__init__.py:ExitCode` +- `codeclone/config/argparse_builder.py:_ArgumentParser.error` ## Contracts -- Contract errors must use `CONTRACT ERROR:` marker. -- Gating failures must use `GATING FAILURE:` marker. -- Internal errors are formatted by `fmt_internal_error`; traceback hidden unless debug enabled. +- Contract errors use the `CONTRACT ERROR:` marker. +- Gating failures use the `GATING FAILURE:` marker. +- Internal errors use `INTERNAL ERROR:` and hide traceback unless debug is enabled. +- `main()` lets `SystemExit` from contract/gating paths pass through unchanged. 
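+
+A minimal sketch of that convention, assuming a hypothetical `_main_impl`
+stand-in (the shipped orchestration lives in
+`codeclone/surfaces/cli/workflow.py`):
+
+```python title="Top-level exit convention sketch (hypothetical _main_impl)"
+import os
+import sys
+import traceback
+
+EXIT_INTERNAL_ERROR = 5
+
+
+def _main_impl() -> int:
+    """Stand-in for the orchestration layer; contract/gating paths raise
+    SystemExit(2) / SystemExit(3) with their markers already printed."""
+    return 0
+
+
+def main() -> int:
+    try:
+        return _main_impl()
+    except SystemExit:
+        raise  # exits 2 and 3 pass through unchanged
+    except Exception as exc:  # only unexpected failures become exit 5
+        print(f"INTERNAL ERROR: {exc}", file=sys.stderr)
+        if os.environ.get("CODECLONE_DEBUG") == "1":  # documented debug switch
+            traceback.print_exc()
+        return EXIT_INTERNAL_ERROR
+```
+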
Refs: -- `codeclone/ui_messages.py:fmt_contract_error` -- `codeclone/ui_messages.py:fmt_gating_failure` -- `codeclone/ui_messages.py:fmt_internal_error` +- `codeclone/ui_messages/__init__.py:MARKER_CONTRACT_ERROR` +- `codeclone/ui_messages/__init__.py:MARKER_INTERNAL_ERROR` +- `codeclone/ui_messages/__init__.py:fmt_contract_error` +- `codeclone/ui_messages/__init__.py:fmt_gating_failure` +- `codeclone/ui_messages/__init__.py:fmt_internal_error` ## Invariants (MUST) -- `SystemExit` from contract/gating paths must pass through `main()` unchanged. -- Only non-`SystemExit` exceptions in `main()` become exit 5. -- In gating mode, unreadable source files force exit 2 even if clone gating would also fail. +- Only non-`SystemExit` exceptions in `main()` become exit `5`. +- In gating mode, unreadable source files win over clone/metric gate failure and force exit `2`. Refs: -- `codeclone/cli.py:main` -- `codeclone/cli.py:_main_impl` +- `codeclone/main.py:main` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Failure modes -| Condition | Marker | Exit | -|----------------------------------------------|----------------|------| -| Invalid output extension | CONTRACT ERROR | 2 | -| `--open-html-report` without `--html` | CONTRACT ERROR | 2 | -| `--timestamped-report-paths` without reports | CONTRACT ERROR | 2 | -| Untrusted baseline in CI/gating | CONTRACT ERROR | 2 | -| Unreadable source in CI/gating | CONTRACT ERROR | 2 | -| New clones with `--fail-on-new` | GATING FAILURE | 3 | -| Threshold exceeded | GATING FAILURE | 3 | -| Unexpected exception in main pipeline | INTERNAL ERROR | 5 | +| Condition | Marker | Exit | +|--------------------------------------------|------------------|------| +| Invalid output extension/path | `CONTRACT ERROR` | `2` | +| Invalid CLI flag combination | `CONTRACT ERROR` | `2` | +| Untrusted baseline in CI/gating | `CONTRACT ERROR` | `2` | +| Unreadable source in CI/gating | `CONTRACT ERROR` | `2` | +| New clones with `--fail-on-new` | `GATING FAILURE` | `3` | +| Threshold or metrics gate breach | `GATING FAILURE` | `3` | +| Unexpected exception in top-level CLI path | `INTERNAL ERROR` | `5` | ## Determinism / canonicalization @@ -67,8 +69,8 @@ Refs: Refs: -- `codeclone/contracts.py:cli_help_epilog` -- `codeclone/ui_messages.py:MARKER_CONTRACT_ERROR` +- `codeclone/contracts/__init__.py:cli_help_epilog` +- `codeclone/ui_messages/__init__.py:MARKER_CONTRACT_ERROR` ## Locked by tests @@ -80,9 +82,4 @@ Refs: ## Non-guarantees -- Exact message body text may evolve; category marker and exit code are contract. - -## See also - -- [09-cli.md](09-cli.md) -- [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) +- Exact message body wording may evolve; marker category and exit code are contract. diff --git a/docs/book/04-config-and-defaults.md b/docs/book/04-config-and-defaults.md index c916b09..9365533 100644 --- a/docs/book/04-config-and-defaults.md +++ b/docs/book/04-config-and-defaults.md @@ -6,11 +6,13 @@ Describe effective runtime configuration and defaults that affect behavior. 
## Public surface -- CLI parser and defaults: `codeclone/_cli_args.py:build_parser` -- Pyproject config loader: `codeclone/_cli_config.py` -- Effective cache default path logic: `codeclone/cli.py:_resolve_cache_path` -- Metrics-mode selection logic: `codeclone/cli.py:_configure_metrics_mode` -- Debug mode sources: `codeclone/cli.py:_is_debug_enabled` +- Option specs/defaults: `codeclone/config/spec.py` +- CLI parser and defaults: `codeclone/config/argparse_builder.py:build_parser` +- Pyproject config loader: `codeclone/config/pyproject_loader.py:load_pyproject_config` +- Config resolver: `codeclone/config/resolver.py:resolve_config` +- Effective cache default path logic: `codeclone/surfaces/cli/runtime.py:_resolve_cache_path` +- Metrics-mode selection logic: `codeclone/surfaces/cli/runtime.py:_configure_metrics_mode` +- Debug mode sources: `codeclone/surfaces/cli/console.py:_is_debug_enabled` ## Data model @@ -51,7 +53,7 @@ Fragment-level admission thresholds (pyproject.toml only, advanced tuning): Example project-level config: -```toml +```toml title="Minimal [tool.codeclone] configuration" [tool.codeclone] min_loc = 10 min_stmt = 6 @@ -132,9 +134,22 @@ Report outputs and local UX: | `verbose` | `bool` | `false` | Enable more verbose CLI output | `-` | | `debug` | `bool` | `false` | Enable debug diagnostics | Also enabled by `CODECLONE_DEBUG=1` | -This is the exact accepted key set from `codeclone/_cli_config.py`; unknown +This is the exact accepted `[tool.codeclone]` key set from +`codeclone/config/spec.py` and `codeclone/config/pyproject_loader.py`; unknown keys are contract errors. +!!! note "Pyproject keys vs CLI flags" + The tables above list `[tool.codeclone]` keys, not CLI flag spellings. + CLI flags may map to the same internal destination under a different name. + Example: `coverage_xml` in `pyproject.toml` corresponds to CLI + `--coverage FILE`. The same pattern applies to report outputs such as + `html_out` ↔ `--html` and `json_out` ↔ `--json`. + +!!! warning "Metrics-mode conflicts are enforced" + Metrics update/gating flags are runtime contracts, not hints. Combinations + such as `skip_metrics=true` together with metrics gating or metrics + baseline update flags are contract errors. + Notes: - `skip_metrics=false*`: parser default is `false`, but runtime may auto-enable @@ -158,12 +173,22 @@ scan root when provided as relative paths. Current-run coverage join config: +- `coverage_xml` is the `[tool.codeclone]` key; the equivalent CLI flag is + `--coverage FILE`. - `coverage_xml` may be set in `pyproject.toml`; relative paths resolve from the scan root like other configured paths. - `coverage_min` and `fail_on_untested_hotspots` follow the same precedence rules as CLI flags. - Coverage join remains current-run only and does not persist to baseline. +Dependency depth config note: + +- `dependency_max_depth` is an observed metric in reports/baselines, not a + CLI or `pyproject.toml` option. +- Dependency depth now uses an internal adaptive profile based on + `avg_depth`, `p95_depth`, and `max_depth` for the internal module graph. +- There is no user-facing knob to tune that model in `2.0.0b6`. 
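+
+Putting the precedence and unknown-key rules above together, a minimal
+resolver sketch with illustrative keys and values (the shipped resolver and
+the accepted key set live in `codeclone/config/resolver.py` and
+`codeclone/config/spec.py`):
+
+```python title="CLI > pyproject > defaults resolver sketch (illustrative keys)"
+import sys
+import tomllib
+from pathlib import Path
+
+KNOWN_KEYS = {"min_loc", "min_stmt"}       # real set comes from config/spec.py
+DEFAULTS = {"min_loc": 10, "min_stmt": 6}  # illustrative values only
+
+
+def resolve_config(cli_overrides: dict, pyproject: Path) -> dict:
+    section = (
+        tomllib.loads(pyproject.read_text(encoding="utf-8"))
+        .get("tool", {})
+        .get("codeclone", {})
+    )
+    unknown = sorted(set(section) - KNOWN_KEYS)
+    if unknown:  # unknown [tool.codeclone] keys are contract errors
+        print(f"CONTRACT ERROR: unknown keys: {unknown}", file=sys.stderr)
+        raise SystemExit(2)
+    effective = {**DEFAULTS, **section}  # defaults, overridden by pyproject
+    effective.update(                    # CLI flags win last
+        {k: v for k, v in cli_overrides.items() if v is not None}
+    )
+    return effective
+```
+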
+ Metrics baseline path selection contract: - Relative `baseline` / `metrics_baseline` paths coming from defaults or @@ -178,9 +203,12 @@ Metrics baseline path selection contract: Refs: -- `codeclone/_cli_args.py:build_parser` -- `codeclone/cli.py:_main_impl` -- `codeclone/cli.py:_configure_metrics_mode` +- `codeclone/config/spec.py` +- `codeclone/config/argparse_builder.py:build_parser` +- `codeclone/config/pyproject_loader.py:load_pyproject_config` +- `codeclone/config/resolver.py:resolve_config` +- `codeclone/surfaces/cli/workflow.py:_main_impl` +- `codeclone/surfaces/cli/runtime.py:_configure_metrics_mode` ## Contracts @@ -192,7 +220,7 @@ Refs: Refs: -- `codeclone/cli.py:_main_impl` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Invariants (MUST) @@ -209,9 +237,10 @@ Refs: Refs: -- `codeclone/extractor.py:extract_units_and_stats_from_source` -- `codeclone/_cli_args.py:build_parser` -- `codeclone/cli.py:_configure_metrics_mode` +- `codeclone/analysis/units.py:extract_units_and_stats_from_source` +- `codeclone/config/spec.py` +- `codeclone/config/argparse_builder.py:build_parser` +- `codeclone/surfaces/cli/runtime.py:_configure_metrics_mode` ## Failure modes @@ -223,8 +252,9 @@ Refs: Refs: -- `codeclone/_cli_paths.py:_validate_output_path` -- `codeclone/cli.py:_main_impl` +- `codeclone/surfaces/cli/reports_output.py:_validate_output_path` +- `codeclone/surfaces/cli/startup.py:resolve_existing_root_path` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Determinism / canonicalization @@ -233,8 +263,8 @@ Refs: Refs: -- `codeclone/contracts.py:cli_help_epilog` -- `codeclone/ui_messages.py:SUMMARY_LABEL_FILES_FOUND` +- `codeclone/contracts/__init__.py:cli_help_epilog` +- `codeclone/ui_messages/__init__.py:SUMMARY_LABEL_FILES_FOUND` ## Locked by tests diff --git a/docs/book/05-core-pipeline.md b/docs/book/05-core-pipeline.md index 57c3a3c..903b505 100644 --- a/docs/book/05-core-pipeline.md +++ b/docs/book/05-core-pipeline.md @@ -2,139 +2,106 @@ ## Purpose -Describe the detection pipeline from file discovery to grouped clones. +Describe the runtime pipeline from file discovery to grouped clones, metrics, +report assembly, and gating. ## Public surface -Pipeline entrypoints: - -- Discovery stage: `codeclone/pipeline.py:discover` -- Per-file processing: `codeclone/pipeline.py:process_file` -- Extraction: `codeclone/extractor.py:extract_units_and_stats_from_source` -- Grouping: `codeclone/grouping.py` +- Discovery: `codeclone/core/discovery.py:discover` +- Per-file processing: `codeclone/core/worker.py:process_file` +- Extraction: `codeclone/analysis/units.py:extract_units_and_stats_from_source` +- Clone grouping: `codeclone/findings/clones/grouping.py` +- Project metrics and suggestions: `codeclone/core/pipeline.py` +- Report/gating integration: `codeclone/core/reporting.py:report`, + `codeclone/core/reporting.py:gate` ## Data model Stages: -1. Discover Python files (`iter_py_files`, sorted traversal) -2. Load from cache if `stat` signature matches -3. Process changed files: +1. Bootstrap runtime paths and config. +2. Discover Python files with deterministic traversal. +3. Load usable cache entries by stat signature and compatible analysis profile. +4. Process changed/missed files: - read source - - AST parse with limits - - extract units/blocks/segments -4. Build groups: + - parse AST with limits + - extract function, block, and segment units + - collect referenced names/qualnames and dead-code candidates +5. 
Build groups: - function groups by `fingerprint|loc_bucket` - block groups by `block_hash` - segment groups by `segment_sig` then `segment_hash|qualname` -5. Report-layer post-processing: - - merge block windows to maximal regions - - merge/suppress segment report groups - - optionally split out clone groups fully contained in configured - `golden_fixture_paths` -6. Structural report findings: - - duplicated branch families from per-function AST structure facts - - clone cohort drift families built from existing function groups (no rescan) -7. Metrics computation (full mode only): - - per-function cyclomatic complexity - - per-class coupling (CBO) and cohesion (LCOM4) - - dead-code analysis: declaration-only, qualname-based liveness - - dependency graph and cycle detection -8. Health scoring: - - seven dimension scores: clones, complexity, coupling, cohesion, - dead code, dependencies, coverage - - weighted blend → composite score (0–100) and grade (A–F) -9. Suggestion generation: - - advisory cards from clone groups, structural findings, metric violations - - deterministic priority sort, never gates CI -10. Current-run coverage join (optional): - - when `--coverage` is present, join external Cobertura XML to discovered - function spans - - invalid XML becomes `coverage_join.status="invalid"` for that run rather - than mutating baseline state -11. Design finding extraction: - - threshold-aware findings for complexity, coupling, cohesion - - coverage `coverage_hotspot` / `coverage_scope_gap` findings from valid - coverage-join rows only - - thresholds recorded in `meta.analysis_thresholds.design_findings` -12. Derived overview and hotlists: - - overview families, top risks, source breakdown, health snapshot - - directory hotspots by category (`derived.overview.directory_hotspots`) - - hotlists: most actionable, highest spread, production/test-fixture hotspots -13. Gate evaluation: - - clone-baseline diff (NEW vs KNOWN) - - metric threshold gates (`--fail-complexity`, `--fail-coupling`, etc.) - - metric regression gates (`--fail-on-new-metrics`) - - coverage hotspot gate (`--fail-on-untested-hotspots`) - - gate reasons emitted in deterministic order +6. Compute project metrics in full mode: + - complexity, coupling, cohesion + - dead code + - dependency graph and cycles + - health score + - adoption, API surface, optional coverage join +7. Build canonical report document and deterministic projections. +8. Evaluate clone diff and metric gates. Refs: -- `codeclone/pipeline.py` -- `codeclone/extractor.py:extract_units_and_stats_from_source` -- `codeclone/report/blocks.py:prepare_block_report_groups` -- `codeclone/report/segments.py:prepare_segment_report_groups` -- `codeclone/metrics/health.py:compute_health` -- `codeclone/metrics/coverage_join.py:build_coverage_join` -- `codeclone/report/json_contract.py:_build_design_groups` -- `codeclone/report/suggestions.py:generate_suggestions` -- `codeclone/report/overview.py:build_directory_hotspots` -- `codeclone/pipeline.py:metric_gate_reasons` +- `codeclone/core/bootstrap.py:bootstrap` +- `codeclone/core/discovery.py:discover` +- `codeclone/core/worker.py:process_file` +- `codeclone/analysis/units.py:extract_units_and_stats_from_source` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/report/gates/evaluator.py:metric_gate_reasons` +- `codeclone/core/reporting.py:gate` ## Contracts -- Detection core (`extractor`, `normalize`, `cfg`, `blocks`) computes clone candidates. 
+- Detection core computes facts; report layer materializes canonical findings from those facts. - Report-layer transformations do not change function/block grouping keys used for baseline diff. - Segment groups are report-only and do not participate in baseline diff/gating. - Structural findings are report-only and do not participate in baseline diff/gating. -- `golden_fixture_paths` is a project-level clone exclusion policy, not a - fingerprint/baseline rule: - - it applies only to clone groups fully contained in matching - `tests/` / `tests/fixtures/` paths - - excluded groups do not affect health, clone gates, or suggestions - - excluded groups remain observable as suppressed canonical report facts -- Dead-code liveness references from test paths are excluded at extraction/cache-load boundaries for both - local-name references and canonical qualname references. +- `golden_fixture_paths` is a clone-policy exclusion layer: + excluded groups remain visible as suppressed canonical report facts, but do + not affect health, gates, or suggestions. +- Test-path liveness references are filtered both on fresh extraction and on + cache decode. Refs: -- `codeclone/cli.py:_main_impl` (diff uses only function/block groups) -- `codeclone/baseline.py:Baseline.diff` -- `codeclone/extractor.py:extract_units_and_stats_from_source` -- `codeclone/pipeline.py:_load_cached_metrics` +- `codeclone/findings/clones/grouping.py:build_groups` +- `codeclone/report/document/_findings_groups.py:_build_clone_groups` +- `codeclone/findings/structural/detectors.py:normalize_structural_findings` +- `codeclone/core/discovery_cache.py:load_cached_metrics_extended` +- `codeclone/baseline/clone_baseline.py:Baseline.diff` ## Invariants (MUST) -- `Files found = Files analyzed + Cache hits + Files skipped` warning if broken. -- In gating mode, unreadable source IO (`source_read_error`) is a contract failure. -- Parser time/resource protections are applied in POSIX mode via `_parse_limits`. +- `files_found = files_analyzed + cache_hits + files_skipped`, or CLI warns explicitly. +- In gating mode, unreadable source IO is a contract failure. +- Parser time/resource protections are applied before AST extraction. Refs: -- `codeclone/_cli_summary.py:_print_summary` -- `codeclone/cli.py:_main_impl` -- `codeclone/extractor.py:_parse_limits` +- `codeclone/surfaces/cli/summary.py:_print_summary` +- `codeclone/surfaces/cli/workflow.py:_main_impl` +- `codeclone/analysis/parser.py:_parse_limits` ## Failure modes -| Condition | Behavior | -|----------------------------------|-----------------------------------------------------------------------------| -| File stat/read/encoding error | File skipped; tracked as failed file; source-read subset tracked separately | -| Source read error in gating mode | Contract error exit 2 | -| Parser timeout | `ParseError` returned through processing failure path | -| Unexpected per-file exception | Captured as `ProcessingResult(error_kind="unexpected_error")` | +| Condition | Behavior | +|----------------------------------|--------------------------------------------------| +| File stat/read/encoding error | File skipped; tracked as failed file | +| Source read error in gating mode | Contract error, exit `2` | +| Parser timeout | `ParseError` through processing failure path | +| Unexpected per-file exception | Captured as `unexpected_error` processing result | ## Determinism / canonicalization - File list is sorted. -- Group sorting in reports is deterministic by key and stable item sort. 
+- Group sorting is deterministic by stable tuple keys. +- Canonical report integrity is computed only from canonical sections. Refs: - `codeclone/scanner.py:iter_py_files` -- `codeclone/report/json_contract.py:_build_clone_groups` -- `codeclone/report/json_contract.py:_build_structural_groups` -- `codeclone/report/json_contract.py:_build_integrity_payload` +- `codeclone/findings/clones/grouping.py:build_groups` +- `codeclone/report/document/integrity.py:_build_integrity_payload` ## Locked by tests @@ -143,16 +110,8 @@ Refs: - `tests/test_cli_inprocess.py::test_cli_unreadable_source_fails_in_ci_with_contract_error` - `tests/test_extractor.py::test_parse_limits_triggers_timeout` - `tests/test_extractor.py::test_dead_code_marks_symbol_dead_when_referenced_only_by_tests` -- `tests/test_extractor.py::test_extract_collects_referenced_qualnames_for_import_aliases` - `tests/test_pipeline_metrics.py::test_load_cached_metrics_ignores_referenced_names_from_test_files` ## Non-guarantees -- Parallel scheduling order is not guaranteed; only final grouped output determinism is guaranteed. - -## See also - -- [08-report.md](08-report.md) -- [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) -- [16-dead-code-contract.md](16-dead-code-contract.md) -- [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) +- Parallel worker scheduling order is not guaranteed; only final output determinism is guaranteed. diff --git a/docs/book/06-baseline.md b/docs/book/06-baseline.md index b21c51e..d582a77 100644 --- a/docs/book/06-baseline.md +++ b/docs/book/06-baseline.md @@ -2,67 +2,58 @@ ## Purpose -Specify baseline schema v2.1, trust/compatibility checks, integrity hashing, and -runtime behavior. +Specify clone-baseline schema `2.1`, trust/compatibility checks, integrity +hashing, and runtime behavior. 
## Public surface -- Baseline object lifecycle: `codeclone/baseline.py:Baseline` -- Baseline statuses: `codeclone/baseline.py:BaselineStatus` -- Baseline status coercion: `codeclone/baseline.py:coerce_baseline_status` -- CLI integration: `codeclone/cli.py:_main_impl` +- Baseline object lifecycle: `codeclone/baseline/clone_baseline.py:Baseline` +- Baseline statuses: `codeclone/baseline/trust.py:BaselineStatus` +- Baseline status coercion: `codeclone/baseline/trust.py:coerce_baseline_status` +- CLI integration: `codeclone/surfaces/cli/baseline_state.py` ## Data model Canonical baseline shape: -- Required top-level keys: `meta`, `clones` -- Optional top-level keys: `metrics`, `api_surface` (unified baseline flow) +- required top-level keys: `meta`, `clones` +- optional top-level keys: `metrics`, `api_surface` (unified baseline flow) - `meta` required keys: `generator`, `schema_version`, `fingerprint_version`, `python_tag`, `created_at`, `payload_sha256` - `clones` required keys: `functions`, `blocks` -- `functions` and `blocks` are sorted/unique `list[str]` +- `functions` and `blocks` are sorted, unique `list[str]` Refs: -- `codeclone/baseline.py:_TOP_LEVEL_REQUIRED_KEYS` -- `codeclone/baseline.py:_TOP_LEVEL_OPTIONAL_KEYS` -- `codeclone/baseline.py:_META_REQUIRED_KEYS` -- `codeclone/baseline.py:_CLONES_REQUIRED_KEYS` -- `codeclone/baseline.py:_require_sorted_unique_ids` +- `codeclone/baseline/clone_baseline.py:_TOP_LEVEL_REQUIRED_KEYS` +- `codeclone/baseline/clone_baseline.py:_TOP_LEVEL_OPTIONAL_KEYS` +- `codeclone/baseline/clone_baseline.py:_META_REQUIRED_KEYS` +- `codeclone/baseline/clone_baseline.py:_CLONES_REQUIRED_KEYS` +- `codeclone/baseline/trust.py:_require_sorted_unique_ids` ## Contracts -Compatibility gates (`verify_compatibility`): +Compatibility gates: -- `generator == "codeclone"` -- `schema_version` major/minor must be supported by runtime +- `generator.name == "codeclone"` +- supported `schema_version` - `fingerprint_version == BASELINE_FINGERPRINT_VERSION` - `python_tag == current_python_tag()` - integrity verified via `payload_sha256` Current runtime policy: -- New clone baseline saves write schema `2.1`. -- Runtime still accepts `2.0` and `2.1` within baseline major `2`. - -Embedded metrics contract: - -- Top-level `metrics` is allowed only for baseline schema `>= 2.0`. -- Clone baseline save preserves existing embedded `metrics` payload, - optional `api_surface` payload, and the corresponding - `meta.metrics_payload_sha256` / `meta.api_surface_payload_sha256` values. -- Embedded `api_surface` snapshots use a compact wire format: each symbol stores - `local_name` relative to its containing `module`, and each module row stores - `filepath` relative to the baseline directory when possible. Runtime - reconstructs canonical full qualnames and runtime filepaths in memory before - diffing. -- The default runtime flow is unified: clone baseline and metrics baseline - usually share the same `codeclone.baseline.json` file unless the metrics path - is explicitly overridden. -- In unified rewrite mode, disabled optional metric surfaces are omitted from - the rewritten embedded payload instead of being preserved as stale baggage. 
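+
+A minimal sketch of the compatibility gates above; the fingerprint constant
+and error type are placeholders, and `payload_sha256` verification is a
+separate step (see the integrity payload below):
+
+```python title="Baseline compatibility gate sketch (placeholder constant/errors)"
+import sys
+
+SUPPORTED_SCHEMAS = {"1.0", "2.0", "2.1"}
+EXPECTED_FINGERPRINT = "<runtime constant>"  # owned by the contracts layer
+
+
+def current_python_tag() -> str:
+    """CPython-style tag such as "cp314" for the running interpreter."""
+    return f"cp{sys.version_info.major}{sys.version_info.minor}"
+
+
+def verify_compatibility(meta: dict) -> None:
+    if meta["generator"]["name"] != "codeclone":
+        raise ValueError("untrusted generator")
+    if meta["schema_version"] not in SUPPORTED_SCHEMAS:
+        raise ValueError(f"unsupported baseline schema {meta['schema_version']}")
+    if meta["fingerprint_version"] != EXPECTED_FINGERPRINT:
+        raise ValueError("fingerprint version mismatch")
+    if meta["python_tag"] != current_python_tag():
+        raise ValueError("python tag mismatch")
+```
+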
+- new clone baseline saves write schema `2.1` +- runtime accepts `1.0`, `2.0`, and `2.1` + +Unified-baseline contract: + +- top-level `metrics` is allowed only for baseline schema `>= 2.0` +- the default runtime flow is unified: clone and metrics comparison state both + live in `codeclone.baseline.json` unless `--metrics-baseline` is redirected +- unified rewrites preserve current embedded metric sections that remain enabled + and drop disabled optional sections instead of keeping stale baggage Integrity payload includes only: @@ -71,30 +62,23 @@ Integrity payload includes only: - `meta.fingerprint_version` - `meta.python_tag` -Integrity payload excludes: - -- `meta.schema_version` -- `meta.generator.*` -- `meta.created_at` - Refs: -- `codeclone/baseline.py:Baseline.verify_compatibility` -- `codeclone/baseline.py:_compute_payload_sha256` -- `codeclone/baseline.py:_preserve_embedded_metrics` +- `codeclone/baseline/clone_baseline.py:Baseline.verify_compatibility` +- `codeclone/baseline/trust.py:_compute_payload_sha256` +- `codeclone/baseline/metrics_baseline.py:MetricsBaseline.save` ## Invariants (MUST) -- Legacy top-level baselines (`functions`/`blocks` at root) are untrusted and - require regeneration. -- Baseline writes are atomic (`*.tmp` + `os.replace`, same filesystem). +- Legacy top-level baselines (`functions`/`blocks` at root) are untrusted and require regeneration. +- Baseline writes are atomic (`*.tmp` + `os.replace`). - Baseline diff is set-based and deterministic. Refs: -- `codeclone/baseline.py:_is_legacy_baseline_payload` -- `codeclone/baseline.py:_atomic_write_json` -- `codeclone/baseline.py:Baseline.diff` +- `codeclone/baseline/clone_baseline.py:_is_legacy_baseline_payload` +- `codeclone/baseline/clone_baseline.py:_atomic_write_json` +- `codeclone/baseline/clone_baseline.py:Baseline.diff` ## Failure modes @@ -113,26 +97,25 @@ Refs: CLI behavior: -- Normal mode: untrusted baseline is ignored, diff runs against empty baseline. -- Gating mode (`--ci` / `--fail-on-new`): untrusted baseline is contract error - (exit 2). +- normal mode: untrusted baseline is ignored and diff runs against empty baseline +- gating mode (`--ci` / `--fail-on-new`): untrusted baseline is a contract error Refs: -- `codeclone/baseline.py:BaselineStatus` -- `codeclone/cli.py:_main_impl` +- `codeclone/baseline/trust.py:BaselineStatus` +- `codeclone/surfaces/cli/baseline_state.py:resolve_clone_baseline_state` ## Determinism / canonicalization - Clone IDs are serialized sorted. -- Hash serialization uses canonical JSON (`sort_keys=True`, compact separators). -- `payload_sha256` uses `hmac.compare_digest` during verification. +- Hash serialization uses canonical JSON. +- Integrity verification uses constant-time comparison. Refs: -- `codeclone/baseline.py:_baseline_payload` -- `codeclone/baseline.py:_compute_payload_sha256` -- `codeclone/baseline.py:Baseline.verify_integrity` +- `codeclone/baseline/clone_baseline.py:_baseline_payload` +- `codeclone/baseline/trust.py:_compute_payload_sha256` +- `codeclone/baseline/clone_baseline.py:Baseline.verify_integrity` ## Locked by tests @@ -144,6 +127,5 @@ Refs: ## Non-guarantees -- Baseline generator version (`meta.generator.version`) is informational and not - a compatibility gate. +- `meta.generator.version` is informational and not a compatibility gate. - Baseline file indentation/style is not part of compatibility contract. 
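+
+Taken together, the atomic-write and integrity rules above reduce to roughly
+this stdlib-only sketch (illustrative; the shipped helpers live under
+`codeclone/baseline/`):
+
+```python title="Atomic write and integrity verification sketch (stdlib-only)"
+import hashlib
+import hmac
+import json
+import os
+from pathlib import Path
+
+
+def canonical_json(payload: object) -> bytes:
+    """Canonical form: sorted keys, compact separators."""
+    return json.dumps(payload, sort_keys=True, separators=(",", ":")).encode()
+
+
+def payload_sha256(payload: dict) -> str:
+    return hashlib.sha256(canonical_json(payload)).hexdigest()
+
+
+def atomic_write_json(path: Path, document: dict) -> None:
+    """Write a sibling *.tmp file, then os.replace onto the target path."""
+    tmp = path.with_name(path.name + ".tmp")
+    tmp.write_text(json.dumps(document, indent=2), encoding="utf-8")
+    os.replace(tmp, path)  # atomic when tmp and target share a filesystem
+
+
+def verify_integrity(stored_hash: str, payload: dict) -> bool:
+    """Constant-time comparison, per the determinism rules above."""
+    return hmac.compare_digest(stored_hash, payload_sha256(payload))
+```
+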
diff --git a/docs/book/07-cache.md b/docs/book/07-cache.md index 1e3e268..9cc8dca 100644 --- a/docs/book/07-cache.md +++ b/docs/book/07-cache.md @@ -2,90 +2,74 @@ ## Purpose -Define cache schema v2.5, integrity verification, and fail-open behavior. +Define cache schema `2.6`, integrity verification, stale-entry pruning, and +fail-open behavior. ## Public surface -- Cache object lifecycle: `codeclone/cache.py:Cache` -- Cache statuses: `codeclone/cache.py:CacheStatus` -- Stat signature source: `codeclone/cache.py:file_stat_signature` -- CLI cache integration: `codeclone/cli.py:_main_impl` +- Cache object lifecycle: `codeclone/cache/store.py:Cache` +- Cache statuses: `codeclone/cache/versioning.py:CacheStatus` +- Stat signature source: `codeclone/cache/store.py:file_stat_signature` +- Wire encode/decode: `codeclone/cache/_wire_encode.py`, + `codeclone/cache/_wire_decode.py` +- CLI/runtime integration: `codeclone/surfaces/cli/runtime.py`, + `codeclone/core/discovery.py` ## Data model -On-disk schema (`v == "2.5"`): +On-disk schema (`v == "2.6"`): -- Top-level: `v`, `payload`, `sig` +- top-level: `v`, `payload`, `sig` - `payload` keys: `py`, `fp`, `ap`, `files`, optional `sr` - `ap` (`analysis_profile`) keys: - - `min_loc`, `min_stmt` - - `block_min_loc`, `block_min_stmt` - - `segment_min_loc`, `segment_min_stmt` - - `collect_api_surface` -- `files` map stores compact per-file entries: - - `st`: `[mtime_ns, size]` - - `ss`: `[lines, functions, methods, classes]` (source stats snapshot) - - `u` (function units): compact row layout with structural facts: - `[qualname,start,end,loc,stmt_count,fingerprint,loc_bucket,cc,nesting,risk,raw_hash,entry_guard_count,entry_guard_terminal_profile,entry_guard_has_side_effect_before,terminal_kind,try_finally_profile,side_effect_order_profile]` - - optional analysis sections (`b`/`s` and metrics-related sections) - - `rn`: referenced local names (non-test files only) - - `rq`: referenced canonical qualnames (non-test files only) -- file keys are wire relpaths when `root` is configured -- optional `sr` (`segment report projection`) stores precomputed segment-report - merge/suppression output: - - `d`: digest of raw segment groups - - `s`: suppressed segment groups count - - `g`: grouped merged segment items (wire rows) -- per-file `dc` (`dead_candidates`) rows do not repeat filepath; path is implied by - the containing file entry + `min_loc`, `min_stmt`, `block_min_loc`, `block_min_stmt`, + `segment_min_loc`, `segment_min_stmt`, `collect_api_surface` +- `files` stores compact per-file entries with stat signature, extracted units, + optional metrics sections (including report-only `security_surfaces`), + referenced names/qualnames, and cached source stats +- `sr` stores optional segment-report projection payload Refs: -- `codeclone/cache.py:Cache.load` -- `codeclone/cache.py:_encode_wire_file_entry` -- `codeclone/cache.py:_decode_wire_file_entry` +- `codeclone/cache/store.py:Cache.load` +- `codeclone/cache/_wire_encode.py:_encode_wire_file_entry` +- `codeclone/cache/_wire_decode.py:_decode_wire_file_entry` ## Contracts - Cache is optimization-only; invalid cache never blocks analysis. -- Any cache trust failure triggers warning + empty cache fallback. -- Cached file entry without valid `ss` (`source_stats`) is treated as cache-miss for - processing counters and reprocessed. -- Cache compatibility gates: - - version `v == CACHE_VERSION` +- Any cache trust failure triggers warning + empty-cache fallback. 
+- Compatibility gates: + - `v == CACHE_VERSION` - `payload.py == current_python_tag()` - `payload.fp == BASELINE_FINGERPRINT_VERSION` - `payload.ap` matches the current analysis profile - (`min_loc`, `min_stmt`, `block_min_loc`, `block_min_stmt`, - `segment_min_loc`, `segment_min_stmt`, `collect_api_surface`) - - `sig` equals deterministic hash of canonical payload -- Cache schema must also be bumped when cached analysis semantics change in a - way that could leave syntactically valid but semantically stale per-file - entries accepted by runtime compatibility checks. + - `sig` matches deterministic hash of canonical payload +- Stale deleted-file entries are pruned on save/update; cache must reflect the + current worktree, not historical deleted modules. +- Cached entries without valid source stats are treated as cache-miss for + processing counters and reprocessed. Refs: -- `codeclone/cache.py:Cache.load` -- `codeclone/cache.py:Cache._ignore_cache` -- `codeclone/cache.py:Cache._sign_data` +- `codeclone/cache/store.py:Cache.load` +- `codeclone/cache/store.py:Cache._ignore_cache` +- `codeclone/cache/integrity.py:sign_cache_payload` +- `codeclone/core/discovery.py:discover` ## Invariants (MUST) -- Cache save writes canonical JSON and atomically replaces target file. -- Empty sections (`u`, `b`, `s`) are omitted from written wire entries. -- `rn`/`rq` are serialized as sorted unique arrays and omitted when empty. -- Cached public-API symbol payloads preserve declared parameter order; cache - canonicalization must not reorder callable signatures. -- `ss` is written when source stats are available and is required for full cache-hit - accounting in discovery stage. -- Legacy secret file `.cache_secret` is never used for trust; warning only. +- Cache save writes canonical JSON and atomically replaces the target file. +- Empty sections are omitted from wire entries. +- Referenced names/qualnames are serialized as sorted unique arrays and omitted when empty. +- Cached public-API symbol payloads preserve declared parameter order. +- Legacy `.cache_secret` is warning-only and never used for trust. Refs: -- `codeclone/cache.py:Cache.save` -- `codeclone/cache.py:_encode_wire_file_entry` -- `codeclone/pipeline.py:discover` -- `codeclone/cache.py:LEGACY_CACHE_SECRET_FILENAME` +- `codeclone/cache/store.py:Cache.save` +- `codeclone/cache/_wire_encode.py:_encode_wire_file_entry` +- `codeclone/cache/versioning.py:LEGACY_CACHE_SECRET_FILENAME` ## Failure modes @@ -106,25 +90,21 @@ CLI behavior: cache failures do not change exit code; analysis continues without Refs: -- `codeclone/cache.py:CacheStatus` -- `codeclone/cli.py:_main_impl` +- `codeclone/cache/versioning.py:CacheStatus` +- `codeclone/surfaces/cli/runtime.py:resolve_cache_status` ## Determinism / canonicalization - Cache signatures are computed over canonical JSON payload. -- Wire file paths and row arrays are sorted before write. -- `rn`/`rq` are deterministically normalized to sorted unique arrays. -- Current schema decodes only the canonical row shapes that current runtime writes; - for `u` rows, decoder accepts legacy 11-column layout and canonical 17-column - layout (missing structural columns default to neutral values). -- `sr` is additive and optional; invalid/missing projection never invalidates the - cache and simply falls back to runtime recomputation. +- Wire file paths and compact row arrays are sorted before write. +- Optional segment-report projection is additive; invalid/missing projection + falls back to runtime recomputation. 
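+
+A fail-open load sketch over those gates; the signing construction here is
+illustrative (the real one lives in `codeclone/cache/integrity.py`):
+
+```python title="Fail-open cache load sketch (illustrative signing)"
+import hashlib
+import hmac
+import json
+import sys
+from pathlib import Path
+
+CACHE_VERSION = "2.6"
+
+
+def _sign(payload: dict) -> str:
+    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
+    return hashlib.sha256(canonical.encode()).hexdigest()
+
+
+def load_cache(path: Path, python_tag: str, fingerprint: str, profile: dict) -> dict:
+    """Any trust failure warns and falls back to an empty cache."""
+    try:
+        document = json.loads(path.read_bytes())
+        payload = document["payload"]
+        trusted = (
+            document["v"] == CACHE_VERSION
+            and payload["py"] == python_tag
+            and payload["fp"] == fingerprint
+            and payload["ap"] == profile
+            and hmac.compare_digest(document["sig"], _sign(payload))
+        )
+        if not trusted:
+            raise ValueError("incompatible or tampered cache")
+        return payload["files"]
+    except (OSError, TypeError, ValueError, KeyError) as exc:
+        print(f"warning: ignoring cache ({exc}); analysis continues", file=sys.stderr)
+        return {}  # optimization-only: a bad cache never blocks analysis
+```
+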
Refs: -- `codeclone/cache.py:_canonical_json` -- `codeclone/cache.py:_wire_filepath_from_runtime` -- `codeclone/cache.py:_encode_wire_file_entry` +- `codeclone/cache/integrity.py:canonical_json` +- `codeclone/cache/projection.py:wire_filepath_from_runtime` +- `codeclone/cache/_wire_encode.py:_encode_wire_file_entry` ## Locked by tests @@ -135,9 +115,9 @@ Refs: - `tests/test_cache.py::test_cache_too_large_warns` - `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag` - `tests/test_cli_inprocess.py::test_cli_cache_analysis_profile_compatibility` -- `tests/test_pipeline_metrics.py::test_load_cached_metrics_ignores_referenced_names_from_test_files` +- `tests/test_core_branch_coverage.py::test_discover_prunes_deleted_cache_entries` ## Non-guarantees - Cache file content stability across schema bumps is not guaranteed. -- Cache payload is tamper-evident only; it is not secret-authenticated. +- Cache is tamper-evident only; it is not an authenticated secret store. diff --git a/docs/book/08-report.md b/docs/book/08-report.md index 2dbc6a0..82b1cf5 100644 --- a/docs/book/08-report.md +++ b/docs/book/08-report.md @@ -2,21 +2,27 @@ ## Purpose -Define report contracts in `2.0.0b5`: canonical JSON (`report_schema_version=2.8`) -plus deterministic TXT/Markdown/SARIF projections. +Define the canonical report contract in `2.0.0b6`: report schema `2.10` plus +deterministic text/Markdown/SARIF/HTML projections. ## Public surface -- Canonical report builder: `codeclone/report/json_contract.py:build_report_document` -- JSON/TXT renderers: `codeclone/report/serialize.py` -- Markdown renderer: `codeclone/report/markdown.py` -- SARIF renderer: `codeclone/report/sarif.py` -- HTML renderer: `codeclone/html_report.py:build_html_report` -- Shared metadata source: `codeclone/_cli_meta.py:_build_report_meta` +- Canonical report builder: `codeclone/report/document/builder.py:build_report_document` +- Canonical inventory/integrity helpers: + `codeclone/report/document/inventory.py`, + `codeclone/report/document/integrity.py` +- Text renderer: `codeclone/report/renderers/text.py:render_text_report_document` +- Markdown renderer: + `codeclone/report/renderers/markdown.py:render_markdown_report_document` +- SARIF renderer: + `codeclone/report/renderers/sarif.py:render_sarif_report_document` +- HTML renderer: `codeclone/report/html/assemble.py:build_html_report` +- Shared CLI report meta: + `codeclone/surfaces/cli/report_meta.py:_build_report_meta` ## Data model -JSON report top-level (v2.8): +Canonical top-level sections: - `report_schema_version` - `meta` @@ -26,190 +32,89 @@ JSON report top-level (v2.8): - `derived` - `integrity` -Canonical provenance additions: - -- `meta.analysis_profile` records the effective runtime clone, block, and - segment thresholds for that run (`min_loc`, `min_stmt`, `block_*`, - `segment_*`). -- `meta.analysis_thresholds.design_findings` records the effective report-level - thresholds used to materialize canonical design findings for that run - (`complexity > N`, `coupling > N`, `cohesion >= N`). 
- -Canonical report-only metrics additions: - -- `metrics.families.overloaded_modules` records project-relative module hotspot - profiles and candidate classification for `Overloaded Modules` -- `metrics.families.coverage_adoption` records parameter coverage, return - coverage, public docstring coverage, and `Any` usage counts, plus compact - baseline deltas when a trusted metrics baseline is available -- `metrics.families.api_surface` records the current public symbol inventory - and compact baseline diff facts (`added`, `breaking`) when - `--api-surface` is enabled -- `metrics.families.coverage_join` records an optional current-run join between - external Cobertura line coverage and CodeClone function spans. Its summary - carries `status`, `source`, unit/line counts, `overall_permille`, - `missing_from_report_units`, `coverage_hotspots`, `scope_gap_hotspots`, - `hotspot_threshold_percent`, and optional `invalid_reason`; the same compact - summary is mirrored in `metrics.summary.coverage_join`; its items carry - per-function joined coverage facts, including `coverage_status`, - `coverage_hotspot`, and `scope_gap_hotspot`. -- coverage join facts are canonical report truth for that run, but they are - **not** baseline truth and do not update `codeclone.baseline.json` -- adoption/API/coverage-join metrics do **not** participate in clone baseline - NEW/KNOWN semantics; coverage join also does not participate in health scoring - and gates only when explicitly requested -- `Overloaded Modules` is a report-only experimental layer rather than a second - complexity metric: - - complexity reports local control-flow hotspots in functions and methods - - `Overloaded Modules` reports module-level responsibility overload and dependency - pressure - - the layer may later become scoring only after validation and explicit - health-model documentation updates - -Coverage/API role split: - -- `coverage_adoption` is a canonical metrics family, not a style linter. It - reports observable adoption facts only. -- `coverage_join` is a canonical current-run signal over an external Cobertura - XML file. It reports joined line facts and may materialize - `design` findings with `category="coverage"` and kinds - `coverage_hotspot` (measured below threshold) or `coverage_scope_gap` - (outside the supplied coverage scope); it does not infer branch coverage or - execute tests. -- `api_surface` is a canonical metrics/gating family, not a second finding - engine. It reports public API inventory plus baseline-diff facts when the - run opted into API collection. 
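+
+To make the top-level shape concrete, a minimal sketch as a Python literal
+(placeholder bodies only; just the key set mirrors the contract):
+
+```python
+report_document = {
+    "report_schema_version": "2.10",
+    "meta": {...},        # provenance, analysis profile, effective thresholds
+    "inventory": {...},   # sorted file registry
+    "findings": {...},    # clones / structural / dead_code / design groups
+    "metrics": {...},     # canonical metric families
+    "derived": {...},     # projections: suggestions, overview, hotlists
+    "integrity": {...},   # canonicalization metadata + digest
+}
+```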
- -Canonical vs non-canonical split: - -- Canonical: `report_schema_version`, `meta`, `inventory`, `findings`, `metrics` -- Non-canonical projection layer: `derived` -- Integrity metadata: `integrity` (`canonicalization` + `digest`) - -Derived projection layer: - -- `derived.suggestions[*]` — action-surplus projection cards keyed back to - canonical findings via `finding_id` -- `derived.overview` — summary-only overview facts: - - `families` - - `top_risks` - - `source_scope_breakdown` - - `health_snapshot` - - `directory_hotspots` -- `derived.hotlists` — deterministic lists of canonical finding IDs: - - `most_actionable_ids` - - `highest_spread_ids` - - `production_hotspot_ids` - - `test_fixture_hotspot_ids` - -Finding families: +Canonical section roles: + +- `meta`, `inventory`, `findings`, `metrics` are canonical truth +- `derived` is a deterministic projection layer +- `integrity` carries canonicalization metadata and digest + +Current canonical report-only metric families include: + +- `health` +- `dead_code` +- `dependencies` +- `coverage_adoption` +- `api_surface` +- `coverage_join` +- `overloaded_modules` +- `security_surfaces` + +Dependency depth facts in the canonical report now include: + +- `avg_depth` +- `p95_depth` +- `max_depth` + +These describe the internal module dependency graph. They are report facts, not +user-facing config knobs. + +Current finding families include: - `findings.groups.clones.{functions,blocks,segments}` -- optional `findings.groups.clones.suppressed.{functions,blocks,segments}` for - clone groups excluded by project policy such as `golden_fixture_paths` +- optional `findings.groups.clones.suppressed.*` - `findings.groups.structural.groups` - `findings.groups.dead_code.groups` - `findings.groups.design.groups` -- `findings.summary.suppressed.dead_code` (suppressed counter, non-active findings) -- optional `findings.summary.suppressed.clones` plus clone-summary suppressed - counters when clone groups were excluded from active findings - -Important role split: - -- Findings explain what was detected. -- Suggestions exist only when they add action structure on top of a finding - (next step, prioritization, effort/risk framing, grouped remediation, or - review relevance). -- Low-signal local structural info hints may remain findings-only and not - appear as separate suggestion cards. -Structural finding kinds currently emitted by core/report pipeline: +Refs: -- `duplicated_branches` -- `clone_guard_exit_divergence` -- `clone_cohort_drift` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/report/document/_common.py:_design_findings_thresholds_payload` +- `codeclone/report/document/_findings_groups.py:_build_clone_groups` +- `codeclone/report/document/_findings_groups.py:_build_structural_groups` -Per-group common axes (family-specific fields may extend): +## Contracts -- identity: `id`, `family`, `category`, `kind` -- assessment: `severity`, `confidence`, `priority` -- scope: `source_scope` (`dominant_kind`, `breakdown`, `impact_scope`) -- spread: `spread.files`, `spread.functions` -- evidence: `items`, `facts` (+ optional `display_facts`) +- JSON is the source of truth for report semantics. +- Markdown, text, SARIF, HTML, and MCP projections must read canonical report facts rather than recompute them. +- `derived` does not replace canonical findings/metrics. +- Design findings are built once in the canonical report using + `meta.analysis_thresholds.design_findings`; consumers must not synthesize them post-hoc. 
+- Coverage Join is canonical current-run truth for that run, but not baseline truth. +- `security_surfaces` is a report-only exact inventory of security-relevant + capabilities and trust-boundary code. It does not claim vulnerabilities or + exploitability. +- Clone groups excluded by project policy are carried only under suppressed clone buckets and do not affect active + findings, health, clone gating, or suggestions. -## Contracts +Refs: -- JSON is source of truth for report semantics. -- Markdown and SARIF are deterministic projections from the same report document. -- MCP summary/finding/hotlist/report-section queries are deterministic views over - the same canonical report document. -- SARIF is an IDE/code-scanning-oriented projection: - - repo-relative result paths are anchored via `%SRCROOT%` - - referenced files are listed under `run.artifacts` - - clone results carry `baselineState` when clone novelty is known -- Derived layer (`suggestions`, `overview`, `hotlists`) does not replace canonical - findings/metrics. -- Design findings are built once in the canonical report using the effective - threshold policy recorded in `meta.analysis_thresholds.design_findings`; MCP - and HTML must not re-synthesize them post-hoc from raw metric rows. -- Coverage design findings are built from canonical `coverage_join` rows only - when a valid join is present. Invalid coverage input is represented as - `metrics.families.coverage_join.summary.status="invalid"` with no hotspot - item rows. -- HTML overview cards are materialized from canonical findings plus - `derived.overview` + `derived.hotlists`; pre-expanded overview card payloads are - not part of the report contract. -- `derived.overview.directory_hotspots` is a deterministic report-layer - aggregation over canonical findings; HTML must render it as-is or omit it on - compatibility paths without a canonical report document. -- `derived.overview.health_snapshot` is a projection over canonical - `metrics.families.health.summary`; it summarizes the current score but does - not define a second health model. -- `derived.overview.directory_hotspots[*].path` is an overview-oriented - directory key: runtime findings keep their parent directory, while test-only - and fixture-only findings collapse to the corresponding source-scope roots - (`.../tests` or `.../tests/fixtures`) to avoid duplicating the same hotspot - across leaf fixture paths. -- Overview hotspot/source-breakdown sections must resolve from canonical report - data or deterministic derived IDs; HTML must not silently substitute stale - placeholders such as `n/a` or empty-state cards when canonical data exists. -- `analysis_started_at_utc` and `report_generated_at_utc` are carried in - `meta.runtime`; renderers/projections may use them for provenance but must not - reinterpret them as semantic analysis data. -- Canonical `meta.scan_root` is normalized to `"."`; absolute runtime paths are - exposed under `meta.runtime.*_absolute`. -- `clone_type` and `novelty` are group-level properties inside clone groups. -- Cohort-drift structural families are report-only and must not affect baseline diff - or CI gating decisions. -- Dead-code suppressed candidates are carried only under metrics - (`metrics.families.dead_code.suppressed_items`) and never promoted to - active `findings.groups.dead_code`. 
-- Clone groups excluded by `golden_fixture_paths` are carried only under - `findings.groups.clones.suppressed.*`; they do not contribute to active - findings totals, health scoring, clone gating, or suggestion generation. -- A lower score after upgrade may reflect a broader health model, not only - worse code. Report renderers may surface the score, but health-model - expansion is documented separately in [15-health-score.md](15-health-score.md) - and compatibility notes. +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/report/derived.py:_health_snapshot` +- `codeclone/report/overview.py:materialize_report_overview` +- `codeclone/report/suggestions.py:generate_suggestions` ## Invariants (MUST) -- Stable ordering for groups/items/suggestions/hotlists. -- Stable ordering for SARIF rules, artifacts, and results. +- Stable ordering for groups, items, suggestions, and hotlists. - `derived.suggestions[*].finding_id` references existing canonical finding IDs. - `derived.hotlists.*_ids` reference existing canonical finding IDs. -- SARIF `artifacts[*]` and `locations[*].artifactLocation.index` stay aligned. -- `integrity.digest` is computed from canonical sections only (derived excluded). -- `source_scope.impact_scope` is explicit and deterministic (`runtime`, - `non_runtime`, `mixed`). +- SARIF artifacts, rules, and locations stay index-aligned. +- `integrity.digest` is computed from canonical sections only; `derived` is excluded. + +Refs: + +- `codeclone/report/document/integrity.py:_build_integrity_payload` +- `codeclone/report/document/inventory.py:_build_inventory_payload` +- `codeclone/report/renderers/sarif.py:render_sarif_report_document` ## Failure modes -| Condition | Behavior | -|---------------------------------|------------------------------------------------| -| Missing optional UI/meta fields | Renderer falls back to empty/`(none)` display | -| Untrusted baseline | Clone novelty resolves to `new` for all groups | -| Missing snippet source in HTML | Safe fallback snippet block | +| Condition | Behavior | +|---------------------------------|--------------------------------------------------------| +| Missing optional UI/meta fields | Renderer falls back to empty or `(none)`-style display | +| Untrusted baseline | Clone novelty resolves as current-run only | +| Missing source snippet in HTML | Safe fallback snippet block | ## Determinism / canonicalization @@ -219,9 +124,9 @@ Per-group common axes (family-specific fields may extend): Refs: -- `codeclone/report/json_contract.py:_build_integrity_payload` -- `codeclone/report/json_contract.py:_build_inventory_payload` -- `codeclone/structural_findings.py:normalize_structural_findings` +- `codeclone/report/document/integrity.py:_build_integrity_payload` +- `codeclone/report/document/inventory.py:_build_inventory_payload` +- `codeclone/findings/structural/detectors.py:normalize_structural_findings` ## Locked by tests @@ -231,22 +136,8 @@ Refs: - `tests/test_report_contract_coverage.py::test_report_document_rich_invariants_and_renderers` - `tests/test_report_contract_coverage.py::test_markdown_and_sarif_reuse_prebuilt_report_document` - `tests/test_report_branch_invariants.py::test_overview_and_sarif_branch_invariants` -- `tests/test_report.py::test_json_includes_clone_guard_exit_divergence_structural_group` -- `tests/test_report.py::test_json_includes_clone_cohort_drift_structural_group` -- `tests/test_report.py::test_report_json_dead_code_suppressed_items_are_reported_separately` ## Non-guarantees -- 
Human-readable wording in `derived` or HTML may evolve without schema bump. -- CSS/layout changes are not part of JSON contract. - -## See also - -- [07-cache.md](07-cache.md) -- [09-cli.md](09-cli.md) -- [10-html-render.md](10-html-render.md) -- [15-health-score.md](15-health-score.md) -- [20-mcp-interface.md](20-mcp-interface.md) -- [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) -- [../sarif.md](../sarif.md) -- [../examples/report.md](../examples/report.md) +- Human-facing wording in `derived` or HTML may evolve without a schema bump. +- CSS/layout changes are not part of the canonical report contract. diff --git a/docs/book/09-cli.md b/docs/book/09-cli.md index 644701b..9c8c517 100644 --- a/docs/book/09-cli.md +++ b/docs/book/09-cli.md @@ -2,189 +2,121 @@ ## Purpose -Define observable CLI behavior: argument handling, summaries, error UI, and output writing. +Define observable CLI behavior: argument handling, summaries, output writing, +and exit routing. + +!!! note "Observable surface only" + This chapter covers scripting-visible behavior and user-facing CLI output + categories. Rich styling details may evolve as long as markers, exit + semantics, and deterministic output contracts stay stable. ## Public surface -- CLI runner: `codeclone/cli.py:main`, `codeclone/cli.py:_main_impl` -- Parser: `codeclone/_cli_args.py:build_parser` -- Summary renderer: `codeclone/_cli_summary.py:_print_summary` -- Path validation: `codeclone/_cli_paths.py:_validate_output_path` -- Message catalog: `codeclone/ui_messages.py` +- Public entrypoint: `codeclone/main.py:main` +- CLI orchestration: `codeclone/surfaces/cli/workflow.py:_main_impl` +- Parser: `codeclone/config/argparse_builder.py:build_parser` +- Summary renderer: `codeclone/surfaces/cli/summary.py:_print_summary` +- Output path validation and writes: + `codeclone/surfaces/cli/reports_output.py` +- Message catalog: `codeclone/ui_messages/__init__.py` ## Data model CLI modes: -- Normal mode -- Gating mode (`--ci`, `--fail-on-new`, `--fail-threshold>=0`) -- Update mode (`--update-baseline`) +- normal mode +- gating mode (`--ci`, `--fail-on-new`, explicit metric gates) +- baseline update mode (`--update-baseline`, `--update-metrics-baseline`) -Summary metrics: +Summary metrics include: - files found/analyzed/cache hits/skipped -- structural counters: analyzed lines/functions/methods/classes -- function/block/segment groups -- excluded golden-fixture clone groups (when configured) -- suppressed segment groups -- dead-code active/suppressed status in metrics line -- adoption coverage in the normal `Metrics` block: - parameter typing, return typing, public docstrings, and `Any` count -- public API surface in the normal `Metrics` block when `api_surface` was - collected: symbol/module counts plus added/breaking deltas when a trusted - metrics baseline is available -- coverage join in the normal `Metrics` block when `--coverage FILE` was - provided: joined Cobertura overall line coverage, untested hotspot count, and - threshold/source context +- structural counters for lines/functions/methods/classes +- function/block/segment clone groups +- suppressed clone groups from `golden_fixture_paths` +- dead-code active/suppressed status +- dependency depth profile (`avg_depth`, `p95_depth`, `max_depth`) when metrics are computed +- adoption/API/coverage-join facts when computed - new vs baseline -Metrics-related CLI gates: - -- threshold gates: - `--fail-complexity`, `--fail-coupling`, `--fail-cohesion`, `--fail-health` -- coverage 
threshold gates: - `--min-typing-coverage`, `--min-docstring-coverage` -- baseline-aware delta gates: - `--fail-on-new-metrics`, - `--fail-on-typing-regression`, - `--fail-on-docstring-regression`, - `--fail-on-api-break` -- external coverage join gate: - `--coverage FILE`, `--coverage-min PERCENT`, - `--fail-on-untested-hotspots` -- update mode: - `--update-metrics-baseline` -- opt-in metrics family: - `--api-surface` -- In unified baseline mode, `--update-baseline` rewrites embedded metric - surfaces from the current enabled config; disabled optional surfaces are - dropped. - Refs: -- `codeclone/_cli_summary.py:_print_summary` -- `codeclone/ui_messages.py:fmt_summary_files` +- `codeclone/surfaces/cli/summary.py:_print_summary` +- `codeclone/surfaces/cli/runtime.py:_metrics_flags_requested` +- `codeclone/surfaces/cli/runtime.py:_metrics_computed` +- `codeclone/surfaces/cli/report_meta.py:_build_report_meta` ## Contracts - Help output includes canonical exit-code section and project links. -- Reporting flag UX uses explicit pairs (`--no-progress`/`--progress`, - `--no-color`/`--color`) and avoids generated double-negation aliases. -- `--open-html-report` is a local UX action layered on top of `--html`; it does not implicitly enable HTML output. -- `--timestamped-report-paths` only rewrites default report paths requested via bare report flags; explicit FILE values - stay unchanged. -- Changed-scope clone review uses: +- Bare report flags write to deterministic default paths under `.cache/codeclone/`. +- `--open-html-report` is layered on top of `--html`; it does not imply HTML output. +- `--timestamped-report-paths` rewrites only default report paths requested via bare flags. +- In interactive VS Code terminals, the CLI may print a one-time extension hint + after summary output. The hint is suppressed in `--quiet`, CI, and non-TTY + contexts, and is tracked per CodeClone version next to the resolved project + cache path. +- Changed-scope review uses: - `--changed-only` - - `--diff-against GIT_REF` - - `--paths-from-git-diff GIT_REF` - Typical usage: - - `codeclone . --changed-only --diff-against main` - - `codeclone . --paths-from-git-diff HEAD~1` -- Contract errors are prefixed by `CONTRACT ERROR:`. -- Gating failures are prefixed by `GATING FAILURE:`. -- Internal errors use `fmt_internal_error` with optional debug details. -- Runtime footer uses explicit wording: `Pipeline done in s`. - This metric is CLI pipeline time and does not include external launcher/startup overhead (for example `uv run`). -- Dead-code metric line is stateful and deterministic: - - `N found (M suppressed)` when active dead-code items exist - - `✔ clean` when both active and suppressed are zero - - `✔ clean (M suppressed)` when active is zero but suppressed > 0 -- The normal rich `Metrics` block includes: - - `Adoption` when adoption coverage facts were computed - - `Public API` when `api_surface` facts were computed - - `Coverage` when Cobertura coverage was joined with `--coverage` -- Quiet compact metrics output stays on the existing fixed one-line summary and - does not expand adoption/API/coverage-join detail. -- When `golden_fixture_paths` excludes clone groups from active review, CLI - keeps that count inside the `Clones` summary line (`fixtures=N`) instead of - adding a separate summary row. -- Typing/docstring adoption metrics are computed in full mode. 
-- `--api-surface` is opt-in in normal runs, but runtime auto-enables it when - `--fail-on-api-break` or `--update-metrics-baseline` needs a public API - snapshot. -- `--fail-on-typing-regression` / `--fail-on-docstring-regression` require a - metrics baseline that already contains adoption coverage data. -- `--fail-on-api-break` requires a metrics baseline that already contains - `api_surface` data. -- `--coverage` is a current-run external Cobertura input. It does not update or - compare against `codeclone.baseline.json`. -- Relative clone-baseline and metrics-baseline paths from defaults or - `pyproject.toml` resolve from the analysis root. Explicit CLI paths are used - as provided. -- Invalid Cobertura XML is warning-only in normal runs: CLI prints - `Coverage join ignored`, keeps exit `0`, and shows `Coverage` as unavailable - in the normal `Metrics` block. It becomes a contract error only when - `--fail-on-untested-hotspots` requires a valid join. -- `--fail-on-untested-hotspots` requires `--coverage` and a valid Cobertura XML - input. It exits `3` when medium/high-risk functions measured by Coverage Join - fall below `--coverage-min` (default `50`). Functions outside the supplied - `coverage.xml` scope are surfaced separately and do not trigger this gate. - The flag name is retained for CLI compatibility. + - `--diff-against` + - `--paths-from-git-diff` +- Contract errors use `CONTRACT ERROR:`. +- Gating failures use `GATING FAILURE:`. +- Internal errors use `fmt_internal_error` and include traceback only in debug mode. Refs: -- `codeclone/contracts.py:cli_help_epilog` -- `codeclone/ui_messages.py:fmt_contract_error` -- `codeclone/ui_messages.py:fmt_internal_error` +- `codeclone/contracts/__init__.py:cli_help_epilog` +- `codeclone/ui_messages/__init__.py:fmt_contract_error` +- `codeclone/ui_messages/__init__.py:fmt_internal_error` +- `codeclone/surfaces/cli/changed_scope.py:_validate_changed_scope_args` ## Invariants (MUST) -- Report writes (`--html/--json/--md/--sarif/--text`) are path-validated and write failures are contract errors. -- Bare reporting flags write to default deterministic paths under - `.cache/codeclone/`. -- `--open-html-report` requires `--html`; invalid combination is a contract error. -- `--timestamped-report-paths` requires at least one requested report output; invalid combination is a contract error. -- `--changed-only` requires either `--diff-against` or `--paths-from-git-diff`. -- `--diff-against` requires `--changed-only`. -- `--diff-against` and `--paths-from-git-diff` are mutually exclusive. -- Git diff refs are validated as safe single revision expressions before - subprocess execution. -- Browser-open failure after a successful HTML write is warning-only and does not change the process exit code. -- Baseline update write failure is contract error. -- In gating mode, unreadable source files are contract errors with higher priority than clone gating failure. -- Changed-scope flags do not create a second canonical report: they project clone - summary/threshold decisions over the changed-files subset after the normal full - analysis completes. +- Report writes are path-validated and write failures are contract errors. +- `--open-html-report` requires `--html`. +- `--timestamped-report-paths` requires at least one requested report output. +- `--changed-only` requires a diff source. +- Browser-open failure after successful HTML write is warning-only. 
+- In gating mode, unreadable source files are contract errors with higher priority than clone/metric gate failures. Refs: -- `codeclone/cli.py:_write_report_output` -- `codeclone/cli.py:_main_impl` +- `codeclone/surfaces/cli/reports_output.py:_validate_output_path` +- `codeclone/surfaces/cli/reports_output.py:_validate_report_ui_flags` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Failure modes -| Condition | User-facing category | Exit | -|---------------------------------------------------------------------------|----------------------|------| -| Invalid CLI flag | contract | 2 | -| Invalid output extension/path | contract | 2 | -| `--open-html-report` without `--html` | contract | 2 | -| `--timestamped-report-paths` without reports | contract | 2 | -| `--changed-only` without diff source | contract | 2 | -| `--diff-against` without `--changed-only` | contract | 2 | -| `--diff-against` + `--paths-from-git-diff` | contract | 2 | -| Baseline untrusted in CI/gating | contract | 2 | -| Coverage/API regression gate without required metrics-baseline capability | contract | 2 | -| `--fail-on-untested-hotspots` without `--coverage` | contract | 2 | -| Invalid Cobertura XML without hotspot gating | warning only | 0 | -| Invalid Cobertura XML for coverage hotspot gating | contract | 2 | -| Unreadable source in CI/gating | contract | 2 | -| New clones with `--fail-on-new` | gating | 3 | -| Threshold exceeded | gating | 3 | -| Coverage hotspots with `--fail-on-untested-hotspots` | gating | 3 | -| Unexpected exception | internal | 5 | +!!! warning "Failure precedence" + Contract failures take precedence over gating failures. In CI and scripted + flows, invalid config or unreadable sources must surface as exit `2` before + any clone or metrics gate can fail with exit `3`. + +| Condition | User-facing category | Exit | +|-------------------------------------------------------------------|----------------------|------| +| Invalid CLI flag | contract | `2` | +| Invalid output extension/path | contract | `2` | +| Invalid changed-scope flag combination | contract | `2` | +| Baseline untrusted in CI/gating | contract | `2` | +| Coverage/API regression gate without required baseline capability | contract | `2` | +| Unreadable source in CI/gating | contract | `2` | +| New clones with `--fail-on-new` | gating | `3` | +| Threshold or metrics gate exceeded | gating | `3` | +| Unexpected exception | internal | `5` | ## Determinism / canonicalization - Summary metric ordering is fixed. -- Compact summary mode (`--quiet`) is fixed-format text. +- Compact summary mode is fixed-format text. - Help epilog is generated from static constants. -- `git diff --name-only` input is normalized to sorted repo-relative paths before - changed-scope projection is applied. +- Git diff path inputs are normalized to sorted repo-relative paths. Refs: -- `codeclone/_cli_summary.py:_print_summary` -- `codeclone/contracts.py:EXIT_CODE_DESCRIPTIONS` +- `codeclone/surfaces/cli/summary.py:_print_summary` +- `codeclone/contracts/__init__.py:cli_help_epilog` +- `codeclone/surfaces/cli/changed_scope.py:_normalize_changed_paths` ## Locked by tests @@ -196,12 +128,5 @@ Refs: ## Non-guarantees -- Rich styling details are not part of machine-facing CLI contract. +- Rich styling details are not machine-facing contract. - Warning phrasing may evolve if category markers and exit semantics stay stable. 
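+
+For scripts that consume the exit contract above, a minimal wrapper sketch
+(assumes `codeclone` is available on `PATH`; the messages are illustrative):
+
+```python
+import subprocess
+import sys
+
+# Exit categories from this chapter: 0 ok, 2 contract, 3 gating, 5 internal.
+LABELS = {2: "contract error", 3: "gating failure", 5: "internal error"}
+
+result = subprocess.run(["codeclone", ".", "--ci"], check=False)
+label = LABELS.get(result.returncode)
+if label is not None:
+    print(f"codeclone: {label} (exit {result.returncode})", file=sys.stderr)
+sys.exit(result.returncode)  # propagate the category to the CI job
+```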
- -## See also - -- [04-config-and-defaults.md](04-config-and-defaults.md) -- [20-mcp-interface.md](20-mcp-interface.md) -- [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) -- [16-dead-code-contract.md](16-dead-code-contract.md) diff --git a/docs/book/10-html-render.md b/docs/book/10-html-render.md index b87d377..6b64879 100644 --- a/docs/book/10-html-render.md +++ b/docs/book/10-html-render.md @@ -2,128 +2,85 @@ ## Purpose -Document HTML rendering as a pure view layer over report data/facts. +Document HTML rendering as a pure view layer over canonical report data. ## Public surface -- Main renderer: `codeclone/html_report.py:build_html_report` -- HTML assembly package: `codeclone/_html_report/*` -- Overview materialization bridge: `codeclone/report/overview.py:materialize_report_overview` -- Escaping helpers: `codeclone/_html_escape.py` -- Snippet/highlight helpers: `codeclone/_html_snippets.py` -- Static template: `codeclone/templates.py:REPORT_TEMPLATE` +- Main renderer: `codeclone/report/html/assemble.py:build_html_report` +- Package entrypoint: `codeclone/report/html/__init__.py:build_html_report` +- Context shaping: `codeclone/report/html/_context.py` +- Escaping helpers: `codeclone/report/html/primitives/escape.py` +- Snippet/highlight helpers: `codeclone/report/html/widgets/snippets.py` +- Sections/widgets/assets: `codeclone/report/html/sections/*`, + `codeclone/report/html/widgets/*`, `codeclone/report/html/assets/*` ## Data model -Inputs to renderer: +Inputs to the renderer: -- canonical report document (`report_document`) when available (preferred path) -- compatibility inputs for direct rendering path: - - grouped clone data (`func_groups`, `block_groups`, `segment_groups`) - - block explainability facts (`block_group_facts`) - - novelty key sets (`new_function_group_keys`, `new_block_group_keys`) - - shared report metadata (`report_meta`) +- canonical `report_document` (preferred path) +- shared `report_meta` +- optional runtime snippet sources for code excerpts Output: -- single self-contained HTML string - -Refs: - -- `codeclone/html_report.py:build_html_report` +- one self-contained HTML string ## Contracts -- HTML must not recompute detection semantics; it renders facts from core/report layers. -- Explainability hints shown in UI are sourced from `build_block_group_facts` data. -- Provenance panel mirrors report metadata contract. -- HTML may expose local UX affordances such as the health-grade badge dialog - or provenance modal, but those actions are projections over already computed - report/meta facts. 
-- Overview UI is a report projection: - - KPI cards with baseline-aware tone (`✓ baselined` / `+N` regression) - - Health gauge with baseline delta arc (improvement/degradation) - - Executive Summary: issue breakdown (sorted bars) + source breakdown - - Hotspots by Directory: render-only view over `derived.overview.directory_hotspots` - - Health Profile: full-width radar chart of dimension scores - - Get Badge modal: grade-only / score+grade variants with shields.io embed -- Quality UI is also a report projection: - - deterministic subtabs for complexity, coupling, cohesion, overloaded - modules, and `Coverage Join` when canonical join facts exist - - `Coverage Join` uses the same stat-card and table patterns as other - quality surfaces; it separates measured coverage hotspots from coverage - scope gaps, and invalid joins render a factual unavailable state instead - of a success-style empty message -- Dead-code UI is a single top-level `Dead Code` tab with deterministic split - sub-tabs: `Active` and `Suppressed`. -- Clones UI may append a `Suppressed` sub-tab when canonical report data - includes `findings.groups.clones.suppressed.*`; those rows are factual - projections of policy-excluded clone groups such as `golden_fixture_paths` - and do not become active clone findings. -- IDE deep links: - - An IDE picker in the topbar lets users choose their IDE. The selection is - persisted in `localStorage` (key `codeclone-ide`). - - Supported IDEs: PyCharm, IntelliJ IDEA, VS Code, Cursor, Fleet, Zed. - - File paths across Clones, Quality, Suggestions, Dead Code, and Findings - tabs are rendered as `` elements with `data-file` - (absolute path) and `data-line` attributes. - - JetBrains IDEs use `jetbrains://` protocol (requires Toolbox); others use - native URL schemes (`vscode://`, `cursor://`, `fleet://`, `zed://`). - - The scan root is embedded as `data-scan-root` on `` so that - JetBrains links can derive the project name and relative path. - - When no IDE is selected, links are inert (no `href`, default cursor). +- HTML must not recompute detection semantics; it renders facts from report/core layers. +- Provenance panels mirror canonical report/meta facts. +- Overview, Quality, Suggestions, Dead Code, and Clones tabs are projections over canonical report sections. +- Quality may include report-only subtabs such as `Coverage Join` and + `Security Surfaces`; these remain factual projections over canonical metrics + families rather than HTML-only analysis. +- IDE deep links are HTML-only UX over canonical path/line facts. +- Missing snippets or optional meta fields render safe factual fallbacks rather than invented data. Refs: -- `codeclone/report/explain.py:build_block_group_facts` +- `codeclone/report/html/assemble.py:build_html_report` +- `codeclone/report/html/sections/_clones.py:_render_group_explanation` +- `codeclone/report/html/sections/_meta.py:render_meta_panel` +- `codeclone/report/html/assets/js.py:_IDE_LINKS` - `codeclone/report/overview.py:materialize_report_overview` -- `codeclone/_html_report/_sections/_clones.py:_render_group_explanation` -- `codeclone/_html_report/_sections/_meta.py:render_meta_panel` -- `codeclone/_html_js.py:_IDE_LINKS` -- `codeclone/_html_report/_assemble.py` (IDE picker topbar widget) ## Invariants (MUST) -- All user/content fields are escaped for text/attributes before insertion. +- User/content fields are escaped before insertion into HTML. - Missing file snippets render explicit fallback blocks. 
-- Novelty controls reflect baseline trust split note and per-group novelty flags. -- Suppressed dead-code rows are rendered only from report dead-code suppression - payloads and do not become active dead-code findings in UI tables. -- Structural finding cards may render a compact inline suggested action when a - low-signal local hint intentionally has no separate suggestion card. -- IDE link `data-file` and `data-line` attributes are escaped via - `_escape_html` before insertion into HTML. +- Novelty badges reflect baseline trust and per-group novelty flags. +- Suppressed dead-code rows render only from report suppression payloads. +- Path-link `data-file` and `data-line` attributes are escaped before insertion. Refs: -- `codeclone/_html_escape.py:_escape_html` -- `codeclone/_html_snippets.py:_render_code_block` -- `codeclone/_html_report/_sections/_clones.py:render_clones_panel` -- `codeclone/_html_report/_tables.py` (path cell IDE links) -- `codeclone/report/findings.py` (structural findings IDE links) +- `codeclone/report/html/primitives/escape.py:_escape_html` +- `codeclone/report/html/widgets/snippets.py:_render_code_block` +- `codeclone/report/html/widgets/tables.py` ## Failure modes -| Condition | Behavior | -|-------------------------------------|---------------------------------------------| -| Source file unreadable for snippet | Render fallback snippet with message | -| Missing/invalid optional meta field | Render empty or `(none)`-equivalent display | -| Pygments unavailable | Escape-only fallback code rendering | +| Condition | Behavior | +|-------------------------------------|----------------------------------------| +| Source file unreadable for snippet | Render fallback snippet with message | +| Missing/invalid optional meta field | Render empty or `(none)`-style display | +| Pygments unavailable | Escape-only fallback code rendering | Refs: -- `codeclone/_html_snippets.py:_FileCache.get_lines_range` -- `codeclone/_html_snippets.py:_try_pygments` +- `codeclone/report/html/widgets/snippets.py:_FileCache` +- `codeclone/report/html/widgets/snippets.py:_try_pygments` ## Determinism / canonicalization -- Section/group ordering follows sorted report inputs. +- Section and group ordering follow sorted canonical report inputs. - Metadata rows are built in fixed order. Refs: -- `codeclone/_html_report/_assemble.py:build_html_report` -- `codeclone/_html_report/_sections/_meta.py:render_meta_panel` +- `codeclone/report/html/assemble.py:build_html_report` +- `codeclone/report/html/sections/_meta.py:render_meta_panel` ## Locked by tests @@ -132,16 +89,9 @@ Refs: - `tests/test_html_report.py::test_html_report_escapes_script_breakout_payload` - `tests/test_html_report.py::test_html_report_missing_source_snippet_fallback` - `tests/test_html_report.py::test_html_and_json_group_order_consistent` +- `tests/test_html_report.py::test_html_report_quality_includes_security_surfaces_subtab` ## Non-guarantees -- CSS/visual system and interaction details may evolve without schema bump. -- HTML-only interaction affordances (theme toggle, IDE picker, provenance modal, - badge modal, radar chart) are not baseline/cache/report contracts. -- IDE deep link behavior depends on the user's local IDE installation and - protocol handler registration (e.g. JetBrains Toolbox for `jetbrains://`). -- Overview layout (KPI grid, executive summary, analytics) is a pure view - concern; only the underlying data identity and ordering are contract-sensitive. 
-- Direct `build_html_report(...)` compatibility paths without a canonical - `report_document` may omit `directory_hotspots`; HTML must not approximate - directory aggregates from suggestion cards. +- CSS, layout, and interaction details may evolve without a schema bump. +- IDE deep link behavior depends on local IDE installation and protocol handlers. diff --git a/docs/book/11-security-model.md b/docs/book/11-security-model.md index 548c817..ac9d3e7 100644 --- a/docs/book/11-security-model.md +++ b/docs/book/11-security-model.md @@ -7,10 +7,12 @@ Describe implemented protections and explicit security boundaries. ## Public surface - Scanner path validation: `codeclone/scanner.py:iter_py_files` -- File read limits and parser limits: `codeclone/cli.py:process_file`, `codeclone/extractor.py:_parse_limits` -- Baseline/cache validation: `codeclone/baseline.py`, `codeclone/cache.py` -- HTML escaping: `codeclone/_html_escape.py`, `codeclone/html_report.py` -- MCP read-only enforcement: `codeclone/mcp_service.py`, `codeclone/mcp_server.py` +- File read and parser limits: `codeclone/core/worker.py:process_file`, + `codeclone/analysis/parser.py:_parse_limits` +- Baseline/cache validation: `codeclone/baseline/*`, `codeclone/cache/*` +- HTML escaping: `codeclone/report/html/primitives/escape.py`, + `codeclone/report/html/assemble.py` +- MCP read-only enforcement: `codeclone/surfaces/mcp/*` ## Data model @@ -19,44 +21,39 @@ Security-relevant input classes: - filesystem paths (root/source/baseline/cache/report) - untrusted JSON files (baseline/cache) - untrusted source snippets and metadata rendered into HTML +- MCP request parameters (`root`, filters, diff refs, cache policy) ## Contracts - CodeClone parses source text; it does not execute repository Python code. - Sensitive root directories are blocked by scanner policy. -- Symlink traversal outside root is skipped. -- HTML report escapes text and attribute contexts before embedding. -- MCP server is read-only by design: no tool mutates source files, baselines, - cache, or report artifacts. -- `--allow-remote` guard must be passed explicitly for non-local transports; - default is local-only (`stdio`). -- `cache_policy=refresh` is rejected — MCP cannot trigger cache invalidation. -- Review markers (`mark_finding_reviewed`) are session-local in-memory state; - they are never persisted to disk or leaked into baselines/reports. -- `git_diff_ref` is validated as a safe single revision expression before any - `git diff` subprocess call. Leading option-like prefixes, whitespace/control - characters, and unsupported punctuation are rejected. -- Run history is bounded by `--history-limit` (default 10) to prevent - unbounded memory growth. +- Symlink traversal outside the root is skipped. +- HTML escapes text and attribute contexts before embedding. +- MCP is read-only by design: + no tool mutates source files, baselines, cache, or report artifacts. +- `--allow-remote` is required for non-local transports. +- `cache_policy=refresh` is rejected by MCP. +- Review markers are session-local in-memory state only. +- `git_diff_ref` is validated as a safe single revision expression before any `git diff` subprocess call. 
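+
+A minimal sketch of that style of allow-list revision validation; the shipped
+rules may differ in detail:
+
+```python
+import re
+
+# Illustrative allow-list: alphanumeric start, then common revision characters.
+_SAFE_REV = re.compile(r"[A-Za-z0-9][A-Za-z0-9._/~^-]*")
+
+
+def is_safe_git_rev(ref: str) -> bool:
+    # Rejects empty refs, option-like "-" prefixes, whitespace/control
+    # characters, and punctuation outside the allow-list.
+    return bool(_SAFE_REV.fullmatch(ref))
+```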
Refs: -- `codeclone/extractor.py:_parse_with_limits` +- `codeclone/analysis/parser.py:_parse_with_limits` - `codeclone/scanner.py:SENSITIVE_DIRS` - `codeclone/scanner.py:iter_py_files` -- `codeclone/_html_escape.py:_escape_html` +- `codeclone/report/html/primitives/escape.py:_escape_html` ## Invariants (MUST) - Baseline and cache integrity checks use constant-time comparison. - Size guards are enforced before parsing baseline/cache JSON. -- Cache failures degrade safely (warning + ignore), baseline trust failures follow trust model. +- Cache failures degrade safely; baseline trust failures follow the explicit trust model. Refs: -- `codeclone/baseline.py:Baseline.verify_integrity` -- `codeclone/cache.py:Cache.load` -- `codeclone/cli.py:_main_impl` +- `codeclone/baseline/clone_baseline.py:Baseline.verify_integrity` +- `codeclone/cache/store.py:Cache.load` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Failure modes @@ -68,20 +65,20 @@ Refs: | Oversized cache | Cache ignored | | HTML-injected payload in metadata/source | Escaped output | | `--allow-remote` not passed for HTTP | Transport rejected | -| `cache_policy=refresh` requested | Policy rejected | +| `cache_policy=refresh` requested in MCP | Policy rejected | | `git_diff_ref` fails validation | Parameter rejected | ## Determinism / canonicalization - Canonical JSON hashing for baseline/cache prevents formatting-only drift. -- Security failures map to explicit statuses (baseline/cache enums). +- Security failures map to explicit statuses rather than silent mutation. Refs: -- `codeclone/baseline.py:_compute_payload_sha256` -- `codeclone/cache.py:_canonical_json` -- `codeclone/baseline.py:BaselineStatus` -- `codeclone/cache.py:CacheStatus` +- `codeclone/baseline/trust.py:_compute_payload_sha256` +- `codeclone/cache/integrity.py:canonical_json` +- `codeclone/baseline/trust.py:BaselineStatus` +- `codeclone/cache/versioning.py:CacheStatus` ## Locked by tests @@ -90,8 +87,8 @@ Refs: - `tests/test_security.py::test_html_report_escapes_user_content` - `tests/test_html_report.py::test_html_report_escapes_script_breakout_payload` - `tests/test_cache.py::test_cache_too_large_warns` -- `tests/test_mcp_service.py::test_cache_policy_refresh_rejected` -- `tests/test_mcp_server.py::test_allow_remote_guard` +- `tests/test_mcp_service.py::test_mcp_service_rejects_refresh_cache_policy_in_read_only_mode` +- `tests/test_mcp_server.py::test_mcp_server_main_rejects_non_loopback_host_without_opt_in` ## Non-guarantees diff --git a/docs/book/12-determinism.md b/docs/book/12-determinism.md index e0579f1..6209a41 100644 --- a/docs/book/12-determinism.md +++ b/docs/book/12-determinism.md @@ -6,9 +6,11 @@ Document deterministic behavior and canonicalization controls. 
## Public surface -- Sorting and traversal: `codeclone/scanner.py`, `codeclone/report/serialize.py`, `codeclone/cache.py` -- Canonical hashing: `codeclone/baseline.py`, `codeclone/cache.py` -- Golden detector snapshot policy: `tests/test_detector_golden.py` +- Sorted file traversal: `codeclone/scanner.py` +- Canonical report construction: `codeclone/report/document/*` +- Deterministic text projection: `codeclone/report/renderers/text.py` +- Baseline hashing: `codeclone/baseline/trust.py` +- Cache signing: `codeclone/cache/integrity.py` ## Data model @@ -18,57 +20,56 @@ Deterministic outputs depend on: - fixed baseline/cache/report schemas - sorted file traversal - sorted group keys and item records -- canonical JSON serialization for hashes +- canonical JSON serialization for hashes/signatures ## Contracts -- JSON report uses deterministic ordering for files/groups/items. -- TXT report uses deterministic metadata key order and group/item ordering. +- Canonical JSON report uses deterministic ordering for files, groups, items, and summaries. +- Text/Markdown/SARIF projections are deterministic views over the canonical report. - Baseline hash is canonical and independent from non-payload metadata fields. - Cache signature is canonical and independent from JSON whitespace. Refs: -- `codeclone/report/json_contract.py:build_report_document` -- `codeclone/report/serialize.py:render_text_report_document` -- `codeclone/baseline.py:_compute_payload_sha256` -- `codeclone/cache_io.py:sign_cache_payload` +- `codeclone/report/document/builder.py:build_report_document` +- `codeclone/report/renderers/text.py:render_text_report_document` +- `codeclone/baseline/trust.py:_compute_payload_sha256` +- `codeclone/cache/integrity.py:sign_cache_payload` ## Invariants (MUST) - `inventory.file_registry.items` is lexicographically sorted. - finding groups/items and derived hotlists are deterministically ordered. -- Baseline clone lists are sorted and unique. -- Golden detector test runs only on canonical Python tag from fixture metadata. +- baseline clone lists are sorted and unique. +- golden detector fixtures run only on the canonical Python tag from fixture metadata. 
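+
+The ordering invariants reduce to one technique: tuple-based sort keys in
+which every distinguishing field participates, so ties never fall back to
+input order. A sketch with a hypothetical record shape:
+
+```python
+# Hypothetical record shape; real group records carry more fields.
+groups = [
+    {"family": "clones", "kind": "functions", "id": "grp-2", "spread_files": 3},
+    {"family": "clones", "kind": "functions", "id": "grp-1", "spread_files": 3},
+]
+
+# Equal spreads still order deterministically because the id breaks the tie.
+groups.sort(key=lambda g: (g["family"], g["kind"], -g["spread_files"], g["id"]))
+```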
Refs: -- `codeclone/report/json_contract.py:_build_inventory_payload` -- `codeclone/baseline.py:_require_sorted_unique_ids` +- `codeclone/report/document/inventory.py:_build_inventory_payload` +- `codeclone/baseline/trust.py:_require_sorted_unique_ids` - `tests/test_detector_golden.py::test_detector_output_matches_golden_fixture` ## Failure modes -| Condition | Determinism impact | -|-------------------------------------|--------------------------------------------------------| -| Different Python tag | Clone IDs may differ; baseline considered incompatible | -| Unsorted/non-canonical baseline IDs | Baseline rejected as invalid | -| Cache signature mismatch | Cache ignored and recomputed | -| Different cache provenance state | `meta.cache_*` differs by design | +| Condition | Determinism impact | +|-------------------------------------|-----------------------------------------------------| +| Different Python tag | Clone IDs may differ; baseline becomes incompatible | +| Unsorted/non-canonical baseline IDs | Baseline rejected as invalid | +| Cache signature mismatch | Cache ignored and recomputed | +| Different cache provenance state | `meta.cache_*` differs by design | ## Determinism / canonicalization Primary canonicalization points: -- `json.dumps(..., sort_keys=True, separators=(",", ":"), ensure_ascii=False)` for baseline/cache payload - hash/signature. -- tuple-based sort keys for report record arrays. +- canonical JSON with sorted keys and compact separators for baseline/cache hashing +- stable tuple-based sort keys for report arrays and hotlists Refs: -- `codeclone/baseline.py:_compute_payload_sha256` -- `codeclone/cache_io.py:canonical_json` -- `codeclone/report/json_contract.py:_build_integrity_payload` +- `codeclone/baseline/trust.py:_compute_payload_sha256` +- `codeclone/cache/integrity.py:canonical_json` +- `codeclone/report/document/integrity.py:_build_integrity_payload` ## Locked by tests @@ -81,5 +82,4 @@ Refs: ## Non-guarantees - Determinism is not guaranteed across different `python_tag` values. -- Byte-identical reports are not guaranteed across different cache provenance - states (`cache_status`, `cache_used`, `cache_schema_version`). +- Byte-identical reports are not guaranteed across different cache provenance states. 
diff --git a/docs/book/13-testing-as-spec.md b/docs/book/13-testing-as-spec.md index c2b03b2..4ec392c 100644 --- a/docs/book/13-testing-as-spec.md +++ b/docs/book/13-testing-as-spec.md @@ -33,16 +33,17 @@ Test classes by role: The following matrix is treated as executable contract: -| Contract | Tests | -|--------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| -| Baseline schema/integrity/compat gates | `tests/test_baseline.py` | -| Cache v2.5 fail-open + status mapping + API-surface-aware reuse + API signature order preservation | `tests/test_cache.py`, `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag`, `tests/test_cli_inprocess.py::test_cli_public_api_breaking_count_stable_across_warm_cache`, `tests/test_cli_inprocess.py::test_cli_api_surface_ignores_non_api_warm_cache` | -| Exit code categories and markers | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py` | -| Report schema v2.8 canonical/derived/integrity + JSON/TXT/MD/SARIF projections | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py` | -| HTML render-only explainability + escaping | `tests/test_html_report.py` | -| Current-run Cobertura coverage join parsing, gating, and projections | `tests/test_coverage_join.py`, `tests/test_pipeline_metrics.py`, `tests/test_cli_unit.py`, `tests/test_mcp_service.py`, `tests/test_html_report.py` | -| Golden fixture clone exclusion policy | `tests/test_golden_fixtures.py`, `tests/test_cli_inprocess.py::test_cli_pyproject_golden_fixture_paths_exclude_fixture_clone_groups`, `tests/test_report.py::test_report_json_clone_groups_can_include_suppressed_golden_fixture_bucket` | -| Scanner traversal safety | `tests/test_scanner_extra.py`, `tests/test_security.py` | +| Contract | Tests | +|----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Baseline schema/integrity/compat gates | `tests/test_baseline.py` | +| Cache v2.6 fail-open + status mapping + API-surface-aware reuse + security-surface persistence + API signature order preservation | `tests/test_cache.py`, `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag`, `tests/test_cli_inprocess.py::test_cli_public_api_breaking_count_stable_across_warm_cache`, `tests/test_cli_inprocess.py::test_cli_api_surface_ignores_non_api_warm_cache` | +| Exit code categories and markers | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py` | +| Report schema v2.10 canonical/derived/integrity + JSON/TXT/MD/SARIF projections | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py` | +| HTML render-only explainability + escaping | `tests/test_html_report.py` | +| Current-run Cobertura coverage join parsing, gating, and projections | `tests/test_coverage_join.py`, `tests/test_pipeline_metrics.py`, `tests/test_cli_unit.py`, `tests/test_mcp_service.py`, `tests/test_html_report.py` | +| Report-only security surfaces inventory and projections | `tests/test_security_surfaces.py`, 
`tests/test_pipeline_metrics.py`, `tests/test_cache.py`, `tests/test_report_contract_coverage.py`, `tests/test_cli_unit.py`, `tests/test_html_report.py`, `tests/test_mcp_service.py`, `tests/test_mcp_server.py` | +| Golden fixture clone exclusion policy | `tests/test_golden_fixtures.py`, `tests/test_cli_inprocess.py::test_cli_pyproject_golden_fixture_paths_exclude_fixture_clone_groups`, `tests/test_report.py::test_report_json_clone_groups_can_include_suppressed_golden_fixture_bucket` | +| Scanner traversal safety | `tests/test_scanner_extra.py`, `tests/test_security.py` | ## Invariants (MUST) diff --git a/docs/book/14-compatibility-and-versioning.md b/docs/book/14-compatibility-and-versioning.md index a68bdd2..922382a 100644 --- a/docs/book/14-compatibility-and-versioning.md +++ b/docs/book/14-compatibility-and-versioning.md @@ -7,12 +7,16 @@ compatibility is enforced. ## Public surface -- Version constants: `codeclone/contracts.py` -- Baseline compatibility checks: `codeclone/baseline.py:Baseline.verify_compatibility` -- Metrics baseline compatibility checks: `codeclone/metrics_baseline.py:MetricsBaseline.verify_compatibility` -- Cache compatibility checks: `codeclone/cache.py:Cache.load` -- Report schema assignment: `codeclone/report/json_contract.py:build_report_document` -- MCP public surface: `codeclone/mcp_server.py`, `codeclone/mcp_service.py` +- Version constants: `codeclone/contracts/__init__.py` +- Clone baseline compatibility: + `codeclone/baseline/clone_baseline.py:Baseline.verify_compatibility` +- Metrics baseline compatibility: + `codeclone/baseline/metrics_baseline.py:MetricsBaseline.verify_compatibility` +- Cache compatibility: `codeclone/cache/store.py:Cache.load` +- Report schema assignment: + `codeclone/report/document/builder.py:build_report_document` +- MCP public surface: `codeclone/surfaces/mcp/server.py`, + `codeclone/surfaces/mcp/service.py` ## Data model @@ -20,144 +24,71 @@ Current contract versions: - `BASELINE_SCHEMA_VERSION = "2.1"` - `BASELINE_FINGERPRINT_VERSION = "1"` -- `CACHE_VERSION = "2.5"` -- `REPORT_SCHEMA_VERSION = "2.8"` -- `METRICS_BASELINE_SCHEMA_VERSION = "1.2"` (used only when metrics are stored - in a dedicated metrics-baseline file instead of the default unified baseline) +- `CACHE_VERSION = "2.6"` +- `REPORT_SCHEMA_VERSION = "2.10"` +- `METRICS_BASELINE_SCHEMA_VERSION = "1.2"` Refs: -- `codeclone/contracts.py` +- `codeclone/contracts/__init__.py` ## Contracts Version bump rules: -- Bump **baseline schema** only for baseline JSON layout/type changes. -- Bump **fingerprint version** when clone key semantics change. -- Bump **cache schema** for cache wire-format/validation changes and for - cached-analysis semantic changes that would otherwise leave stale cache - entries looking compatible to runtime validation. -- Bump **report schema** for canonical report document contract changes - (`report_schema_version`, consumed by JSON/TXT/Markdown/SARIF and HTML provenance/view). -- Bump **metrics-baseline schema** only for dedicated metrics-baseline payload - changes. -- This schema does **not** imply that metrics normally live in a separate file: - the default runtime path is still the unified baseline file, and the - standalone metrics-baseline schema applies only when users opt into a - different metrics-baseline path. -- MCP does not currently define a separate schema/version constant; tool names, - resource shapes, and documented request/response semantics are therefore - package-versioned public surface and must be documented/tested when changed. 
-- Slimming or splitting MCP-only projections (for example, summary payloads or - `metrics` vs `metrics_detail`) does not change `report_schema_version` as long - as the canonical report document and finding identities remain unchanged. -- The same rule applies to finding-level MCP projection changes such as - short MCP ids, slim summary locations, or omitting `priority_factors` - outside `detail_level="full"`. -- Additive MCP-only convenience fields/projections such as - `cache.freshness`, production-first triage, `health_scope`, `focus`, or - `new_by_source_kind` also do not change - `report_schema_version` when they are derived from unchanged canonical report - and summary data. -- The same rule applies to bounded MCP semantic guidance such as - `help(topic=...)`: package-versioned wording and routing may evolve, but they - do not change `report_schema_version` as long as canonical report semantics - and finding identities remain unchanged. -- Canonical report changes such as `meta.analysis_thresholds.design_findings` - or threshold-aware design finding materialization do change - `report_schema_version` because they alter canonical report semantics and - integrity payload. -- The same is true for additive canonical metrics families such as - `metrics.families.overloaded_modules`, `coverage_adoption`, `api_surface`, - or `coverage_join`: even when the layer is report-only or current-run only, - it still changes canonical report schema and integrity payload, so it - requires a report-schema bump. -- The same rule applies to new canonical suppressed-finding buckets such as - `findings.groups.clones.suppressed.*`: even though they are non-active - review facts, they still change canonical report shape and integrity payload. -- CodeClone does not currently define a separate health-model version constant. - Health-score semantics are package-versioned and must be documented in the - Health Score chapter and release notes when they change. - -Baseline compatibility rules: - -- Runtime accepts baseline schema majors `1` and `2` with supported minors. -- Runtime writes current schema (`2.1`) on new/updated baseline saves. -- Embedded top-level `metrics` is valid only for baseline schema `>= 2.0`. -- Unified clone baselines may also embed top-level `api_surface` when metrics - baseline data is stored in the same file. -- Embedded and standalone `api_surface` snapshots now use compact symbol wire - layout (`local_name` relative to `module`, `filepath` relative to the - baseline directory when possible) while runtime reconstructs full canonical - qualnames and runtime filepaths before comparison. This is a schema change - for baseline `2.1` / metrics-baseline `1.2`, not a silent serialization - detail. -- Capability-sensitive metrics gates (for example adoption regression or API - break gating) must check for the required embedded data, not only the clone - baseline schema version. - -Metrics-baseline compatibility rules: - -- Runtime writes standalone metrics-baseline schema `1.2`. -- Runtime accepts standalone metrics-baseline `1.1` and `1.2`. -- When metrics are embedded into the unified clone baseline, the embedded - metrics section follows the clone baseline schema compatibility window - instead (`2.0` and `2.1` in the current runtime). - -Baseline regeneration rules: - -- Required when `fingerprint_version` changes. -- Required when `python_tag` changes. -- Not required for package patch/minor updates if compatibility gates still pass. 
+- bump **baseline schema** only for clone-baseline JSON layout/type changes +- bump **fingerprint version** when clone identity semantics change +- bump **cache schema** for cache wire-format or compatibility-semantics changes +- bump **report schema** for canonical report document shape/meaning changes +- bump **metrics-baseline schema** only for standalone metrics-baseline payload changes -## Health model evolution +Operational compatibility rules: + +- runtime writes baseline schema `2.1` +- runtime accepts clone baseline `1.0`, `2.0`, and `2.1` +- runtime writes standalone metrics-baseline schema `1.2` +- runtime accepts standalone metrics-baseline `1.1` and `1.2` +- runtime writes cache schema `2.6` +- MCP does not define a separate schema constant; tool/resource semantics are package-versioned public surface -Health Score is stable within a given scoring model, but the scoring model may -evolve across releases. +Baseline regeneration is required when: -New signal families may first appear as report-only or experimental layers. -After validation and contract hardening, selected layers may later be promoted -into scoring. +- `fingerprint_version` changes +- `python_tag` changes -Future CodeClone releases may expand the Health Score formula with additional -validated signal families. As a result, a repository's score may decrease after -upgrade even if the code itself did not become worse. In such cases, the change -reflects an evolved scoring model rather than a retroactive decline in code -quality. +It is not required for package patch/minor updates when compatibility gates still pass. -Short operational reminder: +## Health model evolution -> A lower score after upgrade may reflect a broader health model, not only -> worse code. +CodeClone does not currently define a separate health-model version constant. +Health semantics are package-versioned behavior and must be documented in: -Contract consequence: +- this chapter +- [15-health-score.md](15-health-score.md) +- release notes -- health-model expansion does not necessarily require a baseline/cache/report - schema bump; -- but it **does** require explicit documentation and release-note coverage, - because it changes user-visible scoring semantics. +A lower score after upgrade may reflect a broader scoring model, not only worse code. ## Invariants (MUST) -- Contract changes must include code updates and changelog/docs updates. -- Schema mismatches must map to explicit statuses. -- Legacy baseline payloads (<=1.3 layout) remain untrusted and require regeneration. +- Contract changes require code + tests + changelog/docs updates. +- Schema mismatches map to explicit statuses. +- Legacy baselines stay untrusted and require regeneration. 
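+
+For example, once `fingerprint_version` changes, the stored baseline is rejected
+and must be written fresh. A minimal regeneration sketch (illustrative only;
+`--update-baseline` is the documented CLI flag, and the default paths are the
+project's own):
+
+```
+# clone identities changed -> stored baseline is untrusted until regenerated
+codeclone . --update-baseline
+```
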
Refs: -- `codeclone/baseline.py:BaselineStatus` -- `codeclone/baseline.py:_is_legacy_baseline_payload` +- `codeclone/baseline/trust.py:BaselineStatus` +- `codeclone/baseline/clone_baseline.py:_is_legacy_baseline_payload` ## Failure modes -| Change type | User impact | -|------------------------------|-----------------------------------------------------------------------| -| Baseline schema bump | older unsupported baseline schemas become untrusted until regenerated | -| Fingerprint bump | clone IDs change; baseline regeneration required | -| Cache schema bump | old caches are ignored and rebuilt automatically | -| Report schema bump | downstream report consumers must update | -| Metrics-baseline schema bump | dedicated metrics-baseline files must be regenerated | +| Change type | User impact | +|------------------------------|----------------------------------------------------------------| +| Baseline schema bump | Older unsupported baselines become untrusted until regenerated | +| Fingerprint bump | Clone IDs change; baseline regeneration required | +| Cache schema bump | Old caches are ignored and rebuilt automatically | +| Report schema bump | Downstream report consumers must update | +| Metrics-baseline schema bump | Dedicated metrics-baseline files must be regenerated | ## Determinism / canonicalization @@ -166,9 +97,9 @@ Refs: Refs: -- `codeclone/contracts.py` -- `codeclone/baseline.py:Baseline.verify_compatibility` -- `codeclone/metrics_baseline.py:MetricsBaseline.verify_compatibility` +- `codeclone/contracts/__init__.py` +- `codeclone/baseline/clone_baseline.py:Baseline.verify_compatibility` +- `codeclone/baseline/metrics_baseline.py:MetricsBaseline.verify_compatibility` ## Locked by tests @@ -180,7 +111,5 @@ Refs: ## Non-guarantees -- Backward compatibility is not guaranteed across incompatible schema/fingerprint - bumps. -- Health Score is not frozen forever as a mathematical formula; what is frozen - is the obligation to document scoring-model changes and present them honestly. +- Backward compatibility is not guaranteed across incompatible schema/fingerprint bumps. +- Health Score is not mathematically frozen forever; the obligation to document scoring-model changes is. diff --git a/docs/book/15-health-score.md b/docs/book/15-health-score.md index be7b71e..207d790 100644 --- a/docs/book/15-health-score.md +++ b/docs/book/15-health-score.md @@ -2,131 +2,86 @@ ## Purpose -Define the current Health Score model, the report-only layers that do **not** -yet affect it, and the policy for future scoring-model expansion. - -Health Score is a user-facing contract. It is not just an internal aggregate. +Define the current Health Score model, what does not affect it yet, and the +policy for future scoring-model expansion. 
## Public surface

- Scoring model: `codeclone/metrics/health.py:compute_health`
-- Weight assignment: `codeclone/contracts.py:HEALTH_WEIGHTS`
-- Input wiring: `codeclone/pipeline.py:compute_project_metrics`
+- Weight assignment: `codeclone/contracts/__init__.py:HEALTH_WEIGHTS`
+- Input wiring: `codeclone/core/pipeline.py:compute_project_metrics`
- Canonical report surface:
-  `codeclone/report/json_contract.py:build_report_document`
-- Overview projection:
-  `codeclone/report/json_contract.py:_health_snapshot`
+  `codeclone/report/document/builder.py:build_report_document`
+- Health snapshot projections:
+  `codeclone/report/derived.py:_health_snapshot`,
+  `codeclone/report/overview.py:_health_snapshot`
- CLI / HTML / MCP consumers:
-  `codeclone/_cli_summary.py`, `codeclone/_html_report/_sections/_overview.py`,
-  `codeclone/mcp_service.py`
+  `codeclone/surfaces/cli/summary.py`,
+  `codeclone/report/html/sections/_overview.py`,
+  `codeclone/surfaces/mcp/session.py`

## Contracts

- Health Score is computed only in `analysis_mode=full`.
-- In `analysis_mode=clones_only`, health is intentionally unavailable rather
-  than fabricated from partial inputs.
+- In `analysis_mode=clones_only`, health is intentionally unavailable.
- The current scoring model includes exactly seven dimensions:
  `clones`, `complexity`, `coupling`, `cohesion`, `dead_code`, `dependencies`,
  `coverage`.
-- Only dimensions produced by `compute_health(...)` contribute to the score.
-  Report-only or advisory layers must not affect the score until they are
-  explicitly promoted into the scoring model and documented.
+- Only dimensions produced by `compute_health(...)` contribute to the score.
+  Report-only or advisory layers must not affect the score until they are
+  explicitly promoted and documented.

-## What currently affects Health Score
+## Scoring model

-Current weights from `codeclone/contracts.py:HEALTH_WEIGHTS`:
+Current weights from `codeclone/contracts/__init__.py:HEALTH_WEIGHTS`:

-| Dimension | Weight | Current inputs in code | Signal type | Visible report/UI surface |
-|--------------|--------|---------------------------------------------------------------------------------------|----------------------------------|-------------------------------------------------------------------------------------------|
-| Clones | 25% | function clone groups + block clone groups, normalized by `files_analyzed_or_cached` | aggregate project-level | `metrics.families.health.summary.dimensions.clones`, HTML `Health Profile`, CLI, MCP |
-| Complexity | 20% | `complexity_avg`, `complexity_max`, `high_risk_functions` | local findings -> aggregate | `metrics.families.health.summary.dimensions.complexity`, design findings, HTML, CLI, MCP |
-| Cohesion | 15% | `cohesion_avg`, `low_cohesion_classes` | local findings -> aggregate | `metrics.families.health.summary.dimensions.cohesion`, design findings, HTML, CLI, MCP |
-| Coupling | 10% | `coupling_avg`, `coupling_max`, `high_risk_classes` | local findings -> aggregate | `metrics.families.health.summary.dimensions.coupling`, design findings, HTML, CLI, MCP |
-| Dead code | 10% | count of active dead-code items after suppression and non-actionable filtering | local findings -> aggregate | `metrics.families.dead_code`, health dimensions, HTML, CLI, MCP |
-| Dependencies | 10% | `dependency_cycles`, `dependency_max_depth` | aggregate graph-level | `metrics.families.dependencies`, health dimensions, HTML, CLI, MCP |
-| Coverage | 10% | `files_analyzed_or_cached / files_found` | aggregate inventory-completeness | `metrics.families.health.summary.dimensions.coverage`, HTML `Health Profile`, MCP |
+| Dimension | Weight | Signal |
+|--------------|--------|------------------------------------------------------------------|
+| Clones | 25% | Function + block clone density |
+| Complexity | 20% | Function-level complexity risk |
+| Cohesion | 15% | Low-cohesion class pressure |
+| Coupling | 10% | Class-level coupling pressure |
+| Dead code | 10% | Active dead-code items after suppression/filtering |
+| Dependencies | 10% | Cycles and deep dependency chains |
+| Coverage | 10% | Analysis completeness (`files_analyzed_or_cached / files_found`) |

Important clarifications:

-- `coverage` here means **analysis completeness**, not test coverage.
-- The clone dimension currently uses only **function** and **block** clone
-  groups. Segment groups are visible in reports, but they do not currently feed
-  Health Score.
-- Dead-code penalties use active dead-code items returned by
-  `find_unused(...)`. Suppressed or non-actionable candidates do not penalize
-  the score.
-- Dependency pressure currently penalizes cycles directly and only penalizes
-  dependency depth beyond the safe zone (`max_depth > 6`).
-
-## Explainability intent
+- `coverage` here means analysis completeness, not test coverage.
+- Segment clones are visible in reports but do not currently affect Health Score.
+- Suppressed or non-actionable dead-code items do not penalize the score.
+- Dependencies score uses the internal module dependency graph only.
+- Cycles still penalize the dependencies dimension directly.
+- Acyclic depth pressure is adaptive:

-The current health model is deterministic and explainable by design:
+```
+  expected_tail = max(ceil(avg_depth * 2.0), p95_depth + 1)
+  tail_pressure = max(0, max_depth - expected_tail)
+  # worked example (assumed inputs): avg_depth=2.1, p95_depth=4, max_depth=9,
+  # cycles=0 -> expected_tail=5, tail_pressure=4, score=100 - 0 - 16 = 84
+  score = 100 - cycles * 25 - tail_pressure * 4
+```

-- every scoring dimension is derived from explicit inputs already present in the
-  pipeline and canonical report;
-- the canonical report exposes the score and per-dimension breakdown under
-  `metrics.families.health.summary`;
-- overview/report projections may summarize the result, but they must not invent
-  extra health heuristics outside the scoring model.
+- This model is internal and not configurable through CLI or `pyproject.toml`.

## Current non-scoring layers

-The following layers are visible today but do **not** currently affect Health
-Score:
-
-### Overloaded Modules
-
-`Overloaded Modules` is currently a report-only experimental layer.
-
-- It surfaces module-level hotspots derived from implementation burden and
-  dependency pressure.
-- It is visible in `metrics.families.overloaded_modules`, HTML, Markdown/TXT, and MCP
-  `metrics_detail(family="overloaded_modules")`.
-- It does not currently affect Health Score, gates, baseline novelty, or SARIF.
-- It is **not** a restatement of cyclomatic complexity: complexity highlights
-  local control-flow hotspots, while Overloaded Modules highlights module-level
-  responsibility overload and dependency pressure.
-
-### Other visible non-scoring layers
+These layers are report-only: they provide signal but are not yet validated
+for scoring-model inclusion.

-- `findings.groups.clones.segments` — canonical report-only segment-clone layer;
-  visible for review, excluded from baseline diff/gating/health.
-- `findings.groups.structural.groups` — report-only structural findings;
-  visible as evidence/advisory material, excluded from health.
-- `derived.suggestions` and `derived.hotlists` — advisory and routing
-  projections; never scoring inputs.
+- `metrics.families.overloaded_modules` +- `metrics.families.security_surfaces` +- `findings.groups.clones.segments` +- `findings.groups.structural.groups` +- `derived.suggestions` +- `derived.hotlists` +- `metrics.families.coverage_join` ## Health model evolution -Health Score is stable within a given scoring model, but the model may evolve -across releases. - -New signal families may first appear as report-only or experimental layers. -After validation and contract hardening, selected layers may later be -introduced into scoring. - -Future CodeClone releases may expand the Health Score formula with additional -validated signal families. As a result, a repository's score may decrease after -upgrade even if the code itself did not become worse. In such cases, the change -reflects an evolved scoring model rather than a retroactive decline in code -quality. - -Promotion rules for a new scoring input: - -- the signal must be deterministic and stable enough for canonical reporting; -- the signal must be explainable in terms of explicit inputs and visible output; -- the signal must be validated on real repositories, not only synthetic cases; -- the change must be documented in release notes and in Health Score docs; -- MCP/HTML/CLI surfaces must continue to present the score honestly after the - expansion. - -Current versioning note: +Future releases may expand the score with additional validated signal families. +If that happens: -- CodeClone does **not** currently define a separate health-model version - constant. -- Health semantics are package-versioned public behavior and must therefore be - documented in this chapter, in compatibility notes, and in release notes when - they change. +- the change must be documented in this chapter and release notes +- CLI/HTML/MCP must continue to present the score honestly +- a lower score after upgrade may reflect a broader model, not only worse code ## Locked by tests @@ -140,4 +95,3 @@ Current versioning note: - [08-report.md](08-report.md) - [14-compatibility-and-versioning.md](14-compatibility-and-versioning.md) - [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) -- [16-dead-code-contract.md](16-dead-code-contract.md) diff --git a/docs/book/15-metrics-and-quality-gates.md b/docs/book/15-metrics-and-quality-gates.md index 5817088..48d230f 100644 --- a/docs/book/15-metrics-and-quality-gates.md +++ b/docs/book/15-metrics-and-quality-gates.md @@ -6,11 +6,12 @@ Define metrics mode selection, metrics-baseline behavior, and gating semantics. 
## Public surface -- Metrics mode wiring: `codeclone/cli.py:_configure_metrics_mode` -- Main orchestration and exit routing: `codeclone/cli.py:_main_impl` -- Gate evaluation: `codeclone/pipeline.py:metric_gate_reasons`, - `codeclone/pipeline.py:gate` -- Metrics baseline persistence/diff: `codeclone/metrics_baseline.py:MetricsBaseline` +- Metrics mode wiring: `codeclone/surfaces/cli/runtime.py:_configure_metrics_mode` +- Main orchestration and exit routing: `codeclone/surfaces/cli/workflow.py:_main_impl` +- Gate evaluation: `codeclone/report/gates/evaluator.py:metric_gate_reasons`, + `codeclone/core/reporting.py:gate` +- Metrics baseline persistence/diff: + `codeclone/baseline/metrics_baseline.py:MetricsBaseline` ## Data model @@ -20,9 +21,8 @@ Metrics gate inputs: `--fail-complexity`, `--fail-coupling`, `--fail-cohesion`, `--fail-health` - adoption threshold gates: `--min-typing-coverage`, `--min-docstring-coverage` -- external Cobertura coverage join: - `--coverage FILE`, `--coverage-min PERCENT`, - `--fail-on-untested-hotspots` +- current-run Cobertura coverage join: + `--coverage`, `--coverage-min`, `--fail-on-untested-hotspots` - boolean structural gates: `--fail-cycles`, `--fail-dead-code` - baseline-aware delta gates: @@ -39,88 +39,48 @@ Modes: - `analysis_mode=full`: metrics computed and suggestions enabled - `analysis_mode=clones_only`: metrics skipped -- Health-score semantics are defined in - [15-health-score.md](15-health-score.md). -- Metrics comparison state is unified by default: unless `--metrics-baseline` - is explicitly redirected, metrics baseline data comes from the same - `codeclone.baseline.json` path as clone baseline data. Refs: -- `codeclone/cli.py:_metrics_flags_requested` -- `codeclone/cli.py:_metrics_computed` -- `codeclone/_cli_meta.py:_build_report_meta` +- `codeclone/surfaces/cli/runtime.py:_metrics_flags_requested` +- `codeclone/surfaces/cli/runtime.py:_metrics_computed` +- `codeclone/surfaces/cli/report_meta.py:_build_report_meta` - `codeclone/metrics/health.py:compute_health` -- `codeclone/contracts.py:HEALTH_WEIGHTS` +- `codeclone/contracts/__init__.py:HEALTH_WEIGHTS` ## Contracts -- `--skip-metrics` is incompatible with metrics gating/update flags and is a - contract error. -- `golden_fixture_paths` is a separate project-level clone policy: - clone groups fully contained in matching `tests/` / `tests/fixtures/` paths - are excluded before health/gate/suggestion evaluation, but remain visible as - suppressed report facts. -- If metrics are not explicitly requested and no metrics baseline exists, - runtime auto-enables clone-only mode (`skip_metrics=true`). -- In clone-only mode: - `skip_dead_code=true`, `skip_dependencies=true`. -- `--fail-dead-code` forces dead-code analysis on (even if metrics are skipped). -- `--fail-cycles` forces dependency analysis on (even if metrics are skipped). -- Type/docstring adoption metrics are computed in full mode. -- `--coverage` joins an external Cobertura XML file to current-run function - spans with stdlib XML parsing only. This signal is not metrics-baseline truth, - is not written to `codeclone.baseline.json`, and does not affect fingerprint - or clone identity semantics. -- Invalid Cobertura XML downgrades to a current-run - `coverage_join.status="invalid"` signal in normal analysis. It does not fail - the run or update any baseline; only `--fail-on-untested-hotspots` upgrades - invalid input into a contract error. 
-- `--api-surface` is opt-in in normal runs, but runtime auto-enables it when
-  `--fail-on-api-break` or `--update-metrics-baseline` needs a public API
-  snapshot.
-- In the normal CLI `Metrics` block, adoption coverage is shown whenever metrics
-  are computed, and the public API surface line appears when `api_surface`
-  facts were collected. A coverage line appears when `--coverage` produced a
-  joined coverage summary.
-- `--update-baseline` in full mode implies metrics-baseline update in the same
-  run.
-- If metrics baseline path equals clone baseline path and clone baseline file is
-  missing, `--update-metrics-baseline` escalates to `--update-baseline` so
-  embedded metrics can be written safely.
-- `--fail-on-new-metrics` requires trusted metrics baseline unless baseline is
-  being updated in the same run.
-- `--fail-on-typing-regression` / `--fail-on-docstring-regression` require a
-  metrics baseline that already contains adoption coverage data.
-- `--fail-on-api-break` requires a metrics baseline that already contains
-  `api_surface` data.
-- `--fail-on-untested-hotspots` requires `--coverage` and a valid Cobertura XML
-  input. It evaluates current-run `coverage_join` facts only for measured
-  medium/high-risk functions below the configured threshold; scope gaps are
-  surfaced separately and do not require or update a metrics baseline. The
-  flag name is retained for CLI compatibility.
-- In CI mode, if metrics baseline was loaded and trusted, runtime enables
-  `fail_on_new_metrics=true`.
+- `--skip-metrics` is incompatible with metrics gating/update flags.
+- If metrics are not explicitly requested and no metrics baseline exists, runtime auto-enables clone-only mode.
+- In clone-only mode, dead-code and dependency analysis are skipped unless explicitly forced by gates.
+- There is currently no user-facing gate or config knob for `dependency_max_depth`;
+  dependency depth contributes to Health Score through the internal adaptive
+  model over `avg_depth`, `p95_depth`, and `max_depth` only.
+- `--coverage` is a current-run signal only; it does not update baseline state.
+- Invalid Cobertura XML becomes `coverage_join.status="invalid"` in normal runs and escalates to a contract error only when
+  hotspot gating requires a valid join.
+- `--api-surface` is opt-in, but runtime auto-enables it when API break gating or metrics-baseline update needs it.
+- `--fail-on-new-metrics` requires a trusted metrics baseline unless baseline is being updated in the same run.
+- `--fail-on-typing-regression`, `--fail-on-docstring-regression`, and `--fail-on-api-break` require the corresponding
+  capability in the trusted metrics baseline.
+- In CI mode, if a trusted metrics baseline is loaded, runtime enables `fail_on_new_metrics=true`.

Refs:

-- `codeclone/cli.py:_configure_metrics_mode`
-- `codeclone/cli.py:_main_impl`
-- `codeclone/metrics_baseline.py:MetricsBaseline.verify_compatibility`
+- `codeclone/surfaces/cli/runtime.py:_configure_metrics_mode`
+- `codeclone/surfaces/cli/workflow.py:_main_impl`
+- `codeclone/baseline/metrics_baseline.py:MetricsBaseline.verify_compatibility`

## Invariants (MUST)

-- Metrics diff is computed only when:
-  metrics were computed and metrics baseline is trusted.
-- Metric gate reasons are emitted in deterministic order:
-  threshold checks -> cycles/dead/health -> NEW-vs-baseline diffs ->
-  adoption/API baseline diffs -> coverage-join hotspot gate.
-- Metric gate reasons are namespaced as `metric:*` in gate output.
+- Metrics diff is computed only when metrics were computed and metrics baseline is trusted. +- Gate reasons are emitted in deterministic order. +- Metric gate reasons are namespaced as `metric:*`. Refs: -- `codeclone/pipeline.py:metric_gate_reasons` -- `codeclone/pipeline.py:gate` +- `codeclone/report/gates/evaluator.py:metric_gate_reasons` +- `codeclone/core/reporting.py:gate` ## Failure modes @@ -132,8 +92,7 @@ Refs: | Invalid Cobertura XML without hotspot gate | Current-run invalid signal, exit `0` | | Coverage hotspot gate without valid `--coverage` input | Contract error, exit `2` | | `--update-metrics-baseline` when metrics were not computed | Contract error, exit `2` | -| Threshold breach or NEW-vs-baseline metric regressions | Gating failure, exit `3` | -| Coverage hotspots from current-run coverage join | Gating failure, exit `3` | +| Threshold breach or metrics regressions | Gating failure, exit `3` | ## Determinism / canonicalization @@ -143,9 +102,9 @@ Refs: Refs: -- `codeclone/metrics_baseline.py:snapshot_from_project_metrics` -- `codeclone/metrics_baseline.py:_compute_payload_sha256` -- `codeclone/metrics_baseline.py:MetricsBaseline.verify_integrity` +- `codeclone/baseline/_metrics_baseline_payload.py:snapshot_from_project_metrics` +- `codeclone/baseline/_metrics_baseline_payload.py:_compute_payload_sha256` +- `codeclone/baseline/metrics_baseline.py:MetricsBaseline.verify_integrity` ## Locked by tests @@ -160,16 +119,4 @@ Refs: ## Non-guarantees - Absolute threshold defaults are not frozen by this chapter. -- Metrics scoring internals, per-dimension weighting, and the exact clone - density curve may evolve if exit semantics and contract statuses stay stable. - See [15-health-score.md](15-health-score.md) for the current model and the - phased expansion policy. - -## See also - -- [15-health-score.md](15-health-score.md) -- [04-config-and-defaults.md](04-config-and-defaults.md) -- [05-core-pipeline.md](05-core-pipeline.md) -- [09-cli.md](09-cli.md) -- [16-dead-code-contract.md](16-dead-code-contract.md) -- [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) +- Metrics scoring internals may evolve if exit semantics and contract statuses stay stable and are documented honestly. 
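+
+For orientation, a representative gated invocation combining the gate inputs
+from this chapter (an illustrative sketch only; the threshold and file values
+are example choices, not frozen defaults):
+
+```
+# exit codes: 0 = pass, 3 = gating failure, 2 = contract error (see Failure modes)
+codeclone . --ci \
+  --fail-health 60 \
+  --fail-cycles \
+  --coverage coverage.xml --coverage-min 80 \
+  --fail-on-untested-hotspots
+```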
diff --git a/docs/book/16-dead-code-contract.md b/docs/book/16-dead-code-contract.md index 118180b..6875bce 100644 --- a/docs/book/16-dead-code-contract.md +++ b/docs/book/16-dead-code-contract.md @@ -8,11 +8,11 @@ Define dead-code liveness rules, canonical symbol-usage boundaries, and gating s - Dead-code detection core: `codeclone/metrics/dead_code.py:find_unused` - Test-path classifier: `codeclone/paths.py:is_test_filepath` -- Inline suppression parser/binder: `codeclone/suppressions.py` +- Inline suppression parser/binder: `codeclone/analysis/suppressions.py` - Extraction of referenced names/candidates: - `codeclone/extractor.py:extract_units_and_stats_from_source` + `codeclone/analysis/units.py:extract_units_and_stats_from_source` - Cache load boundary for referenced names: - `codeclone/pipeline.py:_load_cached_metrics` + `codeclone/core/discovery_cache.py:load_cached_metrics_extended` ## Data model @@ -62,7 +62,7 @@ Refs: - `codeclone/metrics/dead_code.py:_is_non_actionable_candidate` - `codeclone/metrics/dead_code.py:find_unused` -- `codeclone/pipeline.py:metric_gate_reasons` +- `codeclone/report/gates/evaluator.py:metric_gate_reasons` ## Invariants (MUST) @@ -74,8 +74,8 @@ Refs: Refs: - `codeclone/metrics/dead_code.py:find_unused` -- `codeclone/extractor.py:extract_units_and_stats_from_source` -- `codeclone/pipeline.py:_load_cached_metrics` +- `codeclone/analysis/units.py:extract_units_and_stats_from_source` +- `codeclone/core/discovery_cache.py:load_cached_metrics_extended` ## Failure modes @@ -104,9 +104,9 @@ Refs: ## Locked by tests - `tests/test_extractor.py::test_dead_code_marks_symbol_dead_when_referenced_only_by_tests` -- `tests/test_extractor.py::test_dead_code_skips_module_pep562_hooks` -- `tests/test_extractor.py::test_dead_code_applies_inline_suppression_per_declaration` -- `tests/test_extractor.py::test_dead_code_suppression_binding_is_scoped_to_target_symbol` +- `tests/test_extractor.py::test_dead_code_respects_runtime_hooks_and_inline_suppressions[skip_pep562_hooks]` +- `tests/test_extractor.py::test_dead_code_respects_runtime_hooks_and_inline_suppressions[inline_suppression_per_declaration]` +- `tests/test_extractor.py::test_dead_code_respects_runtime_hooks_and_inline_suppressions[suppression_binding_scoped_to_target]` - `tests/test_extractor.py::test_extract_collects_referenced_qualnames_for_import_aliases` - `tests/test_extractor.py::test_collect_dead_candidates_skips_protocol_and_stub_like_symbols` - `tests/test_pipeline_metrics.py::test_load_cached_metrics_ignores_referenced_names_from_test_files` @@ -118,7 +118,7 @@ Refs: - `tests/test_report.py::test_report_json_dead_code_suppressed_items_are_reported_separately` - `tests/test_html_report.py::test_html_report_renders_dead_code_split_with_suppressed_layer` - `tests/test_suppressions.py::test_extract_suppression_directives_supports_inline_and_leading_forms` -- `tests/test_suppressions.py::test_bind_suppressions_applies_only_to_adjacent_declaration_line` +- `tests/test_suppressions.py::test_bind_suppressions_targets_expected_declaration_scope[adjacent_leading_only]` ## Non-guarantees diff --git a/docs/book/17-suggestions-and-clone-typing.md b/docs/book/17-suggestions-and-clone-typing.md index a7eebeb..b9dc7ad 100644 --- a/docs/book/17-suggestions-and-clone-typing.md +++ b/docs/book/17-suggestions-and-clone-typing.md @@ -2,37 +2,38 @@ ## Purpose -Define deterministic clone-type classification and suggestion generation -contracts used by canonical report projections (`JSON` / `TXT` / `Markdown` / -`HTML`). 
+Define deterministic clone-type classification and suggestion generation used by +canonical report projections. ## Public surface - Clone-type classifier: `codeclone/report/suggestions.py:classify_clone_type` - Suggestion engine: `codeclone/report/suggestions.py:generate_suggestions` -- Pipeline integration: `codeclone/pipeline.py:compute_suggestions` -- Report serialization: `codeclone/report/json_contract.py:build_report_document` -- HTML render integration: `codeclone/html_report.py:build_html_report` +- Pipeline integration: `codeclone/core/pipeline.py:compute_suggestions` +- Report serialization: `codeclone/report/document/builder.py:build_report_document` +- HTML render integration: `codeclone/report/html/assemble.py:build_html_report` ## Data model Suggestion shape: -- `severity`: `critical|warning|info` -- `category`: - `clone|structural|complexity|coupling|cohesion|dead_code|dependency` -- `source_kind`: source classification of the primary location - (`production` / `tests` / `fixtures` / `other`) -- `title`, `location`, `steps`, `effort`, `priority` +- `severity` +- `category` +- `source_kind` +- `title` +- `location` +- `steps` +- `effort` +- `priority` Clone typing: - function groups: - Type-1: identical `raw_hash` - Type-2: identical normalized `fingerprint` - - Type-3: mixed fingerprints (same group semantics) + - Type-3: mixed fingerprints inside same group semantics - Type-4: fallback -- block/segment groups: Type-4 +- block and segment groups: Type-4 Refs: @@ -41,36 +42,26 @@ Refs: ## Contracts -- Suggestions are generated only in full metrics mode - (`skip_metrics=false`). +- Suggestions are generated only in full metrics mode. - Suggestions are advisory only and never directly control exit code. -- Suggestions are not a one-to-one mirror of findings. They should exist only - when they add action structure beyond the canonical finding itself. -- Low-signal local structural `info` hints stay in `findings` and do not emit a - separate suggestion card. -- SARIF projection is finding-driven and does not consume suggestion cards. -- JSON report stores clone typing at group level: - - `findings.groups.clones.[*].clone_type` -- Suggestion location is deterministic: first item by stable path/line sort. +- Suggestions are not a one-to-one mirror of findings; they exist only when they add action structure. +- Low-signal local structural info hints stay in findings and do not emit separate suggestion cards. +- SARIF remains finding-driven and does not consume suggestion cards. +- JSON report stores clone typing at group level under clone groups. Refs: -- `codeclone/pipeline.py:analyze` -- `codeclone/pipeline.py:gate` -- `codeclone/report/json_contract.py:build_report_document` +- `codeclone/core/pipeline.py:analyze` +- `codeclone/core/pipeline.py:compute_suggestions` +- `codeclone/report/document/builder.py:build_report_document` - `codeclone/report/suggestions.py:generate_suggestions` ## Invariants (MUST) -- Suggestion priority formula is stable: - `severity_weight / effort_weight`. -- For structural findings, separate suggestion cards are emitted only for the - actionable subset; low-signal local `info` hints remain finding-only. -- Suggestion output is sorted by: - `(-priority, severity, category, source_kind, location, title, subject_key)`. -- Derived suggestion serialization in report JSON applies deterministic ordering by - `(-priority, severity_rank, title, finding_id)`. -- Clone type output for a given group is deterministic for identical inputs. 
+- Suggestion priority formula is stable. +- Structural suggestion cards are emitted only for the actionable subset. +- Suggestion output is deterministically sorted. +- Clone type output for identical inputs is deterministic. Refs: @@ -87,14 +78,13 @@ Refs: ## Determinism / canonicalization -- Classifier uses deterministic set normalization + sorted collections. -- Serializer emits suggestions in generator-provided deterministic order. +- Classifier uses deterministic set normalization and sorted collections. +- Serializer emits suggestions in deterministic order. Refs: - `codeclone/report/suggestions.py:classify_clone_type` -- `codeclone/report/suggestions.py:generate_suggestions` -- `codeclone/report/json_contract.py:build_report_document` +- `codeclone/report/document/builder.py:build_report_document` ## Locked by tests @@ -105,13 +95,5 @@ Refs: ## Non-guarantees -- Suggestion wording can evolve without schema bump. -- Suggestion heuristics may be refined if deterministic ordering and - non-gating behavior remain unchanged. - -## See also - -- [05-core-pipeline.md](05-core-pipeline.md) -- [08-report.md](08-report.md) -- [10-html-render.md](10-html-render.md) -- [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) +- Suggestion wording can evolve without a schema bump. +- Suggestion heuristics may be refined if deterministic ordering and non-gating behavior remain unchanged. diff --git a/docs/book/18-benchmarking.md b/docs/book/18-benchmarking.md index cb49d41..0fdfc08 100644 --- a/docs/book/18-benchmarking.md +++ b/docs/book/18-benchmarking.md @@ -62,7 +62,7 @@ Benchmark output (`benchmark_schema_version=1.0`) contains: Refs: -- `codeclone/report/json_contract.py:_build_integrity_payload` +- `codeclone/report/document/integrity.py:_build_integrity_payload` - `benchmarks/run_benchmark.py` ## Recommended run profile diff --git a/docs/book/19-inline-suppressions.md b/docs/book/19-inline-suppressions.md index b051172..cb296ad 100644 --- a/docs/book/19-inline-suppressions.md +++ b/docs/book/19-inline-suppressions.md @@ -7,11 +7,11 @@ source comments, without introducing broad/project-wide ignores. ## Public surface -- Suppression directive parser and binder: `codeclone/suppressions.py` +- Suppression directive parser and binder: `codeclone/analysis/suppressions.py` - Dead-code final filter: `codeclone/metrics/dead_code.py:find_unused` - Suppressed dead-code projection helper: `codeclone/metrics/dead_code.py:find_suppressed_unused` -- Dead-code candidate extraction: `codeclone/extractor.py:_collect_dead_candidates` +- Dead-code candidate extraction: `codeclone/analysis/_module_walk.py:_collect_dead_candidates` ## Data model @@ -22,9 +22,9 @@ source comments, without introducing broad/project-wide ignores. 
Refs:

-- `codeclone/suppressions.py:SuppressionDirective`
-- `codeclone/suppressions.py:DeclarationTarget`
-- `codeclone/suppressions.py:SuppressionBinding`
+- `codeclone/analysis/suppressions.py:SuppressionDirective`
+- `codeclone/analysis/suppressions.py:DeclarationTarget`
+- `codeclone/analysis/suppressions.py:SuppressionBinding`
- `codeclone/models.py:DeadCandidate`

## Contracts
@@ -76,20 +76,20 @@ Refs:

-- `codeclone/suppressions.py:extract_suppression_directives`
-- `codeclone/suppressions.py:bind_suppressions_to_declarations`
-- `codeclone/cache.py:_canonicalize_cache_entry`
+- `codeclone/analysis/suppressions.py:extract_suppression_directives`
+- `codeclone/analysis/suppressions.py:bind_suppressions_to_declarations`
+- `codeclone/cache/_canonicalize.py:_canonicalize_cache_entry`

## Locked by tests

- `tests/test_suppressions.py::test_extract_suppression_directives_supports_inline_and_leading_forms`
-- `tests/test_suppressions.py::test_extract_suppression_directives_ignores_unknown_and_malformed_safely`
-- `tests/test_suppressions.py::test_bind_suppressions_applies_only_to_adjacent_declaration_line`
-- `tests/test_suppressions.py::test_bind_suppressions_does_not_propagate_class_inline_to_method`
-- `tests/test_suppressions.py::test_bind_suppressions_applies_to_method_target`
+- `tests/test_suppressions.py::test_extract_suppression_directives_ignores_invalid_forms[unknown_and_malformed]`
+- `tests/test_suppressions.py::test_bind_suppressions_targets_expected_declaration_scope[adjacent_leading_only]`
+- `tests/test_suppressions.py::test_bind_suppressions_targets_expected_declaration_scope[class_inline_does_not_propagate]`
+- `tests/test_suppressions.py::test_bind_suppressions_targets_expected_declaration_scope[method_target]`
- `tests/test_suppressions.py::test_build_suppression_index_deduplicates_rules_stably`
-- `tests/test_extractor.py::test_dead_code_applies_inline_suppression_per_declaration`
-- `tests/test_extractor.py::test_dead_code_suppression_binding_is_scoped_to_target_symbol`
+- `tests/test_extractor.py::test_dead_code_respects_runtime_hooks_and_inline_suppressions[inline_suppression_per_declaration]`
+- `tests/test_extractor.py::test_dead_code_respects_runtime_hooks_and_inline_suppressions[suppression_binding_scoped_to_target]`
- `tests/test_metrics_modules.py::test_find_unused_applies_inline_dead_code_suppression`
- `tests/test_metrics_modules.py::test_find_suppressed_unused_returns_actionable_suppressed_candidates`
- `tests/test_report.py::test_report_json_dead_code_suppressed_items_are_reported_separately`
diff --git a/docs/book/20-mcp-interface.md b/docs/book/20-mcp-interface.md
index 2401070..a1775a6 100644
--- a/docs/book/20-mcp-interface.md
+++ b/docs/book/20-mcp-interface.md
@@ -4,348 +4,157 @@

## Purpose

Define the current public MCP surface in the `2.0` beta line.

-This interface is **optional** (installed via the `mcp` extra). It exposes
-the deterministic analysis pipeline as a **read-only MCP server** for AI agents
-and MCP-capable clients. It does not replace the CLI or the canonical report
-contract.
+The MCP layer is optional, read-only, and built on the same canonical
+pipeline/report contracts as the CLI. It does not create a second analysis
+engine or a second persistence model.
+
+!!! note "Read-only integration contract"
+    MCP surfaces the same canonical report and run state as the CLI and HTML
+    report. It must not mutate source, baseline, cache, or report artifacts.

## Public surface -- Package extra: `codeclone[mcp]` -- MCP launcher: `codeclone-mcp` -- MCP server: `codeclone/mcp_server.py` -- MCP service adapter: `codeclone/mcp_service.py` +- package extra: `codeclone[mcp]` +- launcher: `codeclone-mcp` +- server wiring: `codeclone/surfaces/mcp/server.py` +- in-process service/session: `codeclone/surfaces/mcp/service.py`, + `codeclone/surfaces/mcp/session.py` -## Data model +## Shape Current server characteristics: -- optional dependency; base `codeclone` install does not require `mcp` +- optional dependency; base `codeclone` install does not require MCP runtime - transports: - `stdio` - `streamable-http` - run storage: - in-memory only - - bounded history (`--history-limit`, default `4`, maximum `10`) - - latest-run pointer for `codeclone://latest/...` resources - - the `latest` pointer moves whenever a newer `analyze_*` call registers a run -- run identity: - - canonical run identity is derived from the canonical report integrity digest - - MCP payloads expose a short `run_id` handle (first 8 hex chars) - - MCP tools/resources accept both short and full run ids - - MCP finding ids are compact by default and may lengthen when needed to - stay unique within a run + - bounded by `--history-limit` + - latest-run pointer is process-local +- roots: + - analysis tools require an absolute repository root + - relative roots such as `.` are rejected - analysis modes: - `full` - `clones_only` -- process-count policy: - - `processes` is an optional override - - when omitted, MCP defers to the core CodeClone runtime -- initialize metadata: - - `serverInfo.version` reflects the CodeClone package version - - clients may use it for compatibility checks -- root contract: - - analysis tools require an absolute repository root - - relative roots such as `.` are rejected in MCP because server cwd may - differ from the client workspace - - granular `check_*` tools may omit `root` and use the latest compatible - stored run; if `root` is provided, it must also be absolute - cache policies: - `reuse` - `off` - `refresh` is rejected in MCP because the server is read-only. 
-- summary payload:
-  - `run_id`, `version`, `schema`, `mode`, compact `analysis_profile`
-  - `health_scope` explains what the health score covers
-  - `focus` explains the active summary/triage lens
-  - `baseline`, `metrics_baseline`, `cache`
-  - untrusted baseline comparisons stay compact but explicit through
-    `baseline.compared_without_valid_baseline`,
-    `baseline.baseline_python_tag`, and `baseline.runtime_python_tag`
-  - `cache.freshness` classifies summary cache reuse as `fresh`, `mixed`,
-    or `reused`
-  - flattened `inventory` (`files`, `lines`, `functions`, `classes`)
-  - flattened `findings` (`total`, `new`, `known`, `by_family`, `production`,
-    `new_by_source_kind`)
-  - flattened `diff` (`new_clones`, `health_delta`,
-    `typing_param_permille_delta`, `typing_return_permille_delta`,
-    `docstring_permille_delta`, `api_breaking_changes`, `new_api_symbols`)
-  - optional `coverage_join` when an analysis request included
-    `coverage_xml` (`status`, `overall_permille`, `coverage_hotspots`,
-    `scope_gap_hotspots`, `hotspot_threshold_percent`)
-  - `warnings`, `failures`
-  - `analyze_changed_paths` is intentionally more compact than `get_run_summary`:
-    it returns `changed_files`, compact `baseline`, `focus`, `health_scope`,
-    `health`, `health_delta`, `verdict`, `new_findings`,
-    `new_by_source_kind`, `resolved_findings`, and an empty
-    `changed_findings` placeholder, while
-    detailed changed payload stays in
-    `get_report_section(section="changed")`
-- workflow guidance:
-  - the MCP surface is intentionally agent-guiding rather than list-first
-  - the cheapest useful path is designed to be the most obvious path:
-    `get_run_summary` / `get_production_triage` first, then `list_hotspots`
-    or `check_*`, then `get_finding` / `get_remediation`
-  - `help(topic=...)` is a bounded semantic routing tool for contract/workflow
-    uncertainty; it is not a second manual or docs proxy
-- finding-list payloads:
-  - MCP finding ids are compact projection ids; canonical report ids are unchanged
-  - `detail_level="summary"` is the default for list/check/hotspot tools
-  - `detail_level="summary"` keeps compact relative `"path:line"` locations
-  - `detail_level="normal"` keeps structured `{path, line, end_line, symbol}`
-    locations plus remediation
-  - `detail_level="full"` keeps the compatibility-oriented payload,
-    including `priority_factors`, `items`, and per-location `uri`
-  - empty design `check_*` responses may include a compact
-    `threshold_context` (`metric`, `threshold`, `measured_units`,
-    `highest_below_threshold`) so agents can tell whether the run is truly
-    quiet or just below the active threshold
-
-The MCP layer does not introduce a separate analysis engine. It calls the
-current CodeClone pipeline and reuses the canonical report document already
-produced by the report contract.
+  - `refresh` is rejected by the read-only MCP service contract; use `reuse`
+    or `off`
+
+!!! warning "Absolute roots and remote exposure"
+    Analysis tools require an absolute repository root, and HTTP exposure
+    beyond loopback is intentionally explicit. Keep `stdio` as the default for
+    local IDE and agent clients.

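+A minimal launch sketch for local use (assuming the launcher and flags described
+in this chapter; check `codeclone-mcp --help` for the authoritative surface):
+
+```
+# default stdio transport, suitable for IDE/agent clients
+codeclone-mcp
+
+# keep a larger bounded in-memory run history for longer sessions
+codeclone-mcp --history-limit 8
+```
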
## Tools -Current tool set (`21` tools): - -| Tool | Key parameters | Purpose | -|--------------------------|----------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------| -| `analyze_repository` | absolute `root`, `analysis_mode`, thresholds, `api_surface`, `coverage_xml`, cache/baseline paths | Full analysis → compact summary; then `get_run_summary` or `get_production_triage` | -| `analyze_changed_paths` | absolute `root`, `changed_paths` or `git_diff_ref`, `analysis_mode`, `api_surface`, `coverage_xml` | Diff-aware analysis → compact changed-files snapshot | -| `get_run_summary` | `run_id` | Cheapest run snapshot: health, findings, baseline, inventory, active thresholds | -| `get_production_triage` | `run_id`, `max_hotspots`, `max_suggestions` | Production-first view: health, hotspots, suggestions, active thresholds | -| `help` | `topic`, `detail` | Semantic guide for workflow, analysis profile, baseline, coverage, suppressions, review state, changed-scope | -| `compare_runs` | `run_id_before`, `run_id_after`, `focus` | Run-to-run delta: regressions, improvements, health change | -| `evaluate_gates` | `run_id`, gate thresholds, `fail_on_untested_hotspots`, `coverage_min` | Preview CI gating decisions | -| `get_report_section` | `run_id`, `section`, `family`, `path`, `offset`, `limit` | Read report sections; `metrics_detail` is paginated with family/path filters | -| `list_findings` | `family`, `severity`, `novelty`, `sort_by`, `detail_level`, `changed_paths`, pagination | Filtered, paginated findings; use after hotspots or `check_*` | -| `get_finding` | `finding_id`, `run_id`, `detail_level` | Single finding detail by id; defaults to `normal` | -| `get_remediation` | `finding_id`, `run_id`, `detail_level` | Remediation payload for one finding | -| `list_hotspots` | `kind`, `run_id`, `detail_level`, `changed_paths`, `limit` | Priority-ranked hotspot views; preferred before broad listing | -| `check_clones` | `run_id`, `root`, `path`, `clone_type`, `source_kind`, `detail_level` | Clone findings only; `health.dimensions` includes only `clones` | -| `check_complexity` | `run_id`, `root`, `path`, `min_complexity`, `detail_level` | Complexity hotspots only | -| `check_coupling` | `run_id`, `root`, `path`, `detail_level` | Coupling hotspots only | -| `check_cohesion` | `run_id`, `root`, `path`, `detail_level` | Cohesion hotspots only | -| `check_dead_code` | `run_id`, `root`, `path`, `min_severity`, `detail_level` | Dead-code findings only | -| `generate_pr_summary` | `run_id`, `changed_paths`, `git_diff_ref`, `format` | PR-friendly markdown or JSON summary | -| `mark_finding_reviewed` | `finding_id`, `run_id`, `note` | Session-local review marker (in-memory) | -| `list_reviewed_findings` | `run_id` | List reviewed findings for a run | -| `clear_session_runs` | none | Reset in-memory runs and session state | - -All tools are read-only except `mark_finding_reviewed` and `clear_session_runs` -(session-local, in-memory). `check_*` tools query stored runs — call -`analyze_repository` or `analyze_changed_paths` first. - -Recommended workflow: - -1. `get_run_summary` or `get_production_triage` -2. `help(topic=...)` if contract meaning is unclear -3. `list_hotspots` or `check_*` -4. `get_finding` → `get_remediation` -5. 
`generate_pr_summary(format="markdown")` - -`metrics_detail` families currently include canonical health/quality families -plus `overloaded_modules`, `coverage_adoption`, `coverage_join`, and -`api_surface`. - -For analysis sensitivity, the intended model is: - -1. start with repo defaults or `pyproject`-resolved thresholds -2. lower thresholds only for an explicit higher-sensitivity exploratory pass -3. compare runs only when profile differences are understood +Current tool set: `21` tools. + +The MCP surface is intentionally triage-first: analyze first, summarize/triage +second, then drill into one finding or one hotspot family. + +### Analysis and run-level tools + +| Tool | Key parameters | Purpose | +|-------------------------|------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------| +| `analyze_repository` | `root`, `analysis_mode`, thresholds, `api_surface`, `coverage_xml`, `baseline_path`, `metrics_baseline_path`, `cache_policy` | Full deterministic analysis of one repo root; registers the latest in-memory run. | +| `analyze_changed_paths` | `root`, `changed_paths` or `git_diff_ref`, `analysis_mode`, thresholds, `api_surface`, `coverage_xml`, `cache_policy` | Diff-aware analysis with changed-files projection over the same canonical run/report contract. | +| `get_run_summary` | `run_id` | Cheapest run-level snapshot. Start here after analysis when you need health, findings, baseline/cache status, and inventory in compact form. | +| `get_production_triage` | `run_id`, `max_hotspots`, `max_suggestions` | Production-first first-pass view over one stored run. | +| `help` | `topic`, `detail` | Bounded workflow/contract guidance for supported MCP topics. | +| `compare_runs` | `run_id_before`, `run_id_after`, `focus` | Run-to-run delta view over findings and health; returns `incomparable` when roots/settings differ. | +| `evaluate_gates` | `run_id`, gate flags, threshold overrides, `coverage_min` | Preview CI/gating decisions against a stored run without mutating process or repo state. | + +### Report and finding projection tools + +| Tool | Key parameters | Purpose | +|-----------------------|------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------| +| `get_report_section` | `run_id`, `section`, `family`, `path`, `offset`, `limit` | Read canonical report sections; `metrics_detail` is the bounded/paginated drill-down path. | +| `list_findings` | `run_id`, `family`, `category`, `severity`, `source_kind`, `novelty`, `sort_by`, `detail_level`, changed-scope filters, pagination | Deterministic filtered finding list over canonical stored findings. | +| `get_finding` | `finding_id`, `run_id`, `detail_level` | Return one canonical finding group by short or full id. | +| `get_remediation` | `finding_id`, `run_id`, `detail_level` | Return the remediation/explainability packet for one finding. | +| `list_hotspots` | `kind`, `run_id`, `detail_level`, changed-scope filters, pagination | Return one derived hotspot list such as `most_actionable` or `production_hotspots`. 
| +| `generate_pr_summary` | `run_id`, `changed_paths`, `git_diff_ref`, `format` | PR-oriented summary for changed scope; `markdown` is the default human/LLM-facing format. | + +### Focused check tools + +| Tool | Key parameters | Purpose | +|--------------------|-------------------------------------------------------------------------------------------------|-------------------------------------------------------| +| `check_clones` | `run_id` or absolute `root`, `path`, `clone_type`, `source_kind`, `max_results`, `detail_level` | Narrow clone-only query over a compatible stored run. | +| `check_complexity` | `run_id` or absolute `root`, `path`, `min_complexity`, `max_results`, `detail_level` | Narrow complexity-hotspot query. | +| `check_coupling` | `run_id` or absolute `root`, `path`, `max_results`, `detail_level` | Narrow coupling-hotspot query. | +| `check_cohesion` | `run_id` or absolute `root`, `path`, `max_results`, `detail_level` | Narrow cohesion-hotspot query. | +| `check_dead_code` | `run_id` or absolute `root`, `path`, `min_severity`, `max_results`, `detail_level` | Narrow dead-code query. | + +### Session-local tools + +| Tool | Key parameters | Purpose | +|--------------------------|--------------------------------|-------------------------------------------------------------------------------------| +| `mark_finding_reviewed` | `finding_id`, `run_id`, `note` | Mark a finding as reviewed in the current in-memory MCP session. | +| `list_reviewed_findings` | `run_id` | Return reviewed markers currently held in process memory. | +| `clear_session_runs` | none | Clear in-memory run history and session-local review state for this server process. | ## Resources -Current fixed resources: - -| Resource | Payload | Availability | -|----------------------------------|-------------------------------------------------------|-------------------------------------------------------| -| `codeclone://latest/summary` | latest run summary projection | always after at least one run | -| `codeclone://latest/triage` | latest production-first triage projection | always after at least one run | -| `codeclone://latest/report.json` | latest canonical report document | always after at least one run | -| `codeclone://latest/health` | latest health score + dimensions | always after at least one run | -| `codeclone://latest/gates` | latest gate evaluation result | only after `evaluate_gates` in current server process | -| `codeclone://latest/changed` | latest changed-files projection | only for a diff-aware latest run | -| `codeclone://schema` | schema-style descriptor for canonical report sections | always available | - -Current run-scoped URI templates: - -| URI template | Payload | Availability | -|---------------------------------------------------|--------------------------------------|-----------------------------------------| -| `codeclone://runs/{run_id}/summary` | run-specific summary projection | for any stored run | -| `codeclone://runs/{run_id}/report.json` | run-specific canonical report | for any stored run | -| `codeclone://runs/{run_id}/findings/{finding_id}` | run-specific canonical finding group | for an existing finding in a stored run | - -Fixed resources and URI templates are convenience views over already -registered runs. They do not trigger fresh analysis by themselves. -If a client needs the freshest truth, it must start a fresh analysis run first -(typically with `cache_policy="off"`), rather than relying on older session -state behind `codeclone://latest/...`. 
- -## Contracts - -- MCP is **read-only**: - - no source-file mutation - - no baseline update - - no metrics-baseline update - - no cache refresh writes -- Session review markers are **ephemeral only**: - - stored in memory per server process - - never written to baseline, cache, or report artifacts -- `streamable-http` defaults to loopback binding. - Non-loopback hosts require explicit `--allow-remote` because the server has - no built-in authentication. -- `--allow-remote` expands the trust boundary materially: - - any reachable network client can trigger CPU-intensive analysis - - any reachable network client can read analysis results - - request parameters such as `root` and path filters can still probe - repository-relative filesystem structure - - use it only on trusted networks or behind a firewall / authenticated - reverse proxy -- MCP must reuse current: - - pipeline stages - - baseline trust semantics - - cache semantics - - canonical report contract -- `coverage_xml` is resolved relative to the absolute root when it is not - already absolute. It is a current-run Cobertura input only; MCP must never - write it to baseline/cache/report artifacts or treat it as baseline truth. -- When `respect_pyproject=true`, MCP also respects `golden_fixture_paths`. - Clone groups excluded by that policy are omitted from active clone/gate - projections but remain available in the canonical report under the optional - `findings.groups.clones.suppressed.*` bucket. -- Invalid Cobertura XML during `analyze_*` does not fail analysis; the stored - run carries `coverage_join.status="invalid"` plus `invalid_reason`. - `evaluate_gates(fail_on_untested_hotspots=true)` on that run is a contract - error because hotspot gating requires a valid join. -- Inline MCP design-threshold parameters (`complexity_threshold`, - `coupling_threshold`, `cohesion_threshold`) define the canonical design - finding universe of that run and are recorded in - `meta.analysis_thresholds.design_findings`. -- `get_run_summary` is a deterministic convenience projection derived from the - canonical report (`meta`, `inventory`, `findings.summary`, - `metrics.summary.health`) plus baseline-diff/gate/changed-files context. -- `get_production_triage` is also a deterministic MCP projection over the same - canonical run state (`summary`, `derived.hotlists`, `derived.suggestions`, - and canonical finding source scope). It must not create a second analysis or - remediation truth path. -- Canonical JSON remains the source of truth for report semantics. -- `list_findings` and `list_hotspots` are deterministic projections over the - canonical report, not a separate analysis branch. -- `metrics_detail(family="overloaded_modules")` exposes the canonical report-only - module-hotspot layer, but does not promote it into findings, hotlists, or - gate semantics. -- `metrics_detail(family="coverage_join")` exposes the canonical current-run - coverage join summary/items, including measured coverage hotspots and - coverage scope gaps. `evaluate_gates(fail_on_untested_hotspots=true)` - requires a stored run created with valid `coverage_xml`. -- `get_remediation` is a deterministic MCP projection over existing - suggestions/explainability data, not a second remediation engine. -- `analysis_mode="clones_only"` must mirror the same metric/dependency - skip-semantics as the regular pipeline. -- Missing optional MCP dependency is handled explicitly by the launcher with a - user-facing install hint and exit code `2`. 
- -## Invariants (MUST) - -- Tool names are stable public surface. -- Resource URI shapes are stable public surface. -- Read-only vs session-local tool annotations remain accurate. -- `analyze_repository` always registers exactly one latest run. -- `analyze_changed_paths` requires `changed_paths` or `git_diff_ref`. -- `analyze_repository` and `analyze_changed_paths` require an absolute `root`; - relative roots like `.` are rejected. -- `git_diff_ref` is validated as a safe single revision expression before - invoking `git diff`. -- `changed_paths` is a structured `list[str]` of repo-relative paths, not a - comma-separated string payload. -- `analyze_changed_paths` may return the same `run_id` as a previous run when - the canonical report digest is unchanged; changed-files state is an overlay, - not a second canonical report. -- `get_run_summary` with no `run_id` resolves to the latest stored run. -- `codeclone://latest/...` resources always resolve to the latest stored run in - the current MCP server process, not to a globally fresh analysis state. -- Summary-style MCP payloads expose `cache.freshness` as a derived convenience - marker; canonical cache metadata remains available only through canonical - report/meta surfaces. -- `get_report_section(section="all")` returns the full canonical report document. -- `get_report_section(section="metrics")` returns only `metrics.summary`. -- `get_report_section(section="metrics_detail")` is intentionally bounded: - without filters it returns `summary` plus a hint; with `family` and/or `path` - it returns a paginated item slice. -- `get_report_section(section="changed")` is available only for diff-aware runs. -- MCP short `run_id` values are session handles over the canonical digest of - that run. -- MCP summary/normal finding/location payloads use relative paths only and do - not expose absolute `file://` URIs. -- Finding `locations` and `html_anchor` values are stable projections over the - current run and do not invent non-canonical ids. -- For the same finding id, `source_kind` remains consistent across - `list_findings`, `list_hotspots`, and `get_finding`. -- `get_finding(detail_level="full")` remains the compatibility-preserving - full-detail endpoint: `priority_factors` and location `uri` are still - available there. -- `compare_runs` is only semantically meaningful when both runs use comparable - repository scope/root and analysis settings. -- `compare_runs` exposes top-level `comparable` plus optional `reason`. When - roots or effective analysis settings differ, `regressions` and - `improvements` become empty lists, `unchanged` and `health_delta` become - `null`, and `verdict` becomes `incomparable`. -- `compare_runs.health_delta` is `after.health - before.health` between the two - selected comparable runs. It is independent of baseline or metrics-baseline - drift. -- `compare_runs.verdict` is intentionally conservative but not one-dimensional: - it returns `mixed` when run-to-run finding deltas and `health_delta` disagree. -- `analysis_mode="clones_only"` keeps clone findings fully usable, but MCP - surfaces mark `health` as unavailable instead of fabricating zeroed metrics. -- `coverage_xml` requires `analysis_mode="full"` because coverage join depends - on function-span metrics. -- `codeclone://latest/triage` is a latest-only resource; run-specific triage is - available via the tool, not via a `codeclone://runs/{run_id}/...` resource URI. 
- -## Failure modes - -| Condition | Behavior | -|---------------------------------------------------|---------------------------------------------------| -| `mcp` extra not installed | `codeclone-mcp` prints install hint and exits `2` | -| Invalid root path / invalid numeric config | service raises contract error | -| `coverage_xml` with `analysis_mode="clones_only"` | service raises contract error | -| Coverage hotspot gate without valid coverage join | service raises contract error | -| Requested run missing | service raises run-not-found error | -| Requested finding missing | service raises finding-not-found error | -| Unsupported report section/resource suffix | service raises contract error | - -## Determinism / canonicalization - -- MCP run identity is derived from canonical report integrity digest. -- Finding order is inherited from canonical report ordering. -- Hotlists are derived from canonical report data and deterministic derived ids. -- No MCP-only heuristics may change analysis or gating semantics. -- MCP must not re-synthesize design findings from raw metrics after the run; - threshold-aware design findings belong to the canonical report document. -- Coverage join ordering and hotspot gates are inherited from canonical - `metrics.families.coverage_join` facts. +Resources are deterministic read-only projections over stored runs. + +| URI | Purpose | +|---------------------------------------------------|-------------------------------------------------------------| +| `codeclone://latest/summary` | Compact summary for the latest stored run. | +| `codeclone://latest/report.json` | Canonical JSON report for the latest stored run. | +| `codeclone://latest/health` | Health/metrics snapshot for the latest stored run. | +| `codeclone://latest/gates` | Last gate-evaluation result produced in this MCP session. | +| `codeclone://latest/changed` | Changed-files projection for the latest diff-aware run. | +| `codeclone://latest/triage` | Production-first triage payload for the latest stored run. | +| `codeclone://schema` | Canonical report schema-style descriptor. | +| `codeclone://runs/{run_id}/summary` | Compact summary for one specific stored run. | +| `codeclone://runs/{run_id}/report.json` | Canonical JSON report for one specific stored run. | +| `codeclone://runs/{run_id}/findings/{finding_id}` | Canonical JSON finding payload for one specific stored run. | + +## Contract rules + +- MCP is read-only with respect to source files, baselines, cache artifacts, + and report artifacts. +- MCP reuses the same canonical report document as CLI/JSON/HTML/SARIF. +- Finding ids, ordering, and summary data are deterministic projections over + the stored run. +- `analyze_changed_paths` requires either explicit `changed_paths` or + `git_diff_ref`. +- `analyze_repository` and `analyze_changed_paths` require an absolute `root`. +- `check_*` tools may resolve against an existing stored run, but if `root` is + provided it must also be absolute. +- `git_diff_ref` is validated before any subprocess call. +- Review markers are session-local in-memory state only. +- Run history is process-local and does not survive restart. +- Missing optional MCP dependency is surfaced explicitly by the launcher. +- `metrics_detail(family="security_surfaces")` exposes a compact, report-only + inventory of exact security-relevant capability surfaces. It does not claim + vulnerabilities or exploitability. 
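+
+A minimal client-side sketch of the input rules above (the helper name and the
+exact revision pattern are illustrative, not part of the CodeClone API):
+
+```python
+import re
+from pathlib import Path
+
+# Assumed conservative single-revision shape; the real validator may differ.
+_SAFE_REVISION = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_./^~-]*$")
+
+
+def validate_changed_paths_request(
+    root: str,
+    changed_paths: list[str] | None = None,
+    git_diff_ref: str | None = None,
+) -> None:
+    """Fail fast on inputs the contract rules above would reject."""
+    if not Path(root).is_absolute():
+        raise ValueError("root must be absolute; relative roots like '.' are rejected")
+    if changed_paths is None and git_diff_ref is None:
+        raise ValueError("analyze_changed_paths needs changed_paths or git_diff_ref")
+    if changed_paths is not None and not all(
+        isinstance(path, str) for path in changed_paths
+    ):
+        raise TypeError("changed_paths is a structured list[str], not one comma-separated string")
+    if git_diff_ref is not None and not _SAFE_REVISION.match(git_diff_ref):
+        raise ValueError("git_diff_ref must be a single safe revision expression")
+
+
+# Example with a POSIX-style absolute root and a structured path list.
+validate_changed_paths_request("/abs/path/to/repo", changed_paths=["pkg/mod.py"])
+```
+
+The server enforces the real rules; a client-side guard like this only fails
+fast before a request is sent.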
+ +## Security model + +- default transport is local `stdio` +- non-local HTTP exposure requires explicit `--allow-remote` +- server runtime is loaded lazily so base installs and normal CI do not require + MCP packages +- MCP must not mutate repo state or synthesize findings outside canonical + report facts + +## Determinism + +- run identity is derived from canonical report integrity +- summary, hotspots, findings, and remediation payloads are deterministic + projections over stored run state +- MCP must not create MCP-only analysis semantics or MCP-only gate semantics ## Locked by tests -- `tests/test_mcp_service.py::test_mcp_service_analyze_repository_registers_latest_run` -- `tests/test_mcp_service.py::test_mcp_service_lists_findings_and_hotspots` -- `tests/test_mcp_service.py::test_mcp_service_changed_runs_remediation_and_review_flow` -- `tests/test_mcp_service.py::test_mcp_service_granular_checks_pr_summary_and_resources` -- `tests/test_mcp_service.py::test_mcp_service_evaluate_gates_on_existing_run` -- `tests/test_mcp_service.py::test_mcp_service_resources_expose_latest_summary_and_report` -- `tests/test_mcp_server.py::test_mcp_server_exposes_expected_read_only_tools` -- `tests/test_mcp_server.py::test_mcp_server_tool_roundtrip_and_resources` -- `tests/test_mcp_server.py::test_mcp_server_main_reports_missing_optional_dependency` - -## Non-guarantees - -- There is currently no standalone `mcp_api_version` constant. -- In-memory run history does not survive process restart. -- `clear_session_runs` resets the in-memory run registry and related session - caches, but does not mutate baseline/cache/report artifacts on disk. -- Client-specific UI/approval behavior is not part of the CodeClone contract. +- `tests/test_mcp_service.py` +- `tests/test_mcp_server.py` +- `tests/test_mcp_tool_schema_snapshot.py` ## See also diff --git a/docs/book/21-vscode-extension.md b/docs/book/21-vscode-extension.md index f9b6c0a..14b2c5c 100644 --- a/docs/book/21-vscode-extension.md +++ b/docs/book/21-vscode-extension.md @@ -10,6 +10,11 @@ CodeClone contracts. It does not define a second analysis truth model. Marketplace: [orenlab.codeclone](https://marketplace.visualstudio.com/items?itemName=orenlab.codeclone) +!!! note "No second truth path" + The extension is a guided IDE client over `codeclone-mcp`. It may reshape + review UX, but it must not recompute findings, health, or report truth + independently from MCP and canonical report semantics. 
+ ## Position in the platform The VS Code extension is: @@ -52,7 +57,8 @@ It also provides: - command palette entry points for analysis and review - one onboarding walkthrough - markdown detail panels for findings, remediation, help topics, setup help, - restricted-mode guidance, and report-only Overloaded Module detail + restricted-mode guidance, and report-only detail for `Security Surfaces` and + `Overloaded Modules` - lightweight Explorer file decorations for review-relevant files - editor-local CodeLens and title actions for the active review target @@ -79,9 +85,11 @@ The extension currently supports: - conservative default analysis with an explicit deeper-review or custom-threshold follow-up profile - compact overview of structural health, current run state, baseline drift, and - current-run `Coverage Join` facts when MCP exposes `metrics.coverage_join` + current-run `Coverage Join` facts when MCP exposes `metrics.coverage_join`, + plus report-only `Security Surfaces` when MCP exposes + `metrics.security_surfaces` - review queues for new regressions, production hotspots, changed-scope - findings, and report-only `Overloaded Modules` + findings, and report-only `Security Surfaces` / `Overloaded Modules` - source reveal, peek, canonical finding detail, remediation detail, and session-local reviewed markers - bounded MCP help topics inside the IDE, with the optional `coverage` topic on @@ -117,6 +125,11 @@ Reviewed markers: ## Trust and runtime model +!!! warning "Workspace trust still matters" + The extension is intentionally limited in Restricted Mode. Local analysis, + local git access, and local MCP startup remain disabled until the workspace + is trusted. + The extension runs as a workspace extension and requires: - local filesystem access @@ -148,7 +161,8 @@ For this reason: - **Source-first**: findings prefer `Reveal Source` over detail panels; canonical detail and HTML report bridge are opt-in. - **Report-only separation**: Overloaded Modules stay visually distinct from - findings, gates, and health. + findings, gates, and health. `Security Surfaces` stay visually distinct too + and remain boundary inventory rather than vulnerability claims. - **Safe HTML bridge**: `Open in HTML Report` verifies the local file exists and is not older than the current run. - **Session-local state**: reviewed markers shape review UX but never leak diff --git a/docs/book/22-claude-desktop-bundle.md b/docs/book/22-claude-desktop-bundle.md index 652d276..db1d176 100644 --- a/docs/book/22-claude-desktop-bundle.md +++ b/docs/book/22-claude-desktop-bundle.md @@ -9,6 +9,11 @@ This chapter describes the bundle as a local install and launcher layer over existing CodeClone MCP contracts. It does not define a second analysis truth model. +!!! note "Wrapper only" + The Claude Desktop bundle is a thin local launcher surface over + `codeclone-mcp`. Analysis truth, findings, and health semantics remain in + the canonical MCP server. + ## Position in the platform The Claude Desktop bundle is: diff --git a/docs/book/23-codex-plugin.md b/docs/book/23-codex-plugin.md index e9fce9a..082f634 100644 --- a/docs/book/23-codex-plugin.md +++ b/docs/book/23-codex-plugin.md @@ -8,6 +8,11 @@ Document the current contract and behavior of the Codex plugin shipped in This chapter describes the plugin as a local Codex discovery and guidance layer over existing CodeClone MCP contracts. +!!! note "Guidance layer only" + The plugin contributes discovery metadata, a local MCP definition, and + review skills. 
It does not add a second analyzer or Codex-only finding + semantics. + ## Position in the platform The Codex plugin is: @@ -40,6 +45,7 @@ The plugin currently provides: - `.codex-plugin/plugin.json` - `.mcp.json` +- `scripts/launch_mcp` - `README.md` - two bundled skills: - `codeclone-review` @@ -51,6 +57,7 @@ The plugin currently provides: The plugin surface is additive: - `.mcp.json` contributes a local stdio MCP server definition +- `scripts/launch_mcp.py` resolves the local launcher without shell wrapping - that launcher prefers a workspace `.venv`, then a Poetry env, then `PATH` - the skills contribute workflow guidance and starter prompts - `README.md` documents local usage and boundaries inside the repository tree @@ -73,6 +80,8 @@ The plugin does not rewrite user config or install CodeClone automatically. - **Launcher honesty**: the plugin assumes `codeclone-mcp` is already installable in the current workspace or reachable on `PATH`, and prefers the workspace environment when one is present. +- **Shell-free launch**: the bundled launcher must stay argv-based and + local-stdio-only. ## Relationship to other interfaces diff --git a/docs/book/README.md b/docs/book/README.md index e1027ab..7ed5a71 100644 --- a/docs/book/README.md +++ b/docs/book/README.md @@ -5,6 +5,10 @@ This book is the contract-level documentation for CodeClone v2.x. All guarantees here are derived from code and locked tests. If a statement is not enforced by code/tests, it is explicitly marked as non-contractual. +!!! note "Contract rule" + If this book and the current repository code diverge, code and locked tests + win. Update the book after correcting the implementation or contract test. + ## How to read - Start with **Intro → Architecture map → Terminology**. diff --git a/docs/book/appendix/a-status-enums.md b/docs/book/appendix/a-status-enums.md index 507396e..6ce4d6f 100644 --- a/docs/book/appendix/a-status-enums.md +++ b/docs/book/appendix/a-status-enums.md @@ -2,13 +2,13 @@ ## Purpose -Centralize machine-readable status value sets used across baseline/cache/report contracts. +Centralize machine-readable status sets used across baseline/cache/report/CLI contracts. ## Public surface -- Baseline statuses: `codeclone/baseline.py:BaselineStatus` -- Cache statuses: `codeclone/cache.py:CacheStatus` -- Exit categories: `codeclone/contracts.py:ExitCode` +- Baseline statuses: `codeclone/baseline/trust.py:BaselineStatus` +- Cache statuses: `codeclone/cache/versioning.py:CacheStatus` +- Exit categories: `codeclone/contracts/__init__.py:ExitCode` ## Data model @@ -54,13 +54,13 @@ Defined by `BASELINE_UNTRUSTED_STATUSES`. ## Contracts -- Status values are serialized into report metadata (`baseline_status`, `cache_status`). -- CLI branches by enum values, not UI text. +- Status values are serialized into report metadata. +- CLI branches by enum/status values, not by human-facing message text. Refs: -- `codeclone/_cli_meta.py:ReportMeta` -- `codeclone/cli.py:_main_impl` +- `codeclone/surfaces/cli/report_meta.py:_build_report_meta` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Locked by tests diff --git a/docs/book/appendix/b-schema-layouts.md b/docs/book/appendix/b-schema-layouts.md index 8d0a25e..7595832 100644 --- a/docs/book/appendix/b-schema-layouts.md +++ b/docs/book/appendix/b-schema-layouts.md @@ -2,17 +2,17 @@ ## Purpose -Compact structural layouts for baseline/cache/report contracts in `2.0.0b5`. +Compact structural layouts for baseline/cache/report contracts in `2.0.0b6`. 
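+
+These layouts share integrity fields such as `payload_sha256`. A rough
+consumer-side sketch of the recompute-and-compare pattern, assuming compact,
+key-sorted JSON as the canonical byte form (the real canonicalization is owned
+by the baseline/cache modules and may differ):
+
+```python
+import hashlib
+import json
+
+
+def payload_digest(payload: dict) -> str:
+    # Assumption: compact, key-sorted JSON is the canonical byte form.
+    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
+    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
+
+
+def looks_intact(meta: dict, payload: dict) -> bool:
+    # A mismatch maps to the `integrity_failed` status in appendices A and C.
+    return meta.get("payload_sha256") == payload_digest(payload)
+```
+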
## Baseline schema (`2.1`) ```json { "meta": { - "generator": { "name": "codeclone", "version": "2.0.0b5" }, + "generator": { "name": "codeclone", "version": "2.0.0b6" }, "schema_version": "2.1", "fingerprint_version": "1", - "python_tag": "cp313", + "python_tag": "cp314", "created_at": "2026-03-11T00:00:00Z", "payload_sha256": "...", "metrics_payload_sha256": "...", @@ -60,9 +60,9 @@ Notes: ```json { "meta": { - "generator": { "name": "codeclone", "version": "2.0.0b5" }, + "generator": { "name": "codeclone", "version": "2.0.0b6" }, "schema_version": "1.2", - "python_tag": "cp313", + "python_tag": "cp314", "created_at": "2026-03-11T00:00:00Z", "payload_sha256": "...", "api_surface_payload_sha256": "..." @@ -91,13 +91,13 @@ Notes: } ``` -## Cache schema (`2.5`) +## Cache schema (`2.6`) ```json { - "v": "2.5", + "v": "2.6", "payload": { - "py": "cp313", + "py": "cp314", "fp": "1", "ap": { "min_loc": 10, @@ -109,7 +109,7 @@ Notes: "collect_api_surface": false }, "files": { - "codeclone/cache.py": { + "codeclone/cache/store.py": { "st": [1730000000000000000, 2048], "ss": [450, 12, 3, 1], "u": [[ @@ -126,6 +126,7 @@ Notes: "rq": ["pkg.dep:used_name"], "in": ["pkg.dep"], "cn": ["ClassName"], + "sc": [["process_boundary", "subprocess_run", "pkg.runner", "pkg.runner:run", 10, 10, "callable", "exact_call", "call", "subprocess.run"]], "sf": [["duplicated_branches", "key", [["stmt_seq", "Expr,Return"]], [["pkg.a:f", 10, 12]]]] } } @@ -146,13 +147,13 @@ Notes: - `u` row decoder accepts both legacy 11-column rows and canonical 17-column rows (legacy rows map new structural fields to neutral defaults). -## Report schema (`2.8`) +## Report schema (`2.10`) ```json { - "report_schema_version": "2.8", + "report_schema_version": "2.10", "meta": { - "codeclone_version": "2.0.0b5", + "codeclone_version": "2.0.0b6", "project_name": "codeclone", "scan_root": ".", "analysis_mode": "full", @@ -295,6 +296,17 @@ Notes: "added": 0, "breaking": 0, "strict_types": false + }, + "security_surfaces": { + "items": 0, + "modules": 0, + "exact_items": 0, + "category_count": 0, + "production": 0, + "tests": 0, + "fixtures": 0, + "other": 0, + "report_only": true } }, "families": { @@ -380,6 +392,27 @@ Notes: }, "items": [] }, + "security_surfaces": { + "summary": { + "items": 0, + "modules": 0, + "exact_items": 0, + "category_count": 0, + "categories": {}, + "by_source_kind": { + "production": 0, + "tests": 0, + "fixtures": 0, + "other": 0 + }, + "production": 0, + "tests": 0, + "fixtures": 0, + "other": 0, + "report_only": true + }, + "items": [] + }, "health": {} } }, @@ -439,7 +472,7 @@ Notes: ```text # CodeClone Report - Markdown schema: 1.0 -- Source report schema: 2.8 +- Source report schema: 2.10 ... 
## Overview ## Inventory @@ -470,7 +503,7 @@ Notes: "tool": { "driver": { "name": "codeclone", - "version": "2.0.0b5", + "version": "2.0.0b6", "rules": [ { "id": "CCLONE001", @@ -509,7 +542,7 @@ Notes: "artifacts": [ { "location": { - "uri": "codeclone/report/sarif.py", + "uri": "codeclone/report/renderers/sarif.py", "uriBaseId": "%SRCROOT%" } } @@ -525,7 +558,7 @@ Notes: ], "properties": { "profileVersion": "1.0", - "reportSchemaVersion": "2.8" + "reportSchemaVersion": "2.10" }, "results": [ { @@ -540,7 +573,7 @@ Notes: { "physicalLocation": { "artifactLocation": { - "uri": "codeclone/report/sarif.py", + "uri": "codeclone/report/renderers/sarif.py", "uriBaseId": "%SRCROOT%", "index": 0 }, @@ -560,7 +593,7 @@ Notes: } ], "properties": { - "primaryPath": "codeclone/report/sarif.py", + "primaryPath": "codeclone/report/renderers/sarif.py", "primaryQualname": "codeclone.report.sarif:render_sarif_report_document", "primaryRegion": "1:10" }, @@ -598,9 +631,9 @@ INTEGRITY ## Refs -- `codeclone/baseline.py` -- `codeclone/cache.py` -- `codeclone/report/json_contract.py` -- `codeclone/report/serialize.py` -- `codeclone/report/markdown.py` -- `codeclone/report/sarif.py` +- `codeclone/baseline/clone_baseline.py` +- `codeclone/cache/store.py` +- `codeclone/report/document/builder.py` +- `codeclone/report/renderers/text.py` +- `codeclone/report/renderers/markdown.py` +- `codeclone/report/renderers/sarif.py` diff --git a/docs/book/appendix/c-error-catalog.md b/docs/book/appendix/c-error-catalog.md index 78e6389..4953c6d 100644 --- a/docs/book/appendix/c-error-catalog.md +++ b/docs/book/appendix/c-error-catalog.md @@ -8,82 +8,83 @@ Map core error conditions to statuses, markers, and exits. | Category | Marker | Exit | |----------------|-------------------|------| -| Contract error | `CONTRACT ERROR:` | 2 | -| Gating failure | `GATING FAILURE:` | 3 | -| Internal error | `INTERNAL ERROR:` | 5 | +| Contract error | `CONTRACT ERROR:` | `2` | +| Gating failure | `GATING FAILURE:` | `3` | +| Internal error | `INTERNAL ERROR:` | `5` | Refs: -- `codeclone/ui_messages.py:MARKER_CONTRACT_ERROR` -- `codeclone/contracts.py:ExitCode` +- `codeclone/ui_messages/__init__.py:MARKER_CONTRACT_ERROR` +- `codeclone/contracts/__init__.py:ExitCode` ## Baseline contract errors -| Condition | Baseline status | CLI behavior | -|----------------------|--------------------------------|-----------------------------------------| -| Missing baseline | `missing` | normal: empty diff; gating: exit 2 | -| Schema mismatch | `mismatch_schema_version` | normal: ignore baseline; gating: exit 2 | -| Fingerprint mismatch | `mismatch_fingerprint_version` | normal: ignore baseline; gating: exit 2 | -| Python tag mismatch | `mismatch_python_version` | normal: ignore baseline; gating: exit 2 | -| Integrity mismatch | `integrity_failed` | normal: ignore baseline; gating: exit 2 | +| Condition | Baseline status | CLI behavior | +|----------------------|--------------------------------|-------------------------------------------| +| Missing baseline | `missing` | normal: empty diff; gating: exit `2` | +| Schema mismatch | `mismatch_schema_version` | normal: ignore baseline; gating: exit `2` | +| Fingerprint mismatch | `mismatch_fingerprint_version` | normal: ignore baseline; gating: exit `2` | +| Python tag mismatch | `mismatch_python_version` | normal: ignore baseline; gating: exit `2` | +| Integrity mismatch | `integrity_failed` | normal: ignore baseline; gating: exit `2` | Refs: -- `codeclone/cli.py:_main_impl` -- 
`codeclone/baseline.py:BaselineStatus` +- `codeclone/surfaces/cli/workflow.py:_main_impl` +- `codeclone/baseline/trust.py:BaselineStatus` ## Cache degradation cases -| Condition | Cache status | Behavior | -|--------------------------------------------------|---------------------------------|------------------------| -| Missing cache file | `missing` | proceed without cache | -| Version mismatch | `version_mismatch` | ignore cache + warning | -| Analysis profile mismatch (`min_loc`/`min_stmt`) | `analysis_profile_mismatch` | ignore cache + warning | -| Invalid JSON/type | `invalid_json` / `invalid_type` | ignore cache + warning | -| Signature mismatch | `integrity_failed` | ignore cache + warning | -| Oversized cache | `too_large` | ignore cache + warning | +| Condition | Cache status | Behavior | +|---------------------------|---------------------------------|------------------------| +| Missing cache file | `missing` | proceed without cache | +| Version mismatch | `version_mismatch` | ignore cache + warning | +| Analysis profile mismatch | `analysis_profile_mismatch` | ignore cache + warning | +| Invalid JSON/type | `invalid_json` / `invalid_type` | ignore cache + warning | +| Signature mismatch | `integrity_failed` | ignore cache + warning | +| Oversized cache | `too_large` | ignore cache + warning | Refs: -- `codeclone/cache.py:CacheStatus` -- `codeclone/cache.py:Cache._ignore_cache` +- `codeclone/cache/versioning.py:CacheStatus` +- `codeclone/cache/store.py:Cache._ignore_cache` ## Source IO and gating | Condition | Behavior | |-------------------------------------------|---------------------------------| | Source read/decode failure in normal mode | file skipped; warning; continue | -| Source read/decode failure in gating mode | contract error exit 2 | +| Source read/decode failure in gating mode | contract error, exit `2` | Refs: -- `codeclone/cli.py:process_file` -- `codeclone/cli.py:_main_impl` +- `codeclone/core/worker.py:process_file` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## Report write errors -| Condition | Behavior | -|--------------------------------------------|-----------------------| -| Baseline write OSError | contract error exit 2 | -| HTML/JSON/Markdown/SARIF/TXT write OSError | contract error exit 2 | +| Condition | Behavior | +|--------------------------------------------|--------------------------| +| Baseline write OSError | contract error, exit `2` | +| HTML/JSON/Markdown/SARIF/TXT write OSError | contract error, exit `2` | Refs: -- `codeclone/cli.py:_main_impl` +- `codeclone/surfaces/cli/reports_output.py:_write_report_output` +- `codeclone/surfaces/cli/workflow.py:_main_impl` ## MCP interface errors | Condition | Behavior | |-----------------------------------------------|---------------------------------------------------| | Optional `mcp` extra missing | `codeclone-mcp` prints install hint and exits `2` | -| Invalid root path / invalid numeric config | MCP service contract error | +| Invalid root path / invalid config | MCP service contract error | | Missing run or finding id | MCP service request error | | Unsupported MCP resource URI / report section | MCP service contract error | Refs: -- `codeclone/mcp_server.py:main` -- `codeclone/mcp_service.py` +- `codeclone/surfaces/mcp/server.py:main` +- `codeclone/surfaces/mcp/service.py` ## Locked by tests diff --git a/docs/cfg.md b/docs/cfg.md index 4cb572a..b561e3c 100644 --- a/docs/cfg.md +++ b/docs/cfg.md @@ -186,7 +186,7 @@ This is critical for CI usage and baseline comparison. 
## Python Tag Consistency for Baseline Checks Due to AST differences between interpreter versions, baseline compatibility is pinned to -the same `python_tag` (for example `cp313`), not full patch version equality. +the same `python_tag` (for example `cp314`), not full patch version equality. This keeps clone detection deterministic while allowing patch updates within the same tag. diff --git a/docs/claude-desktop-bundle.md b/docs/claude-desktop-bundle.md index 5e87168..c742616 100644 --- a/docs/claude-desktop-bundle.md +++ b/docs/claude-desktop-bundle.md @@ -23,14 +23,14 @@ The bundle prefers the current workspace launcher first: ```bash uv venv -uv pip install --python .venv/bin/python "codeclone[mcp]>=2.0.0b5" +uv pip install --python .venv/bin/python --pre "codeclone[mcp]" .venv/bin/codeclone-mcp --help ``` Global fallback: ```bash -uv tool install "codeclone[mcp]>=2.0.0b5" +uv tool install --pre "codeclone[mcp]" codeclone-mcp --help ``` diff --git a/docs/codex-plugin.md b/docs/codex-plugin.md index 19f8f54..6f6d5e8 100644 --- a/docs/codex-plugin.md +++ b/docs/codex-plugin.md @@ -9,6 +9,7 @@ Repo-local discovery via `.agents/plugins/marketplace.json`. |------------------------------|----------------------------------------------------| | `.codex-plugin/plugin.json` | Plugin metadata, prompts, instructions | | `.mcp.json` | Workspace-first MCP launcher definition | +| `scripts/launch_mcp` | Shell-free launcher wrapper for Codex | | `skills/codeclone-review/` | Conservative-first full review skill | | `skills/codeclone-hotspots/` | Quick hotspot discovery skill | | `assets/` | Plugin branding | @@ -17,14 +18,14 @@ Repo-local discovery via `.agents/plugins/marketplace.json`. ```bash uv venv -uv pip install --python .venv/bin/python "codeclone[mcp]>=2.0.0b5" +uv pip install --python .venv/bin/python --pre "codeclone[mcp]" .venv/bin/codeclone-mcp --help ``` Global fallback: ```bash -uv tool install "codeclone[mcp]>=2.0.0b5" +uv tool install --pre "codeclone[mcp]" codeclone-mcp --help ``` @@ -47,6 +48,7 @@ not mutate `~/.codex/config.toml` or install a second server binary. - if you already registered `codeclone-mcp` manually, keep only one setup path to avoid duplicate MCP surfaces - the bundled `.mcp.json` prefers `.venv`, then a Poetry env, then `PATH` +- the bundled launcher stays shell-free and local-stdio-only For the underlying interface contract, see: diff --git a/docs/mcp.md b/docs/mcp.md index 2031949..06a76b8 100644 --- a/docs/mcp.md +++ b/docs/mcp.md @@ -7,13 +7,24 @@ state is mutable in memory only. Works with any MCP-capable client regardless of backend model. +!!! note "Read-only by contract" + MCP is an integration surface over the same canonical pipeline and report + contracts as the CLI. It does not create a second analysis engine or write + back to repository state. + ## Install -```bash -uv tool install --pre "codeclone[mcp]" # install as a standalone tool -# or, inside an existing environment -uv pip install --pre "codeclone[mcp]" # add the MCP extra to that environment -``` +=== "Standalone tool" + + ```bash title="Install the MCP launcher as a standalone tool" + uv tool install --pre "codeclone[mcp]" + ``` + +=== "Existing environment" + + ```bash title="Install the MCP extra into the current environment" + uv pip install --pre "codeclone[mcp]" + ``` ## Quick client setup @@ -56,7 +67,7 @@ See [Claude Desktop bundle guide](claude-desktop-bundle.md). 
**Local agents** (Claude Code, Codex, Copilot Chat, Gemini CLI): -```bash +```bash title="Start a local stdio MCP server" codeclone-mcp --transport stdio ``` @@ -65,23 +76,34 @@ MCP analysis tools require an absolute repository root. Relative roots such as the client workspace. The same absolute-path rule applies to `check_*` tools when a `root` filter is provided. +!!! note "Absolute roots are required" + MCP tool requests must pass an absolute repository root. This keeps runs + deterministic across clients whose working directories may differ from the + visible workspace path. + **Remote / HTTP-only clients:** -```bash +```bash title="Start the optional HTTP transport locally" codeclone-mcp --transport streamable-http --host 127.0.0.1 --port 8000 ``` +!!! warning "Remote exposure is opt-in" + Non-loopback hosts require `--allow-remote`, and the built-in HTTP server + does not provide authentication. Use it only on trusted networks or behind + your own authenticated reverse proxy. + Non-loopback hosts require `--allow-remote` (no built-in auth). When `--allow-remote` is enabled, any reachable network client can trigger CPU-intensive analysis, read results, and probe repository-relative paths through MCP request parameters. Use it only on trusted networks. For anything production-adjacent, put the server behind a firewall or a reverse proxy with authentication. + Run retention is bounded: default `4`, max `10` (`--history-limit`). If a tool request omits `processes`, MCP defers process-count policy to the core CodeClone runtime. -Current `b5` MCP surface: `21` tools, `7` fixed resources, and `3` +Current `b6` MCP surface: `21` tools, `7` fixed resources, and `3` run-scoped URI templates. ## Tool surface @@ -134,6 +156,10 @@ run-scoped URI templates. `coverage_xml`, summaries include compact `coverage_join` facts. The XML path may be absolute or relative to the analysis root, and the join remains a current-run signal rather than baseline truth. +- Run summaries may also include compact `security_surfaces` facts: + item count, category count, production/test split, and `report_only=true`. + This layer inventories exact security-relevant capability surfaces and trust + boundaries; it does not claim vulnerabilities or exploitability. - When `respect_pyproject=true`, MCP also applies `golden_fixture_paths`. Fully matching golden-fixture clone groups are excluded from active clone and gate projections but remain visible in the canonical report under the @@ -145,7 +171,8 @@ run-scoped URI templates. Both accept the full canonical form as input. - `metrics_detail(family="overloaded_modules")` exposes the report-only module-hotspot layer without turning it into findings or gate data. -- `metrics_detail` also accepts `coverage_adoption`, `coverage_join`, and +- `metrics_detail` also accepts `coverage_adoption`, `coverage_join`, + `security_surfaces`, and `api_surface`. - `help(topic=...)` is static: meaning, anti-patterns, next step, doc links. - Start with repo defaults or `pyproject`-resolved thresholds, then lower them diff --git a/docs/publishing.md b/docs/publishing.md index b890b16..fe0be95 100644 --- a/docs/publishing.md +++ b/docs/publishing.md @@ -7,6 +7,10 @@ Document how the documentation site is built, validated, and published. This page is operational, not contractual. The source of truth for behavior remains the current repository code and CI workflow. +!!! note "Scope" + This page covers docs-site build and publishing mechanics. 
Public behavior + contracts still live in the book chapters and in the repository code. + ## Current stack - Site generator: `MkDocs` @@ -41,6 +45,10 @@ Relevant files: - `.github/workflows/docs.yml` - `scripts/build_docs_example_report.py` +!!! warning "Generated output only" + `site/` is a generated artifact. It is used for local preview and GitHub + Pages deployment, but it should not be committed. + ## Sample report generation The sample report is generated from the current `codeclone` repository tree. @@ -57,17 +65,18 @@ git. `site/` remains ignored. ## Local preview -Build the site: +=== "Build the site" -```bash -uv run --with mkdocs --with mkdocs-material mkdocs build --strict -``` + ```bash title="Validate the MkDocs site" + uv run --with mkdocs --with mkdocs-material mkdocs build --strict + ``` -Generate the sample report into the built site: +=== "Build the site and sample report" -```bash -uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live -``` + ```bash title="Generate the live sample report into site/" + uv run --with mkdocs --with mkdocs-material mkdocs build --strict + uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live + ``` Then open: diff --git a/docs/sarif.md b/docs/sarif.md index 2bb2518..c2ebd56 100644 --- a/docs/sarif.md +++ b/docs/sarif.md @@ -5,13 +5,13 @@ Explain how CodeClone projects canonical findings into SARIF and what IDEs or code-scanning tools can rely on. -SARIF is a machine-readable projection layer. The canonical source of report -truth remains the JSON report document. +SARIF is a deterministic projection layer. The canonical source of truth +remains the report document. ## Source files -- `codeclone/report/sarif.py` -- `codeclone/report/json_contract.py` +- `codeclone/report/renderers/sarif.py` +- `codeclone/report/document/builder.py` - `codeclone/report/findings.py` ## Design model @@ -32,18 +32,12 @@ anchored through `%SRCROOT%`. Current behavior: -- `run.originalUriBaseIds["%SRCROOT%"]` points at the scan root when an - absolute scan root is known +- `run.originalUriBaseIds["%SRCROOT%"]` points at the scan root when known - `run.artifacts[*]` enumerates referenced files - `artifactLocation.uri` uses repository-relative paths - `artifactLocation.index` aligns locations with artifacts for stable linking - `run.invocations[*].workingDirectory` mirrors the scan root URI when available -- `run.invocations[*].startTimeUtc` is emitted when analysis start time is - available in canonical runtime meta -- `run.automationDetails.id` is unique per run so code-scanning systems can - correlate uploads reliably - -This helps consumers resolve results back to workspace files consistently. 
+- `run.automationDetails.id` is unique per run ## Result model @@ -52,43 +46,12 @@ Current SARIF output includes: - `tool.driver.rules[*]` with stable rule IDs and help links - `results[*]` for clone groups, dead code, design findings, and structural findings - `locations[*]` with primary file/line mapping -- `locations[*].message` and `relatedLocations[*].message` with - human-readable role labels such as `Representative occurrence` -- `relatedLocations[*]` when the result has multiple relevant locations +- `relatedLocations[*]` for multi-location findings - `partialFingerprints.primaryLocationLineHash` for stable per-location identity - without encoding line numbers into the hash digest -- result `properties` with stable identity/context fields such as primary path, - qualname, and region - explicit `kind: "fail"` on results -For clone results, CodeClone also carries novelty-aware metadata when known: - -- `baselineState` - -This improves usefulness in IDE/code-scanning flows that distinguish new vs -known findings. - -Coverage join can materialize `coverage` / `coverage_hotspot` and -`coverage_scope_gap` design findings when the canonical report already -contains valid `metrics.families.coverage_join` facts. SARIF projects those -findings like other design findings; it does not parse Cobertura XML or create -coverage-specific analysis truth. - -## Rule metadata - -Rule records are intentionally richer than a minimal SARIF export. - -They include: - -- stable rule IDs -- stable rule names derived from `ruleId` -- display name -- help text / markdown -- tags -- docs-facing help URI - -The goal is not only schema compliance, but a better consumer experience in IDEs -and code-scanning platforms. +Coverage Join may materialize coverage design findings only when the canonical +report already contains valid `metrics.families.coverage_join` facts. ## What SARIF is good for here @@ -104,16 +67,6 @@ It is not the source of truth for: - gating semantics - baseline compatibility -Those remain owned by the canonical report and baseline contracts. - -## Limitations - -- Consumer UX depends on the IDE/platform; not every SARIF field is shown by - every tool. -- HTML-only presentation details are not carried into SARIF. -- SARIF wording may evolve as long as IDs, semantics, and deterministic - structure remain stable. - ## Validation and tests Relevant tests: @@ -124,7 +77,7 @@ Relevant tests: Contract-adjacent coverage includes: -- reuse of canonical report document +- reuse of the canonical report document - stable SARIF branch invariants - deterministic artifacts/rules/results ordering diff --git a/docs/terms-of-use.md b/docs/terms-of-use.md index 5c7f797..82604b4 100644 --- a/docs/terms-of-use.md +++ b/docs/terms-of-use.md @@ -9,7 +9,8 @@ CodeClone is distributed as a local analysis tool and local integration layer. 
That means: -- CodeClone is provided as-is under the repository license terms +- CodeClone source code is provided as-is under MPL-2.0, and repository + documentation is provided as-is under MIT - local integrations are wrappers over local CodeClone execution, not hosted managed services - users are responsible for reviewing the commands, configuration, and diff --git a/docs/vscode-extension.md b/docs/vscode-extension.md index 7f011ac..d488d7a 100644 --- a/docs/vscode-extension.md +++ b/docs/vscode-extension.md @@ -20,6 +20,7 @@ The extension helps you: - jump directly to source locations - open canonical finding or remediation detail only when needed - inspect current-run `Coverage Join` facts without inventing extension-local interpretations +- inspect report-only `Security Surfaces` as security-relevant boundary inventory - inspect report-only Overloaded Module candidates without treating them like findings It does not create a second truth model and it does not mutate the repository. @@ -36,13 +37,13 @@ to `PATH`. Runtime and version-mismatch messages identify that resolved launcher Recommended install for the preview extension: ```bash -uv tool install "codeclone[mcp]>=2.0.0b4" +uv tool install --pre "codeclone[mcp]" ``` If you want the launcher inside the current environment instead: ```bash -uv pip install "codeclone[mcp]>=2.0.0b4" +uv pip install --pre "codeclone[mcp]" ``` Verify the launcher: @@ -51,6 +52,11 @@ Verify the launcher: codeclone-mcp --help ``` +When you run the CLI inside an interactive VS Code terminal, CodeClone may also +show a one-time extension hint after the summary. It is suppressed in quiet, +CI, and non-interactive runs, and is remembered per CodeClone version next to +the resolved project cache path. + ## Main views ### Overview @@ -58,6 +64,8 @@ codeclone-mcp --help Compact health, current run state, baseline drift, and next-best review action. When the current run includes external Cobertura join facts, Overview also shows a factual `Coverage Join` section sourced from canonical MCP metrics. +When MCP exposes `security_surfaces`, Overview also shows a compact report-only +`Security Surfaces` section. ### Hotspots @@ -66,6 +74,7 @@ Primary operational view for: - new regressions - production hotspots - changed-files findings +- report-only Security Surfaces - report-only Overloaded Module candidates ### Runs & Session @@ -89,6 +98,8 @@ The extension stays source-first: review target - Explorer decorations stay lightweight and focus on new, production, or changed-scope relevance +- report-only Security Surfaces stay source-first: reveal source, open compact + detail, or copy a review brief without promoting them to findings `Open in HTML Report` exists as an explicit bridge to the richer human report, not as the primary IDE workflow. 
diff --git a/extensions/claude-desktop-codeclone/README.md b/extensions/claude-desktop-codeclone/README.md index 2bf2060..37ee769 100644 --- a/extensions/claude-desktop-codeclone/README.md +++ b/extensions/claude-desktop-codeclone/README.md @@ -20,14 +20,14 @@ Recommended workspace-local setup: ```bash uv venv -uv pip install --python .venv/bin/python "codeclone[mcp]>=2.0.0b4" +uv pip install --python .venv/bin/python --pre "codeclone[mcp]" .venv/bin/codeclone-mcp --help ``` Global fallback: ```bash -uv tool install "codeclone[mcp]>=2.0.0b4" +uv tool install --pre "codeclone[mcp]" codeclone-mcp --help ``` diff --git a/extensions/claude-desktop-codeclone/manifest.json b/extensions/claude-desktop-codeclone/manifest.json index 33c3286..ccdf403 100644 --- a/extensions/claude-desktop-codeclone/manifest.json +++ b/extensions/claude-desktop-codeclone/manifest.json @@ -2,7 +2,7 @@ "manifest_version": "0.3", "name": "codeclone", "display_name": "CodeClone", - "version": "2.0.0-b5.1", + "version": "2.0.0-b6.0", "description": "Baseline-aware structural review for Claude Desktop through a local CodeClone MCP launcher.", "long_description": "CodeClone for Claude Desktop wraps the local codeclone-mcp launcher as an MCP bundle. It keeps Claude on the same canonical MCP surface used by the CLI, HTML report, VS Code extension, and Codex plugin — read-only, baseline-aware, local stdio only.", "author": { diff --git a/extensions/claude-desktop-codeclone/package-lock.json b/extensions/claude-desktop-codeclone/package-lock.json index 9f156cb..0e6d72b 100644 --- a/extensions/claude-desktop-codeclone/package-lock.json +++ b/extensions/claude-desktop-codeclone/package-lock.json @@ -1,12 +1,12 @@ { "name": "@orenlab/codeclone-claude-desktop", - "version": "2.0.0-b5.1", + "version": "2.0.0-b6.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@orenlab/codeclone-claude-desktop", - "version": "2.0.0-b5.1", + "version": "2.0.0-b6.0", "license": "MPL-2.0", "engines": { "node": ">=20.0.0" diff --git a/extensions/claude-desktop-codeclone/package.json b/extensions/claude-desktop-codeclone/package.json index 2b97c37..5abfc93 100644 --- a/extensions/claude-desktop-codeclone/package.json +++ b/extensions/claude-desktop-codeclone/package.json @@ -1,6 +1,6 @@ { "name": "@orenlab/codeclone-claude-desktop", - "version": "2.0.0-b5.1", + "version": "2.0.0-b6.0", "private": true, "description": "Claude Desktop MCP bundle wrapper for the local CodeClone MCP launcher.", "license": "MPL-2.0", diff --git a/extensions/vscode-codeclone/CHANGELOG.md b/extensions/vscode-codeclone/CHANGELOG.md index 525fb2c..18f1165 100644 --- a/extensions/vscode-codeclone/CHANGELOG.md +++ b/extensions/vscode-codeclone/CHANGELOG.md @@ -1,5 +1,13 @@ # Change Log +## 0.2.4 + +- restore repo-local `uv run codeclone-mcp` fallback for the refactored MCP server layout +- cover both legacy and current CodeClone repo markers in extension runtime tests +- surface report-only `Security Surfaces` as a first-class hotspot and overview layer +- add source-first security review actions, briefs, and Markdown detail without creating a second truth model +- join `Security Surfaces` with current-run `Coverage Join` context when MCP exposes both families + ## 0.2.3 - explain baseline mismatch runs more clearly with compact baseline/runtime tag context diff --git a/extensions/vscode-codeclone/README.md b/extensions/vscode-codeclone/README.md index 897e65a..d585d95 100644 --- a/extensions/vscode-codeclone/README.md +++ 
b/extensions/vscode-codeclone/README.md @@ -9,8 +9,7 @@ creating a second truth model. The extension stays read-only with respect to repository state and uses the same canonical report semantics as the CLI, HTML report, and MCP server. -This extension is published as a preview while the `2.0.0b5` line is still in -beta. +This extension is published as a preview for the current `2.0.x` beta line. ## What it is for @@ -21,6 +20,7 @@ CodeClone inside VS Code is designed for: - conservative first-pass analysis with an explicit deeper-review follow-up - baseline-aware distinction between known debt and new regressions - guided drill-down from hotspot to source, finding detail, and remediation +- report-only review of security-relevant boundaries without turning them into vulnerability claims - lightweight code navigation without turning the sidebar into a second report app It is not a generic linter panel and it does not try to duplicate the HTML @@ -50,13 +50,13 @@ falling back to `PATH`. Runtime and version-mismatch messages identify that reso Recommended install for the preview extension: ```bash -uv tool install "codeclone[mcp]>=2.0.0b4" +uv tool install --pre "codeclone[mcp]" ``` If you want the launcher inside the current environment instead: ```bash -uv pip install "codeclone[mcp]>=2.0.0b4" +uv pip install --pre "codeclone[mcp]" ``` Verify the launcher: @@ -91,6 +91,7 @@ The main operational view. It focuses on: - new regressions - production hotspots - changed-files findings +- report-only Security Surfaces - report-only Overloaded Module candidates Focus mode is explicit and persisted per workspace. The extension favors @@ -142,7 +143,9 @@ opening raw JSON-like details by default. - **Source-first**: review should move you to code before it opens deeper detail. - **Report-only separation**: `Overloaded Modules` are visible but intentionally kept - outside findings, gates, and health. + outside findings, gates, and health. `Security Surfaces` follow the same rule + and stay framed as review-sensitive boundary inventory rather than + vulnerability proof. - **Limited Restricted Mode**: the extension keeps setup/onboarding available in untrusted workspaces, but local analysis and MCP stay disabled until trust is granted. 
diff --git a/extensions/vscode-codeclone/package-lock.json b/extensions/vscode-codeclone/package-lock.json index 10fc5af..9333373 100644 --- a/extensions/vscode-codeclone/package-lock.json +++ b/extensions/vscode-codeclone/package-lock.json @@ -1,12 +1,12 @@ { "name": "codeclone", - "version": "0.2.3", + "version": "0.2.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "codeclone", - "version": "0.2.3", + "version": "0.2.4", "license": "MPL-2.0", "devDependencies": { "@types/node": "^25.5.2", diff --git a/extensions/vscode-codeclone/package.json b/extensions/vscode-codeclone/package.json index 38a2f39..5bc9985 100644 --- a/extensions/vscode-codeclone/package.json +++ b/extensions/vscode-codeclone/package.json @@ -2,7 +2,7 @@ "name": "codeclone", "displayName": "CodeClone", "description": "Baseline-aware, triage-first structural review for Python, powered by CodeClone MCP.", - "version": "0.2.3", + "version": "0.2.4", "preview": true, "publisher": "orenlab", "license": "MPL-2.0", @@ -49,7 +49,7 @@ "build": "node esbuild.config.mjs", "watch": "node esbuild.config.mjs --watch", "typecheck": "tsc -p jsconfig.json --noEmit", - "check": "npm run build && npm run typecheck && node --check src/constants.js && node --check src/formatters.js && node --check src/runtime.js && node --check src/renderers.js && node --check src/providers.js && node --check src/support.js && node --check src/mcpClient.js && node --check src/extension.js", + "check": "npm run build && npm run typecheck && node --check src/constants.js && node --check src/formatters.js && node --check src/runtime.js && node --check src/renderers.js && node --check src/providers.js && node --check src/support.js && node --check src/runArtifacts.js && node --check src/mcpClient.js && node --check src/extension.js", "test": "node --test test/*.test.js", "test:host": "npm run build && node test/runExtensionHost.js", "vscode:prepublish": "npm run build" @@ -79,6 +79,8 @@ "onCommand:codeclone.openSetupHelp", "onCommand:codeclone.openOverloadedModule", "onCommand:codeclone.copyOverloadedModuleBrief", + "onCommand:codeclone.openSecuritySurface", + "onCommand:codeclone.copySecuritySurfaceBrief", "onCommand:codeclone.manageWorkspaceTrust" ], "main": "./dist/extension.js", @@ -316,6 +318,23 @@ "category": "CodeClone", "icon": "$(copy)" }, + { + "command": "codeclone.openSecuritySurface", + "title": "Open Security Surface Detail", + "category": "CodeClone", + "icon": "$(shield)" + }, + { + "command": "codeclone.reviewSecuritySurface", + "title": "Review Security Surface", + "category": "CodeClone" + }, + { + "command": "codeclone.copySecuritySurfaceBrief", + "title": "Copy Security Review Brief", + "category": "CodeClone", + "icon": "$(copy)" + }, { "command": "codeclone.openOverview", "title": "Open Overview", @@ -385,6 +404,18 @@ { "command": "codeclone.reviewOverloadedModule", "when": "false" + }, + { + "command": "codeclone.openSecuritySurface", + "when": "false" + }, + { + "command": "codeclone.copySecuritySurfaceBrief", + "when": "false" + }, + { + "command": "codeclone.reviewSecuritySurface", + "when": "false" } ], "view/title": [ @@ -554,6 +585,16 @@ "command": "codeclone.copyOverloadedModuleBrief", "when": "viewItem == codeclone.overloadedModule", "group": "navigation@1" + }, + { + "command": "codeclone.openSecuritySurface", + "when": "viewItem == codeclone.securitySurface", + "group": "inline@1" + }, + { + "command": "codeclone.copySecuritySurfaceBrief", + "when": "viewItem == codeclone.securitySurface", + "group": 
"navigation@1" } ], "editor/title": [ @@ -591,6 +632,11 @@ "command": "codeclone.copyOverloadedModuleBrief", "when": "editorTextFocus && codeclone.activeReviewTargetVisibleInEditor && codeclone.activeReviewTargetIsOverloadedModule", "group": "secondary@4" + }, + { + "command": "codeclone.copySecuritySurfaceBrief", + "when": "editorTextFocus && codeclone.activeReviewTargetVisibleInEditor && codeclone.activeReviewTargetIsSecuritySurface", + "group": "secondary@4" } ] }, diff --git a/extensions/vscode-codeclone/src/constants.js b/extensions/vscode-codeclone/src/constants.js index 647efec..675e033 100644 --- a/extensions/vscode-codeclone/src/constants.js +++ b/extensions/vscode-codeclone/src/constants.js @@ -23,6 +23,7 @@ const HOTSPOT_GROUPS = [ {id: "newRegressions", label: "New Regressions", icon: "diff-added"}, {id: "productionHotspots", label: "Production Hotspots", icon: "target"}, {id: "changedFiles", label: "Changed Files", icon: "git-commit"}, + {id: "securitySurfaces", label: "Security Surfaces", icon: "shield"}, {id: "overloadedModules", label: "Overloaded Modules", icon: "symbol-module"}, ]; @@ -50,7 +51,7 @@ const HOTSPOT_FOCUS_MODES = [ { id: "reportOnly", label: "Report-only", - description: "Focus only on report-only Overloaded Module candidates.", + description: "Focus only on report-only Security Surfaces and Overloaded Modules.", }, { id: "all", @@ -64,7 +65,7 @@ const HOTSPOT_GROUPS_BY_MODE = { new: ["newRegressions"], production: ["productionHotspots"], changed: ["changedFiles"], - reportOnly: ["overloadedModules"], + reportOnly: ["securitySurfaces", "overloadedModules"], all: HOTSPOT_GROUPS.map((group) => group.id), }; diff --git a/extensions/vscode-codeclone/src/extension.js b/extensions/vscode-codeclone/src/extension.js index f0aae46..5d30ab9 100644 --- a/extensions/vscode-codeclone/src/extension.js +++ b/extensions/vscode-codeclone/src/extension.js @@ -37,8 +37,11 @@ const { formatKind, formatNovelty, formatRunScope, + formatSecuritySurfaceLocation, + formatSecuritySurfaceReviewSignal, formatSeverity, formatSourceKindSummary, + humanizeIdentifier, isSpecificFocusMode, normalizeFindingLocations, normalizeRelativePath, @@ -46,6 +49,7 @@ const { reviewTargetKey, safeArray, safeObject, + securitySurfacesPayload, sameLaunchSpec, treeAccessibilityInformation, workspaceRelativePath, @@ -55,12 +59,14 @@ const { markdownBulletList, renderFindingMarkdown, renderOverloadedModuleMarkdown, + renderSecuritySurfaceMarkdown, renderHelpMarkdown, renderRemediationMarkdown, renderRestrictedModeMarkdown, renderSetupMarkdown, renderTriageMarkdown, } = require("./renderers"); +const {loadRunArtifacts} = require("./runArtifacts"); const { HotspotsTreeProvider, OverviewTreeProvider, @@ -341,6 +347,15 @@ class CodeCloneController { vscode.commands.registerCommand("codeclone.reviewOverloadedModule", (node) => this.reviewOverloadedModule(node) ), + vscode.commands.registerCommand("codeclone.openSecuritySurface", (node) => + this.openSecuritySurface(node) + ), + vscode.commands.registerCommand("codeclone.copySecuritySurfaceBrief", (node) => + this.copySecuritySurfaceBrief(node) + ), + vscode.commands.registerCommand("codeclone.reviewSecuritySurface", (node) => + this.reviewSecuritySurface(node) + ), ]; this.context.subscriptions.push(...subscriptions); } @@ -781,11 +796,15 @@ class CodeCloneController { const diffRef = vscode.workspace .getConfiguration("codeclone", state.folder.uri) .get("analysis.changedDiffRef", "HEAD"); + const coverageJoin = coverageJoinPayload(state.metricsSummary); + const 
securitySurfaces = securitySurfacesPayload(state.metricsSummary); const [ newRegressionsResponse, productionHotspotsResponse, changedFilesResponse, overloadedModulesResponse, + securitySurfacesResponse, + coverageJoinResponse, ] = await Promise.all([ this.client.callTool("list_findings", { run_id: runId, @@ -819,15 +838,36 @@ class CodeCloneController { family: "overloaded_modules", limit: 25, }), + Number(securitySurfaces.items || 0) > 0 + ? this.client.callTool("get_report_section", { + run_id: runId, + section: "metrics_detail", + family: "security_surfaces", + limit: 100, + }) + : Promise.resolve({items: []}), + String(coverageJoin.status || "").trim().toLowerCase() === "ok" + ? this.client.callTool("get_report_section", { + run_id: runId, + section: "metrics_detail", + family: "coverage_join", + limit: 200, + }) + : Promise.resolve({items: []}), ]); if (this.disposed) { return; } + const normalizedSecuritySurfaces = this.normalizeSecuritySurfaceItems( + safeArray(securitySurfacesResponse.items), + safeArray(coverageJoinResponse.items) + ); state.reviewArtifacts = { newRegressions: safeArray(newRegressionsResponse.items), productionHotspots: safeArray(productionHotspotsResponse.items), changedFiles: safeArray(changedFilesResponse.items), overloadedModules: safeArray(overloadedModulesResponse.items), + securitySurfaces: normalizedSecuritySurfaces, }; state.groupCache.clear(); this.rebuildFileDecorations(); @@ -1059,32 +1099,21 @@ class CodeCloneController { ...analysisSettings.overrides, }); const runId = String(analysisPayload.run_id); - const summary = await this.client.callTool("get_run_summary", { - run_id: runId, - }); - const triage = await this.client.callTool("get_production_triage", { - run_id: runId, - max_hotspots: 5, - max_suggestions: 5, - }); - const metrics = await this.client.callTool("get_report_section", { - run_id: runId, - section: "metrics", - }); - const reviewed = await this.client.callTool("list_reviewed_findings", { - run_id: runId, - }); - const gitSnapshot = await captureWorkspaceGitSnapshot(folder); + const artifacts = await loadRunArtifacts( + this.client, + folder, + runId + ); state.currentRunId = runId; - state.latestSummary = summary; - state.latestTriage = triage; - state.metricsSummary = metrics.summary || metrics; + state.latestSummary = artifacts.summary; + state.latestTriage = artifacts.triage; + state.metricsSummary = artifacts.metricsSummary; state.changedSummary = changedMode ? analysisPayload : null; state.analysisSettings = analysisSettings; - state.reviewed = safeArray(reviewed.items); + state.reviewed = artifacts.reviewedItems; state.lastScope = changedMode ? 
"changed" : "workspace"; state.lastUpdatedAt = new Date(); - state.gitSnapshot = gitSnapshot; + state.gitSnapshot = artifacts.gitSnapshot; state.stale = false; state.staleReason = null; state.lastStaleCheckAt = Date.now(); @@ -1152,6 +1181,7 @@ class CodeCloneController { if ( !candidate || candidate.nodeType === "overloadedModule" || + candidate.nodeType === "securitySurface" || !candidate.findingId ) { return null; @@ -1171,6 +1201,18 @@ class CodeCloneController { return candidate; } + activeSecuritySurfaceTarget(node) { + const candidate = node || this.activeReviewTarget; + if ( + !candidate || + candidate.nodeType !== "securitySurface" || + !safeObject(candidate.item).path + ) { + return null; + } + return candidate; + } + isTargetVisibleInEditor(target, editor = vscode.window.activeTextEditor) { if (!target || !editor || !editor.document) { return false; @@ -1183,6 +1225,11 @@ class CodeCloneController { } return workspaceRelativePath(state.folder, fsPath) === normalizeRelativePath(target.item.path); } + if (target.nodeType === "securitySurface") { + return safeArray(target.locations).some( + (location) => location.absolutePath === fsPath + ); + } return safeArray(target.locations).some( (location) => location.absolutePath === fsPath ); @@ -1237,6 +1284,37 @@ class CodeCloneController { return resolved; } + normalizeSecuritySurfaceItems(items, coverageJoinItems) { + const coverageIndex = new Map(); + for (const item of safeArray(coverageJoinItems)) { + const pathValue = String(safeObject(item).path || "").trim(); + const qualnameValue = String(safeObject(item).qualname || "").trim(); + if (!pathValue || !qualnameValue) { + continue; + } + coverageIndex.set(`${pathValue}::${qualnameValue}`, { + coverage_overlap: true, + coverage_hotspot: Boolean(safeObject(item).coverage_hotspot), + scope_gap_hotspot: Boolean(safeObject(item).scope_gap_hotspot), + }); + } + return safeArray(items).map((item) => { + const entry = safeObject(item); + const pathValue = String(entry.path || "").trim(); + const qualnameValue = String(entry.qualname || "").trim(); + const coverageEntry = + pathValue && qualnameValue + ? coverageIndex.get(`${pathValue}::${qualnameValue}`) + : null; + return { + ...entry, + coverage_overlap: Boolean(coverageEntry?.coverage_overlap), + coverage_hotspot: Boolean(coverageEntry?.coverage_hotspot), + scope_gap_hotspot: Boolean(coverageEntry?.scope_gap_hotspot), + }; + }); + } + reviewArtifactItems(state, groupId) { if (!state) { return []; @@ -1251,6 +1329,8 @@ class CodeCloneController { return safeArray(artifacts.changedFiles); case "overloadedModules": return safeArray(artifacts.overloadedModules); + case "securitySurfaces": + return safeArray(artifacts.securitySurfaces); default: return []; } @@ -1398,13 +1478,89 @@ class CodeCloneController { }; } + toSecuritySurfaceNodes(state, items) { + return items.map((item) => this.buildSecuritySurfaceNode(state, item)); + } + + securitySurfaceLocations(state, item) { + return [ + { + path: String(item.path || ""), + line: + typeof item.start_line === "number" && !Number.isNaN(item.start_line) + ? item.start_line + : null, + end_line: + typeof item.end_line === "number" && !Number.isNaN(item.end_line) + ? item.end_line + : null, + symbol: item.qualname ? 
String(item.qualname) : null, + absolutePath: + resolveWorkspacePath( + state.folder.uri.fsPath, + String(item.path || "") + ) || "", + }, + ].filter((location) => location.absolutePath); + } + + hydrateSecuritySurfaceNode(state, node) { + const locations = + safeArray(node.locations).length > 0 + ? safeArray(node.locations) + : this.securitySurfaceLocations(state, safeObject(node.item)); + return { + ...node, + nodeType: "securitySurface", + locations, + }; + } + + buildSecuritySurfaceNode(state, item) { + const locationLabel = formatSecuritySurfaceLocation(item); + const locations = this.securitySurfaceLocations(state, item); + return { + nodeType: "securitySurface", + workspaceKey: state.folder.uri.toString(), + runId: state.currentRunId, + item, + label: locationLabel, + description: `${humanizeIdentifier(item.capability)} · ${formatSecuritySurfaceReviewSignal(item)}`, + tooltip: + `${humanizeIdentifier(item.category)} · ${humanizeIdentifier(item.source_kind)}\n` + + `Evidence: ${String(item.evidence_symbol || "(unknown)")}`, + icon: new vscode.ThemeIcon("shield"), + contextValue: "codeclone.securitySurface", + locations, + command: { + command: "codeclone.reviewSecuritySurface", + title: "Review Security Surface", + arguments: [ + { + workspaceKey: state.folder.uri.toString(), + runId: state.currentRunId, + item, + nodeType: "securitySurface", + locations, + }, + ], + }, + }; + } + currentPriorityQueue(state) { const artifacts = safeObject(state.reviewArtifacts); const groupIds = this.hotspotFocusMode === "recommended" ? ["changedFiles", "newRegressions", "productionHotspots"] : this.hotspotFocusMode === "all" - ? ["changedFiles", "newRegressions", "productionHotspots", "overloadedModules"] + ? [ + "changedFiles", + "newRegressions", + "productionHotspots", + "securitySurfaces", + "overloadedModules", + ] : this.activeHotspotGroupIds(state); const queue = []; const seen = new Set(); @@ -1423,6 +1579,20 @@ class CodeCloneController { } continue; } + if (groupId === "securitySurfaces") { + for (const node of this.toSecuritySurfaceNodes( + state, + safeArray(artifacts.securitySurfaces) + )) { + const key = reviewTargetKey(node); + if (!key || seen.has(key)) { + continue; + } + seen.add(key); + queue.push(node); + } + continue; + } for (const node of this.toFindingNodes( state, this.reviewArtifactItems(state, groupId) @@ -1438,9 +1608,21 @@ class CodeCloneController { if ( this.hotspotFocusMode === "recommended" && queue.length === 0 && - safeArray(artifacts.overloadedModules).length > 0 + ( + safeArray(artifacts.securitySurfaces).length > 0 || + safeArray(artifacts.overloadedModules).length > 0 + ) ) { - return this.toOverloadedModuleNodes(state, safeArray(artifacts.overloadedModules)); + return [ + ...this.toSecuritySurfaceNodes( + state, + safeArray(artifacts.securitySurfaces) + ), + ...this.toOverloadedModuleNodes( + state, + safeArray(artifacts.overloadedModules) + ), + ]; } return queue; } @@ -1485,6 +1667,10 @@ class CodeCloneController { await this.revealOverloadedModuleSource(nextNode); return; } + if (nextNode.nodeType === "securitySurface") { + await this.revealSecuritySurfaceSource(nextNode); + return; + } await this.revealFindingSource(nextNode); } @@ -1546,6 +1732,8 @@ class CodeCloneController { if (picked) { if (picked.node.nodeType === "overloadedModule") { await this.reviewOverloadedModule(picked.node); + } else if (picked.node.nodeType === "securitySurface") { + await this.reviewSecuritySurface(picked.node); } else { await this.reviewFinding(picked.node); } @@ -1980,6 
+2168,32 @@ class CodeCloneController { await this.revealWorkspacePath(state.folder, activeNode.item.path); } + async revealSecuritySurfaceSource(node) { + const activeNode = this.activeSecuritySurfaceTarget(node); + if (!activeNode) { + return; + } + const state = this.states.get(activeNode.workspaceKey); + if (!state) { + return; + } + const resolved = this.hydrateSecuritySurfaceNode(state, activeNode); + this.setActiveReviewTarget(resolved); + const location = firstNormalizedLocation(state.folder, resolved.locations); + if (!location || !location.path) { + await vscode.window.showInformationMessage( + "This security surface does not expose a source location." + ); + return; + } + await this.revealWorkspacePath( + state.folder, + location.path, + location.line ?? undefined, + location.end_line ?? undefined + ); + } + /** * @param {any} folder * @param {string} relativePath @@ -2167,6 +2381,118 @@ class CodeCloneController { ); } + async openSecuritySurface(node) { + const activeNode = this.activeSecuritySurfaceTarget(node); + if (!activeNode) { + return; + } + const state = this.states.get(activeNode.workspaceKey); + if (!state) { + return; + } + const resolved = this.hydrateSecuritySurfaceNode(state, activeNode); + this.setActiveReviewTarget(resolved); + await this.showMarkdownDocument(renderSecuritySurfaceMarkdown(resolved.item)); + } + + async reviewSecuritySurface(node) { + const activeNode = this.activeSecuritySurfaceTarget(node); + if (!activeNode) { + return; + } + const state = this.states.get(activeNode.workspaceKey); + if (!state) { + return; + } + const resolved = this.hydrateSecuritySurfaceNode(state, activeNode); + this.setActiveReviewTarget(resolved); + const picked = await vscode.window.showQuickPick( + [ + { + label: "Reveal source", + description: "Recommended", + action: "reveal", + }, + { + label: "Show report-only detail", + description: "Open Security Surface summary", + action: "detail", + }, + { + label: "Copy security review brief", + description: "AI handoff", + action: "brief", + }, + ], + { + title: "Review Security Surface", + placeHolder: `What do you want to do with ${formatSecuritySurfaceLocation(resolved.item)}?`, + } + ); + if (!picked) { + return; + } + if (picked.action === "reveal") { + await this.revealSecuritySurfaceSource(resolved); + return; + } + if (picked.action === "brief") { + await this.copySecuritySurfaceBrief(resolved); + return; + } + await this.openSecuritySurface(resolved); + } + + async copySecuritySurfaceBrief(node) { + const activeNode = this.activeSecuritySurfaceTarget(node); + if (!activeNode) { + return; + } + const state = this.states.get(activeNode.workspaceKey); + if (!state) { + return; + } + const resolved = this.hydrateSecuritySurfaceNode(state, activeNode); + this.setActiveReviewTarget(resolved); + const item = resolved.item; + const lines = [ + "# CodeClone Security Surface Brief", + "", + `Repository: ${state.folder.name || "unknown"}`, + `Location: ${formatSecuritySurfaceLocation(item)}`, + `Module: ${item.module || "unknown"}`, + `Symbol: ${item.qualname || item.module || "unknown"}`, + `Category: ${humanizeIdentifier(item.category || "unknown")}`, + `Capability: ${humanizeIdentifier(item.capability || "unknown")}`, + `Evidence: ${item.evidence_symbol || "(unknown)"}`, + `Source kind: ${humanizeIdentifier(item.source_kind || "unknown")}`, + `Review signal: ${formatSecuritySurfaceReviewSignal(item)}`, + "", + "Treat this as a report-only trust-boundary inventory entry, not as a vulnerability claim or gate result.", + 
"Keep behavior unchanged unless review shows that the boundary contract itself needs to move.", + ]; + if (item.scope_gap_hotspot) { + lines.push( + "", + "Coverage Join does not map this callable cleanly, so validate the exercised path manually before refactor." + ); + } else if (item.coverage_hotspot) { + lines.push( + "", + "Coverage Join marks this callable as low coverage, so inspect or add boundary-focused tests before change." + ); + } else if (item.coverage_overlap) { + lines.push( + "", + "Coverage Join overlaps with this callable, so inspect the measured tests before change." + ); + } + await vscode.env.clipboard.writeText(lines.join("\n")); + await vscode.window.showInformationMessage( + `Copied security review brief for ${formatSecuritySurfaceLocation(item)}.` + ); + } + async clearSessionState() { const folder = this.getPreferredFolder(); if (!folder) { @@ -2265,6 +2591,38 @@ class CodeCloneController { }), ]; } + if (target.nodeType === "securitySurface") { + const state = this.states.get(target.workspaceKey); + if (!state) { + return []; + } + const location = firstNormalizedLocation(state.folder, target.locations); + if (!location || location.absolutePath !== document.uri.fsPath) { + return []; + } + const startLine = Math.max(Number(location.line || 1) - 1, 0); + const range = new vscode.Range(startLine, 0, startLine, 0); + return [ + new vscode.CodeLens(range, { + command: "codeclone.previousReviewItem", + title: "$(arrow-up) Previous hotspot", + }), + new vscode.CodeLens(range, { + command: "codeclone.nextReviewItem", + title: "$(arrow-down) Next hotspot", + }), + new vscode.CodeLens(range, { + command: "codeclone.openSecuritySurface", + title: "$(shield) Report-only detail", + arguments: [target], + }), + new vscode.CodeLens(range, { + command: "codeclone.copySecuritySurfaceBrief", + title: "$(copy) Copy security brief", + arguments: [target], + }), + ]; + } const state = this.states.get(target.workspaceKey); if (!state) { return []; @@ -2338,10 +2696,12 @@ class CodeCloneController { changed: this.reviewArtifactCount(state, "changedFiles"), new: this.reviewArtifactCount(state, "newRegressions"), production: this.reviewArtifactCount(state, "productionHotspots"), + securitySurfaces: this.reviewArtifactCount(state, "securitySurfaces"), overloadedModules: this.reviewArtifactCount(state, "overloadedModules"), }; const baselineDrift = this.baselineDrift(state); const coverageJoin = coverageJoinPayload(state.metricsSummary); + const securitySurfaces = securitySurfacesPayload(state.metricsSummary); if (!node) { const sections = [ { @@ -2397,6 +2757,16 @@ class CodeCloneController { icon: new vscode.ThemeIcon("symbol-module"), }); } + if (Object.keys(securitySurfaces).length > 0) { + sections.push({ + nodeType: "section", + id: "overview.securitySurfaces", + label: "Security Surfaces", + description: + `${number(securitySurfaces.items)} items · ${number(securitySurfaces.production)} production · report-only`, + icon: new vscode.ThemeIcon("shield"), + }); + } if (Object.keys(coverageJoin).length > 0) { sections.push({ nodeType: "section", @@ -2558,6 +2928,30 @@ class CodeCloneController { ), ]; } + if (node.id === "overview.securitySurfaces") { + const categoryCounts = safeObject(securitySurfaces.categories); + const activeCategories = Object.entries(categoryCounts) + .filter(([, count]) => typeof count === "number" && count > 0) + .sort((left, right) => Number(right[1]) - Number(left[1])); + return [ + this.detailNode("Items", number(securitySurfaces.items)), + 
this.detailNode("Categories", number(securitySurfaces.category_count)), + this.detailNode("Modules", number(securitySurfaces.modules)), + this.detailNode("Production", number(securitySurfaces.production)), + this.detailNode("Tests", number(securitySurfaces.tests)), + this.detailNode("Exact items", number(securitySurfaces.exact_items)), + this.detailNode( + "Review surface", + `${number(reviewCounts.securitySurfaces)} visible in Hotspots` + ), + this.detailNode( + "Top category", + activeCategories.length > 0 + ? `${humanizeIdentifier(activeCategories[0][0])} · ${number(activeCategories[0][1])}` + : "none" + ), + ]; + } if (node.id === "overview.coverageJoin") { return [ this.detailNode("Status", capitalize(formatCoverageJoinStatus(coverageJoin))), @@ -2802,6 +3196,12 @@ class CodeCloneController { this.reviewArtifactItems(state, "overloadedModules") ); break; + case "securitySurfaces": + nodes = this.toSecuritySurfaceNodes( + state, + this.reviewArtifactItems(state, "securitySurfaces") + ); + break; default: nodes = []; } @@ -2884,6 +3284,8 @@ class CodeCloneController { return state.changedSummary ? `${this.reviewArtifactCount(state, "changedFiles")} visible · ${state.changedSummary.verdict}` : "not analyzed"; + case "securitySurfaces": + return `${this.reviewArtifactCount(state, "securitySurfaces")} report-only`; case "overloadedModules": return `${this.reviewArtifactCount(state, "overloadedModules")} report-only`; default: @@ -2899,6 +3301,8 @@ class CodeCloneController { return "No production hotspots are visible."; case "changedFiles": return "No findings touching changed files are visible."; + case "securitySurfaces": + return "No report-only Security Surfaces are visible."; case "overloadedModules": return "No report-only Overloaded Module candidates are visible."; default: @@ -2928,6 +3332,8 @@ class CodeCloneController { return this.hotspotFocusMode === "changed"; } return specificMode || this.reviewArtifactCount(state, "changedFiles") > 0; + case "securitySurfaces": + return specificMode || this.reviewArtifactCount(state, "securitySurfaces") > 0; case "overloadedModules": return specificMode || this.reviewArtifactCount(state, "overloadedModules") > 0; default: @@ -2971,6 +3377,13 @@ class CodeCloneController { title: "Review production hotspots", }; } + if (this.reviewArtifactCount(state, "securitySurfaces") > 0) { + return { + label: "Review security-relevant boundaries", + command: "codeclone.reviewPriorityQueue", + title: "Review security-relevant boundaries", + }; + } if (this.reviewArtifactCount(state, "overloadedModules") > 0) { return { label: "Inspect report-only Overloaded Modules", @@ -3053,6 +3466,18 @@ class CodeCloneController { item.command = node.command; break; } + case "securitySurface": { + item = new vscode.TreeItem( + node.label, + vscode.TreeItemCollapsibleState.None + ); + item.description = node.description; + item.tooltip = node.tooltip; + item.iconPath = node.icon; + item.contextValue = "codeclone.securitySurface"; + item.command = node.command; + break; + } case "helpTopic": { item = new vscode.TreeItem( node.label, @@ -3128,9 +3553,13 @@ class CodeCloneController { newCount + productionCount, changedCount ); + const securitySurfaceCount = Number( + this.reviewArtifactCount(state, "securitySurfaces") + ); const overloadedModuleCount = Number( this.reviewArtifactCount(state, "overloadedModules") ); + const reportOnlyCount = securitySurfaceCount + overloadedModuleCount; let badgeValue = 0; let badgeTooltip = ""; switch (this.hotspotFocusMode) { @@ -3147,15 
+3576,17 @@ class CodeCloneController { badgeTooltip = `${changedCount} changed-files review items are visible in Hotspots`; break; case "reportOnly": - badgeValue = overloadedModuleCount; - badgeTooltip = `${overloadedModuleCount} report-only Overloaded Module candidates are visible in Hotspots`; + badgeValue = reportOnlyCount; + badgeTooltip = reportOnlyCount > 0 + ? `${reportOnlyCount} report-only review items are visible in Hotspots` + : "No report-only review items are visible in Hotspots"; break; default: - badgeValue = actionableCount > 0 ? actionableCount : overloadedModuleCount; + badgeValue = actionableCount > 0 ? actionableCount : reportOnlyCount; badgeTooltip = actionableCount > 0 ? `${actionableCount} review items need attention` - : `${overloadedModuleCount} report-only Overloaded Module candidates are visible in Hotspots`; + : `${reportOnlyCount} report-only review items are visible in Hotspots`; break; } this.hotspotsView.badge = @@ -3213,7 +3644,7 @@ class CodeCloneController { void vscode.commands.executeCommand( "setContext", "codeclone.activeReviewTargetIsFinding", - Boolean(activeTarget && activeTarget.nodeType !== "overloadedModule") + Boolean(activeTarget && activeTarget.nodeType === "finding") ); void vscode.commands.executeCommand( "setContext", @@ -3225,6 +3656,11 @@ class CodeCloneController { "codeclone.activeReviewTargetIsOverloadedModule", Boolean(activeTarget && activeTarget.nodeType === "overloadedModule") ); + void vscode.commands.executeCommand( + "setContext", + "codeclone.activeReviewTargetIsSecuritySurface", + Boolean(activeTarget && activeTarget.nodeType === "securitySurface") + ); void vscode.commands.executeCommand( "setContext", "codeclone.hotspotFocusMode", diff --git a/extensions/vscode-codeclone/src/formatters.js b/extensions/vscode-codeclone/src/formatters.js index 953b63e..dac0226 100644 --- a/extensions/vscode-codeclone/src/formatters.js +++ b/extensions/vscode-codeclone/src/formatters.js @@ -54,6 +54,11 @@ function capitalize(value) { return value.charAt(0).toUpperCase() + value.slice(1); } +function humanizeIdentifier(value) { + const text = String(value || "").trim().replace(/_/g, " "); + return text ? capitalize(text) : ""; +} + function formatBooleanWord(value) { return value ? "yes" : "no"; } @@ -93,6 +98,10 @@ function coverageJoinPayload(metricsSummary) { return safeObject(safeObject(metricsSummary).coverage_join); } +function securitySurfacesPayload(metricsSummary) { + return safeObject(safeObject(metricsSummary).security_surfaces); +} + function countedNoun(value, singular, plural = `${singular}s`) { const normalized = typeof value === "number" && !Number.isNaN(value) ? value : 0; @@ -255,6 +264,19 @@ function reviewTargetKey(target) { if (target.nodeType === "overloadedModule" && safeObject(target.item).path) { return `overloaded:${String(target.item.path)}`; } + if (target.nodeType === "securitySurface") { + const item = safeObject(target.item); + const pathValue = String(item.path || "").trim(); + const qualnameValue = String(item.qualname || "").trim(); + const capabilityValue = String(item.capability || "").trim(); + const lineValue = + typeof item.start_line === "number" && !Number.isNaN(item.start_line) + ? 
item.start_line + : 0; + if (pathValue) { + return `security:${pathValue}:${lineValue}:${qualnameValue}:${capabilityValue}`; + } + } if (target.findingId) { return `finding:${String(target.findingId)}`; } @@ -303,9 +325,51 @@ function emptyReviewArtifacts() { productionHotspots: [], changedFiles: [], overloadedModules: [], + securitySurfaces: [], }; } +function formatSecuritySurfaceLocation(item) { + const entry = safeObject(item); + const pathValue = String(entry.path || "").trim(); + const startLine = + typeof entry.start_line === "number" && !Number.isNaN(entry.start_line) + ? entry.start_line + : null; + const endLine = + typeof entry.end_line === "number" && !Number.isNaN(entry.end_line) + ? entry.end_line + : null; + if (!pathValue) { + return "(unknown)"; + } + if (startLine === null || startLine <= 0) { + return pathValue; + } + if (endLine !== null && endLine > startLine) { + return `${pathValue}:${startLine}-${endLine}`; + } + return `${pathValue}:${startLine}`; +} + +function formatSecuritySurfaceReviewSignal(item) { + const entry = safeObject(item); + const scopeText = humanizeIdentifier(entry.location_scope || "unknown"); + if (entry.scope_gap_hotspot) { + return `${scopeText} · scope gap`; + } + if (entry.coverage_hotspot) { + return `${scopeText} · low coverage`; + } + if (entry.coverage_overlap) { + return `${scopeText} · coverage overlap`; + } + if (String(entry.location_scope || "").trim() === "module") { + return `${scopeText} · capability present`; + } + return `${scopeText} · exact evidence`; +} + /** * @param {unknown} value * @returns {FindingLocation[]} @@ -400,11 +464,14 @@ module.exports = { formatCoverageJoinPercent, formatCoverageJoinStatus, formatCoverageJoinSummary, + formatSecuritySurfaceLocation, + formatSecuritySurfaceReviewSignal, formatKind, formatNovelty, formatRunScope, formatSeverity, formatSourceKindSummary, + humanizeIdentifier, isSpecificFocusMode, normalizeFindingLocations, normalizeLocations, @@ -413,6 +480,7 @@ module.exports = { reviewTargetKey, safeArray, safeObject, + securitySurfacesPayload, sameLaunchSpec, treeAccessibilityInformation, workspaceRelativePath, diff --git a/extensions/vscode-codeclone/src/renderers.js b/extensions/vscode-codeclone/src/renderers.js index f7a4cc2..1ff42a2 100644 --- a/extensions/vscode-codeclone/src/renderers.js +++ b/extensions/vscode-codeclone/src/renderers.js @@ -12,8 +12,11 @@ const { formatBaselineTags, formatBaselineState, formatKind, + formatSecuritySurfaceLocation, + formatSecuritySurfaceReviewSignal, formatSeverity, formatSourceKindSummary, + humanizeIdentifier, normalizeLocations, number, safeArray, @@ -280,10 +283,62 @@ function renderOverloadedModuleMarkdown(item) { return lines.join("\n"); } +function renderSecuritySurfaceMarkdown(item) { + const entry = safeObject(item); + const location = formatSecuritySurfaceLocation(entry); + const category = humanizeIdentifier(entry.category || "unknown"); + const capability = humanizeIdentifier(entry.capability || "unknown"); + const sourceKind = humanizeIdentifier(entry.source_kind || "unknown"); + const scope = humanizeIdentifier(entry.location_scope || "unknown"); + const classification = humanizeIdentifier( + entry.classification_mode || "unknown" + ); + const evidence = String(entry.evidence_symbol || "(unknown)"); + const reviewSignal = formatSecuritySurfaceReviewSignal(entry); + const guidance = [ + "Treat this as a report-only boundary inventory entry, not as a vulnerability claim.", + entry.location_scope === "module" + ? 
"Trace the callable or entrypoint that consumes this module capability before refactoring it." + : "Review the exact callable behavior at this trust boundary before refactoring it.", + ]; + if (entry.scope_gap_hotspot) { + guidance.push( + "Coverage Join marks this callable as a scope gap, so validate the exercised path manually before change." + ); + } else if (entry.coverage_hotspot) { + guidance.push( + "Coverage Join marks this callable as low coverage, so inspect or add boundary-focused tests before change." + ); + } else if (entry.coverage_overlap) { + guidance.push( + "Coverage Join overlaps with this callable, so inspect the measured tests before changing boundary behavior." + ); + } + + return [ + "# Security Surface", + "", + `- Location: \`${location}\``, + `- Module: \`${entry.module || "unknown"}\``, + `- Symbol: \`${entry.qualname || entry.module || "unknown"}\``, + `- Category: ${category}`, + `- Capability: ${capability}`, + `- Evidence: \`${evidence}\``, + `- Source kind: ${sourceKind}`, + `- Scope: ${scope}`, + `- Classification: ${classification}`, + `- Review signal: ${reviewSignal}`, + "", + "## Review guidance", + markdownBulletList(guidance), + ].join("\n"); +} + module.exports = { markdownBulletList, renderFindingMarkdown, renderOverloadedModuleMarkdown, + renderSecuritySurfaceMarkdown, renderHelpMarkdown, renderRemediationMarkdown, renderRestrictedModeMarkdown, diff --git a/extensions/vscode-codeclone/src/runArtifacts.js b/extensions/vscode-codeclone/src/runArtifacts.js new file mode 100644 index 0000000..a30421e --- /dev/null +++ b/extensions/vscode-codeclone/src/runArtifacts.js @@ -0,0 +1,45 @@ +"use strict"; + +const {captureWorkspaceGitSnapshot} = require("./runtime"); + +function arrayItems(value) { + return Array.isArray(value) ? 
value : []; +} + +async function loadRunArtifacts( + client, + folder, + runId, + captureGitSnapshot = captureWorkspaceGitSnapshot +) { + const [summary, triage, metrics, reviewed, gitSnapshot] = await Promise.all([ + client.callTool("get_run_summary", { + run_id: runId, + }), + client.callTool("get_production_triage", { + run_id: runId, + max_hotspots: 5, + max_suggestions: 5, + }), + client.callTool("get_report_section", { + run_id: runId, + section: "metrics", + }), + client.callTool("list_reviewed_findings", { + run_id: runId, + }), + captureGitSnapshot(folder), + ]); + + return { + summary, + triage, + metricsSummary: metrics.summary || metrics, + reviewedItems: arrayItems(reviewed.items), + gitSnapshot, + }; +} + +module.exports = { + loadRunArtifacts, +}; diff --git a/extensions/vscode-codeclone/src/runtime.js b/extensions/vscode-codeclone/src/runtime.js index 1b1c11b..bdc9a56 100644 --- a/extensions/vscode-codeclone/src/runtime.js +++ b/extensions/vscode-codeclone/src/runtime.js @@ -57,11 +57,12 @@ async function pathExists(filePath) { } async function looksLikeCodeCloneRepo(folderPath) { - const [hasPyproject, hasServer] = await Promise.all([ + const [hasPyproject, hasLegacyServer, hasSurfaceServer] = await Promise.all([ pathExists(path.join(folderPath, "pyproject.toml")), pathExists(path.join(folderPath, "codeclone", "mcp_server.py")), + pathExists(path.join(folderPath, "codeclone", "surfaces", "mcp", "server.py")), ]); - return hasPyproject && hasServer; + return hasPyproject && (hasLegacyServer || hasSurfaceServer); } async function readFileHead(filePath, maxBytes = 16384) { diff --git a/extensions/vscode-codeclone/src/support.js b/extensions/vscode-codeclone/src/support.js index 98da9ca..f7117d3 100644 --- a/extensions/vscode-codeclone/src/support.js +++ b/extensions/vscode-codeclone/src/support.js @@ -9,7 +9,7 @@ const ANALYSIS_PROFILE_DEEPER_REVIEW = "deeperReview"; const ANALYSIS_PROFILE_CUSTOM = "custom"; const MINIMUM_SUPPORTED_CODECLONE_VERSION = "2.0.0b4"; const PREVIEW_INSTALL_COMMAND = - 'uv tool install "codeclone[mcp]>=2.0.0b4"'; + 'uv tool install --pre "codeclone[mcp]"'; const ANALYSIS_PROFILE_IDS = new Set([ ANALYSIS_PROFILE_DEFAULTS, ANALYSIS_PROFILE_DEEPER_REVIEW, diff --git a/extensions/vscode-codeclone/test/formatters.test.js b/extensions/vscode-codeclone/test/formatters.test.js index 5e8dd0e..14293e1 100644 --- a/extensions/vscode-codeclone/test/formatters.test.js +++ b/extensions/vscode-codeclone/test/formatters.test.js @@ -4,8 +4,11 @@ const test = require("node:test"); const assert = require("node:assert/strict"); const Module = require("node:module"); -const originalLoad = Module._load; -Module._load = function patchedLoad(request, parent, isMain) { +const moduleInternals = /** @type {{_load: Function}} */ ( + /** @type {unknown} */ (Module) +); +const originalLoad = moduleInternals._load; +moduleInternals._load = function patchedLoad(request, parent, isMain) { if (request === "vscode") { return { ThemeIcon: class ThemeIcon {}, @@ -21,9 +24,12 @@ const { formatCoverageJoinPercent, formatCoverageJoinStatus, formatCoverageJoinSummary, + formatSecuritySurfaceLocation, + formatSecuritySurfaceReviewSignal, + securitySurfacesPayload, } = require("../src/formatters"); -Module._load = originalLoad; +moduleInternals._load = originalLoad; test("coverage join formatters render joined summary from canonical metrics facts", () => { const payload = { @@ -62,3 +68,44 @@ test("coverage join payload normalizes missing or null metrics family entries", status: "ok", }); 
}); + +test("security surfaces formatters keep summary payloads and review cues explicit", () => { + assert.deepEqual(securitySurfacesPayload(undefined), {}); + assert.deepEqual(securitySurfacesPayload({}), {}); + assert.deepEqual( + securitySurfacesPayload({ + security_surfaces: { + items: 5, + production: 3, + report_only: true, + }, + }), + { + items: 5, + production: 3, + report_only: true, + } + ); + + assert.equal( + formatSecuritySurfaceLocation({ + path: "pkg/client.py", + start_line: 12, + end_line: 18, + }), + "pkg/client.py:12-18" + ); + assert.equal( + formatSecuritySurfaceReviewSignal({ + location_scope: "callable", + coverage_hotspot: true, + }), + "Callable · low coverage" + ); + assert.equal( + formatSecuritySurfaceReviewSignal({ + location_scope: "module", + }), + "Module · capability present" + ); +}); diff --git a/extensions/vscode-codeclone/test/renderers.test.js b/extensions/vscode-codeclone/test/renderers.test.js index a81214a..023ea0b 100644 --- a/extensions/vscode-codeclone/test/renderers.test.js +++ b/extensions/vscode-codeclone/test/renderers.test.js @@ -4,8 +4,11 @@ const test = require("node:test"); const assert = require("node:assert/strict"); const Module = require("node:module"); -const originalLoad = Module._load; -Module._load = function patchedLoad(request, parent, isMain) { +const moduleInternals = /** @type {{_load: Function}} */ ( + /** @type {unknown} */ (Module) +); +const originalLoad = moduleInternals._load; +moduleInternals._load = function patchedLoad(request, parent, isMain) { if (request === "vscode") { return { ThemeIcon: class ThemeIcon {}, @@ -19,9 +22,12 @@ const { formatBaselineState, formatBaselineTags, } = require("../src/formatters"); -const {renderTriageMarkdown} = require("../src/renderers"); +const { + renderSecuritySurfaceMarkdown, + renderTriageMarkdown, +} = require("../src/renderers"); -Module._load = originalLoad; +moduleInternals._load = originalLoad; test("formatBaselineState explains comparison without a valid baseline", () => { assert.equal( @@ -78,3 +84,28 @@ test("renderTriageMarkdown surfaces baseline mismatch context compactly", () => ); assert.match(markdown, /Baseline tags: baseline cp313 · runtime cp314/); }); + +test("renderSecuritySurfaceMarkdown keeps report-only security posture explicit", () => { + const markdown = renderSecuritySurfaceMarkdown({ + path: "pkg/client.py", + start_line: 42, + end_line: 47, + module: "pkg.client", + qualname: "pkg.client:send", + category: "network_boundary", + capability: "requests_call", + evidence_symbol: "requests.post", + source_kind: "production", + location_scope: "callable", + classification_mode: "exact_call", + coverage_overlap: true, + scope_gap_hotspot: true, + }); + + assert.match(markdown, /# Security Surface/); + assert.match(markdown, /Location: `pkg\/client.py:42-47`/); + assert.match(markdown, /Category: Network boundary/); + assert.match(markdown, /Review signal: Callable · scope gap/); + assert.match(markdown, /not as a vulnerability claim/); + assert.match(markdown, /Coverage Join marks this callable as a scope gap/); +}); diff --git a/extensions/vscode-codeclone/test/runArtifacts.test.js b/extensions/vscode-codeclone/test/runArtifacts.test.js new file mode 100644 index 0000000..b5ab482 --- /dev/null +++ b/extensions/vscode-codeclone/test/runArtifacts.test.js @@ -0,0 +1,67 @@ +"use strict"; + +const test = require("node:test"); +const assert = require("node:assert/strict"); + +const {loadRunArtifacts} = require("../src/runArtifacts"); + +test("loadRunArtifacts 
starts MCP reads and git snapshot together", async () => {
+  const started = [];
+  /** @type {Map<string, (value: any) => void>} */
+  const resolvers = new Map();
+  const client = {
+    callTool(method, payload) {
+      started.push([method, payload]);
+      return new Promise((resolve) => {
+        resolvers.set(method, resolve);
+      });
+    },
+  };
+  let gitSnapshotStarted = false;
+  /** @type {(value: any) => void} */
+  let resolveGitSnapshot = () => {};
+
+  const promise = loadRunArtifacts(
+    client,
+    {uri: {fsPath: "/workspace/repo"}},
+    "run-123",
+    () =>
+      new Promise((resolve) => {
+        gitSnapshotStarted = true;
+        resolveGitSnapshot = resolve;
+      })
+  );
+
+  assert.deepEqual(
+    started.map(([method]) => method),
+    [
+      "get_run_summary",
+      "get_production_triage",
+      "get_report_section",
+      "list_reviewed_findings",
+    ]
+  );
+  assert.equal(gitSnapshotStarted, true);
+
+  const resolveSummary = resolvers.get("get_run_summary");
+  const resolveTriage = resolvers.get("get_production_triage");
+  const resolveMetrics = resolvers.get("get_report_section");
+  const resolveReviewed = resolvers.get("list_reviewed_findings");
+  assert.ok(resolveSummary);
+  assert.ok(resolveTriage);
+  assert.ok(resolveMetrics);
+  assert.ok(resolveReviewed);
+  resolveSummary({version: "2.0.0b6"});
+  resolveTriage({hotspots: []});
+  resolveMetrics({summary: {health: {score: 90}}});
+  resolveReviewed({items: [{id: "f1"}]});
+  resolveGitSnapshot({head: "abc123"});
+
+  assert.deepEqual(await promise, {
+    summary: {version: "2.0.0b6"},
+    triage: {hotspots: []},
+    metricsSummary: {health: {score: 90}},
+    reviewedItems: [{id: "f1"}],
+    gitSnapshot: {head: "abc123"},
+  });
+});
diff --git a/extensions/vscode-codeclone/test/runtime.test.js b/extensions/vscode-codeclone/test/runtime.test.js
new file mode 100644
index 0000000..27d17fe
--- /dev/null
+++ b/extensions/vscode-codeclone/test/runtime.test.js
@@ -0,0 +1,43 @@
+"use strict";
+
+const test = require("node:test");
+const assert = require("node:assert/strict");
+const fs = require("node:fs");
+const os = require("node:os");
+const path = require("node:path");
+
+const {looksLikeCodeCloneRepo} = require("../src/runtime");
+
+test("looksLikeCodeCloneRepo accepts the current MCP surface layout", async () => {
+  const root = fs.mkdtempSync(path.join(os.tmpdir(), "codeclone-vscode-runtime-"));
+  fs.writeFileSync(path.join(root, "pyproject.toml"), "[project]\nname='codeclone'\n");
+  fs.mkdirSync(path.join(root, "codeclone", "surfaces", "mcp"), {recursive: true});
+  fs.writeFileSync(
+    path.join(root, "codeclone", "surfaces", "mcp", "server.py"),
+    "# marker\n"
+  );
+
+  await assert.doesNotReject(async () => {
+    assert.equal(await looksLikeCodeCloneRepo(root), true);
+  });
+});
+
+test("looksLikeCodeCloneRepo still accepts the legacy MCP server path", async () => {
+  const root = fs.mkdtempSync(path.join(os.tmpdir(), "codeclone-vscode-runtime-"));
+  fs.writeFileSync(path.join(root, "pyproject.toml"), "[project]\nname='codeclone'\n");
+  fs.mkdirSync(path.join(root, "codeclone"), {recursive: true});
+  fs.writeFileSync(path.join(root, "codeclone", "mcp_server.py"), "# legacy marker\n");
+
+  await assert.doesNotReject(async () => {
+    assert.equal(await looksLikeCodeCloneRepo(root), true);
+  });
+});
+
+test("looksLikeCodeCloneRepo rejects non-CodeClone workspaces", async () => {
+  const root = fs.mkdtempSync(path.join(os.tmpdir(), "codeclone-vscode-runtime-"));
+  fs.writeFileSync(path.join(root, "pyproject.toml"), "[project]\nname='other'\n");
+
+  await assert.doesNotReject(async () => {
+    assert.equal(await
looksLikeCodeCloneRepo(root), false); + }); +}); diff --git a/extensions/vscode-codeclone/test/support.test.js b/extensions/vscode-codeclone/test/support.test.js index 8045d45..5ca4f77 100644 --- a/extensions/vscode-codeclone/test/support.test.js +++ b/extensions/vscode-codeclone/test/support.test.js @@ -330,6 +330,6 @@ test("minimum supported CodeClone version and install command stay aligned", () assert.equal(isMinimumSupportedCodeCloneVersion("1.27.0"), false); assert.equal( PREVIEW_INSTALL_COMMAND, - 'uv tool install "codeclone[mcp]>=2.0.0b4"' + 'uv tool install --pre "codeclone[mcp]"' ); }); diff --git a/mkdocs.yml b/mkdocs.yml index 213a1a5..a941452 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -18,6 +18,7 @@ theme: - search.suggest - search.highlight - content.code.copy + - content.tabs.link palette: - media: "(prefers-color-scheme: light)" scheme: default diff --git a/plugins/codeclone/.codex-plugin/plugin.json b/plugins/codeclone/.codex-plugin/plugin.json index a0ce462..2f737fa 100644 --- a/plugins/codeclone/.codex-plugin/plugin.json +++ b/plugins/codeclone/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "codeclone", - "version": "2.0.0-b5.0", + "version": "2.0.0-b6.0", "description": "Baseline-aware structural code quality analysis for Codex through the local CodeClone MCP server.", "author": { "name": "Den Rozhnovskiy", diff --git a/plugins/codeclone/.mcp.json b/plugins/codeclone/.mcp.json index 488a9cf..31e5934 100644 --- a/plugins/codeclone/.mcp.json +++ b/plugins/codeclone/.mcp.json @@ -1,10 +1,9 @@ { "mcpServers": { "codeclone": { - "command": "sh", + "command": "python3", "args": [ - "-lc", - "if [ -x \"$PWD/.venv/bin/codeclone-mcp\" ]; then exec \"$PWD/.venv/bin/codeclone-mcp\" --transport stdio; fi; if command -v poetry >/dev/null 2>&1; then poetry_env=\"$(poetry env info -p 2>/dev/null || true)\"; if [ -n \"$poetry_env\" ] && [ -x \"$poetry_env/bin/codeclone-mcp\" ]; then exec \"$poetry_env/bin/codeclone-mcp\" --transport stdio; fi; fi; if command -v codeclone-mcp >/dev/null 2>&1; then exec codeclone-mcp --transport stdio; fi; echo 'codeclone-mcp not found: expected .venv/bin/codeclone-mcp, a Poetry env launcher, or a PATH entry' >&2; exit 1" + "./scripts/launch_mcp" ] } } diff --git a/plugins/codeclone/README.md b/plugins/codeclone/README.md index b2f4f23..ba96331 100644 --- a/plugins/codeclone/README.md +++ b/plugins/codeclone/README.md @@ -12,11 +12,15 @@ directly, including `Coverage Join` facts and the optional `coverage` help topic | File | Purpose | |------------------------------|----------------------------------------------------| | `.codex-plugin/plugin.json` | Plugin metadata and prompts | -| `.mcp.json` | Local `codeclone-mcp --transport stdio` definition | +| `.mcp.json` | Local stdio MCP definition | +| `scripts/launch_mcp` | Shell-free workspace-first launcher bootstrap | | `skills/codeclone-review/` | Conservative-first full review skill | | `skills/codeclone-hotspots/` | Quick hotspot discovery skill | | `assets/` | Plugin branding | +`plugin.json` keeps the machine identifier as lowercase `codeclone`; the +user-facing label stays in `interface.displayName` as `CodeClone`. + ## Install The plugin prefers a workspace launcher first: @@ -25,11 +29,14 @@ The plugin prefers a workspace launcher first: 2. the current Poetry environment launcher 3. `codeclone-mcp` from `PATH` +The bundled Codex launcher is a small repo-local Python wrapper, not a shell +snippet. It keeps the same workspace-first order without relying on `sh -lc`. 
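
For orientation, a minimal sketch of that workspace-first resolution order follows. It is illustrative only: the helper name `pick_launcher` is invented here, and the authoritative logic (including Windows layouts, multiple candidate roots, and Poetry probing) is the `launch_mcp.py` resolver shown later in this diff.

```python
# Simplified, illustrative sketch of the workspace-first launcher order.
# The real resolver lives in plugins/codeclone/scripts/launch_mcp.py.
import shutil
from pathlib import Path


def pick_launcher(root: Path) -> str | None:
    # 1. Prefer the workspace-local virtualenv launcher.
    local = root / ".venv" / "bin" / "codeclone-mcp"
    if local.is_file():
        return str(local)
    # 2. A Poetry environment launcher would be probed here (elided).
    # 3. Fall back to a codeclone-mcp entry on PATH.
    return shutil.which("codeclone-mcp")
```
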
+ Recommended workspace-local setup: ```bash uv venv -uv pip install --python .venv/bin/python "codeclone[mcp]>=2.0.0b4" +uv pip install --python .venv/bin/python --pre "codeclone[mcp]" .venv/bin/codeclone-mcp --help ``` @@ -38,7 +45,7 @@ If your workspace uses Poetry, install CodeClone into that Poetry environment. Global fallback: ```bash -uv tool install "codeclone[mcp]>=2.0.0b4" +uv tool install --pre "codeclone[mcp]" codeclone-mcp --help ``` diff --git a/plugins/codeclone/scripts/launch_mcp b/plugins/codeclone/scripts/launch_mcp new file mode 100644 index 0000000..8fc2c74 --- /dev/null +++ b/plugins/codeclone/scripts/launch_mcp @@ -0,0 +1,9 @@ +from __future__ import annotations + +import runpy +from pathlib import Path + +PLUGIN_LAUNCHER = Path(__file__).with_name("launch_mcp.py") + +if __name__ == "__main__": + runpy.run_path(str(PLUGIN_LAUNCHER), run_name="__main__") diff --git a/plugins/codeclone/scripts/launch_mcp.py b/plugins/codeclone/scripts/launch_mcp.py new file mode 100644 index 0000000..8ba6a2b --- /dev/null +++ b/plugins/codeclone/scripts/launch_mcp.py @@ -0,0 +1,202 @@ +from __future__ import annotations + +import os +import shutil +import subprocess +import sys +from collections.abc import Callable, Mapping +from dataclasses import dataclass +from pathlib import Path + +POETRY_TIMEOUT_SECONDS = 5 +PLUGIN_ROOT = Path(__file__).resolve().parents[1] +REPO_ROOT = PLUGIN_ROOT.parents[1] +TRANSPORT_ARGS = ("--transport", "stdio") + + +@dataclass(frozen=True) +class LaunchTarget: + command: str + source: str + workspace_root: Path | None + + +def _normalized_env_value(value: str | None) -> str | None: + text = str(value or "").strip() + return text or None + + +def workspace_roots( + *, + env: Mapping[str, str], + cwd: str | None = None, + repo_root: Path = REPO_ROOT, +) -> tuple[Path, ...]: + candidates = ( + _normalized_env_value(env.get("CODECLONE_WORKSPACE_ROOT")), + _normalized_env_value(cwd if cwd is not None else os.getcwd()), + _normalized_env_value(env.get("PWD")), + str(repo_root), + ) + roots: list[Path] = [] + seen: set[str] = set() + for candidate in candidates: + if candidate is not None: + resolved = Path(candidate).resolve() + key = os.path.normcase(str(resolved)) + if key not in seen: + seen.add(key) + roots.append(resolved) + return tuple(roots) + + +def workspace_local_launcher_candidates(root: Path) -> tuple[Path, ...]: + if os.name == "nt": + return ( + root / ".venv" / "Scripts" / "codeclone-mcp.exe", + root / ".venv" / "Scripts" / "codeclone-mcp.cmd", + root / "venv" / "Scripts" / "codeclone-mcp.exe", + root / "venv" / "Scripts" / "codeclone-mcp.cmd", + ) + return ( + root / ".venv" / "bin" / "codeclone-mcp", + root / "venv" / "bin" / "codeclone-mcp", + ) + + +def resolve_workspace_local_launcher( + roots: tuple[Path, ...], +) -> LaunchTarget | None: + for root in roots: + for candidate in workspace_local_launcher_candidates(root): + if candidate.is_file(): + return LaunchTarget( + command=str(candidate), + source="workspaceLocal", + workspace_root=root, + ) + return None + + +def resolve_poetry_launcher( + *, + roots: tuple[Path, ...], + env: Mapping[str, str], + run_cmd: Callable[..., subprocess.CompletedProcess[str]] = subprocess.run, + which: Callable[[str], str | None] = shutil.which, +) -> LaunchTarget | None: + if which("poetry") is None: + return None + executable = "codeclone-mcp.exe" if os.name == "nt" else "codeclone-mcp" + script_dir = "Scripts" if os.name == "nt" else "bin" + for root in roots: + candidate = 
resolve_poetry_env_root(root=root, env=env, run_cmd=run_cmd) + if candidate is None: + continue + candidate = candidate / script_dir / executable + if candidate.is_file(): + return LaunchTarget( + command=str(candidate), + source="poetryEnv", + workspace_root=root, + ) + return None + + +def resolve_poetry_env_root( + *, + root: Path, + env: Mapping[str, str], + run_cmd: Callable[..., subprocess.CompletedProcess[str]], +) -> Path | None: + if not (root / "pyproject.toml").is_file(): + return None + try: + completed = run_cmd( + ["poetry", "env", "info", "-p"], + cwd=str(root), + env=dict(env), + capture_output=True, + text=True, + check=False, + timeout=POETRY_TIMEOUT_SECONDS, + ) + except (OSError, subprocess.TimeoutExpired): + return None + poetry_env_root = completed.stdout.strip() + if completed.returncode != 0 or not poetry_env_root: + return None + return Path(poetry_env_root) + + +def resolve_path_launcher( + *, + roots: tuple[Path, ...], + which: Callable[[str], str | None] = shutil.which, +) -> LaunchTarget | None: + resolved = which("codeclone-mcp") + if not resolved: + return None + return LaunchTarget( + command=resolved, + source="path", + workspace_root=roots[0] if roots else None, + ) + + +def resolve_launch_target( + *, + env: Mapping[str, str], + cwd: str | None = None, + repo_root: Path = REPO_ROOT, + run_cmd: Callable[..., subprocess.CompletedProcess[str]] = subprocess.run, + which: Callable[[str], str | None] = shutil.which, +) -> LaunchTarget | None: + roots = workspace_roots(env=env, cwd=cwd, repo_root=repo_root) + return ( + resolve_workspace_local_launcher(roots) + or resolve_poetry_launcher(roots=roots, env=env, run_cmd=run_cmd, which=which) + or resolve_path_launcher(roots=roots, which=which) + ) + + +def build_setup_message() -> str: + return ( + "CodeClone launcher not found. Expected a workspace .venv launcher, " + "a Poetry environment launcher, or a PATH entry for codeclone-mcp." 
+ ) + + +def exec_launch_target(target: LaunchTarget, env: Mapping[str, str]) -> None: + child_env = dict(env) + if target.workspace_root is not None and not _normalized_env_value( + child_env.get("CODECLONE_WORKSPACE_ROOT") + ): + child_env["CODECLONE_WORKSPACE_ROOT"] = str(target.workspace_root) + argv = [target.command, *TRANSPORT_ARGS] + os.execvpe(target.command, argv, child_env) + + +def main() -> int: + target = resolve_launch_target(env=os.environ) + if target is None: + sys.stderr.write(f"[codeclone] {build_setup_message()}\n") + return 2 + workspace_root = ( + str(target.workspace_root) if target.workspace_root is not None else "" + ) + sys.stderr.write( + "[codeclone] launcher " + f"source={target.source} command={target.command} " + f"workspace_root={workspace_root}\n" + ) + try: + exec_launch_target(target, os.environ) + except OSError as exc: + sys.stderr.write(f"[codeclone] {exc}\n") + return 2 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pyproject.toml b/pyproject.toml index fa7abdd..ac18a7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta" [project] name = "codeclone" -version = "2.0.0b5" +version = "2.0.0b6" description = "Structural code quality analysis for Python" readme = { file = "README.md", content-type = "text/markdown" } license = "MPL-2.0 AND MIT" -license-files = ["LICENSE", "LICENSE-docs"] +license-files = ["LICENSE", "LICENSE-MIT"] authors = [ { name = "Den Rozhnovskiy", email = "pytelemonbot@mail.ru" } @@ -76,17 +76,43 @@ dev = [ ] [project.scripts] -codeclone = "codeclone.cli:main" -codeclone-mcp = "codeclone.mcp_server:main" +codeclone = "codeclone.main:main" +codeclone-mcp = "codeclone.surfaces.mcp.server:main" [tool.setuptools] packages = [ "codeclone", - "codeclone._html_report", - "codeclone._html_report._sections", + "codeclone.analysis", + "codeclone.baseline", + "codeclone.blocks", + "codeclone.cache", + "codeclone.config", + "codeclone.contracts", + "codeclone.core", + "codeclone.findings", + "codeclone.findings.clones", + "codeclone.findings.structural", "codeclone.domain", + "codeclone.meta_markers", "codeclone.metrics", + "codeclone.paths", + "codeclone.qualnames", "codeclone.report", + "codeclone.report.document", + "codeclone.report.gates", + "codeclone.report.html", + "codeclone.report.html.assets", + "codeclone.report.html.primitives", + "codeclone.report.html.sections", + "codeclone.report.html.widgets", + "codeclone.report.renderers", + "codeclone.scanner", + "codeclone.surfaces", + "codeclone.surfaces.cli", + "codeclone.surfaces.mcp", + "codeclone.surfaces.mcp.tools", + "codeclone.ui_messages", + "codeclone.utils", ] [tool.setuptools.package-data] @@ -107,9 +133,9 @@ min_stmt = 4 fail_on_new = true fail_cycles = true fail_dead_code = true -fail_health = 87 +fail_health = 80 fail_on_new_metrics = true -api_surface = false +api_surface = true golden_fixture_paths = ["tests/fixtures/golden_*"] min_typing_coverage = 99 @@ -136,9 +162,8 @@ target-version = "py310" select = ["E", "F", "W", "I", "B", "UP", "SIM", "C4", "PIE", "PERF", "RUF"] [tool.ruff.lint.per-file-ignores] -"codeclone/_html_css.py" = ["E501"] -"codeclone/_html_js.py" = ["E501"] -"codeclone/_html_report/_sections/*.py" = ["E501"] +"codeclone/report/html/assets/*.py" = ["E501"] +"codeclone/report/html/sections/*.py" = ["E501"] [tool.ruff.format] quote-style = "double" diff --git a/scripts/build_docs_example_report.py b/scripts/build_docs_example_report.py index b003fd2..805adcc 100644 
--- a/scripts/build_docs_example_report.py +++ b/scripts/build_docs_example_report.py @@ -21,6 +21,7 @@ from codeclone import __version__ DEFAULT_OUTPUT_DIR = Path("site/examples/report/live") +CODECLONE_CLI_MODULE = "codeclone.main" @dataclass(frozen=True) @@ -61,7 +62,7 @@ def _run_codeclone(scan_root: Path, artifacts: ReportArtifacts) -> None: cmd = [ sys.executable, "-m", - "codeclone.cli", + CODECLONE_CLI_MODULE, str(scan_root), "--html", str(artifacts.html), diff --git a/scripts/launch_mcp b/scripts/launch_mcp new file mode 100644 index 0000000..ec6718b --- /dev/null +++ b/scripts/launch_mcp @@ -0,0 +1,15 @@ +from __future__ import annotations + +import runpy +from pathlib import Path + +PLUGIN_LAUNCHER = ( + Path(__file__).resolve().parents[1] + / "plugins" + / "codeclone" + / "scripts" + / "launch_mcp.py" +) + +if __name__ == "__main__": + runpy.run_path(str(PLUGIN_LAUNCHER), run_name="__main__") diff --git a/tests/_ast_metrics_helpers.py b/tests/_ast_metrics_helpers.py index cd75a99..fe5221d 100644 --- a/tests/_ast_metrics_helpers.py +++ b/tests/_ast_metrics_helpers.py @@ -8,7 +8,7 @@ import ast -from codeclone import extractor +from codeclone.analysis import _module_walk as module_walk_mod from codeclone.qualnames import QualnameCollector @@ -20,7 +20,7 @@ def tree_collector_and_imports( tree = ast.parse(source) collector = QualnameCollector() collector.visit(tree) - walk = extractor._collect_module_walk_data( + walk = module_walk_mod._collect_module_walk_data( tree=tree, module_name=module_name, collector=collector, diff --git a/tests/_contract_snapshots.py b/tests/_contract_snapshots.py new file mode 100644 index 0000000..b6842fb --- /dev/null +++ b/tests/_contract_snapshots.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import json +from pathlib import Path + +_CONTRACT_SNAPSHOT_ROOT = ( + Path(__file__).resolve().parent / "fixtures" / "contract_snapshots" +) + + +def load_json_snapshot(name: str) -> object: + path = _CONTRACT_SNAPSHOT_ROOT / name + return json.loads(path.read_text(encoding="utf-8")) + + +def load_text_snapshot(name: str) -> str: + path = _CONTRACT_SNAPSHOT_ROOT / name + return path.read_text(encoding="utf-8").replace("\r\n", "\n") diff --git a/tests/_import_graph.py b/tests/_import_graph.py new file mode 100644 index 0000000..186941d --- /dev/null +++ b/tests/_import_graph.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import ast +from pathlib import Path + + +def _module_name_from_path(path: Path) -> str: + parts = list(path.with_suffix("").parts) + return ".".join(parts) + + +def _resolve_import(module_name: str, node: ast.ImportFrom) -> str: + if node.level == 0: + return node.module or "" + + parts = module_name.split(".") + prefix_parts = parts[: -node.level] + if node.module: + return ".".join([*prefix_parts, node.module]) + return ".".join(prefix_parts) + + +def _iter_local_imports(module_name: str, source: str) -> list[str]: + tree = ast.parse(source) + imports: list[str] = [] + for node in ast.walk(tree): + if isinstance(node, ast.Import): + imports.extend(alias.name for alias in node.names) + elif isinstance(node, ast.ImportFrom): + imports.append(_resolve_import(module_name, node)) + return [name for name in imports if name.startswith("codeclone")] diff --git a/tests/conftest.py b/tests/conftest.py index a497dcf..628c29f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,10 +6,12 @@ from __future__ import annotations +import sys from collections.abc import Callable import pytest +from codeclone.baseline.trust 
import current_python_tag
 from codeclone.contracts import CACHE_VERSION, REPORT_SCHEMA_VERSION
 
 ReportMetaFactory = Callable[..., dict[str, object]]
@@ -18,15 +20,17 @@
 @pytest.fixture
 def report_meta_factory() -> ReportMetaFactory:
     def _make(**overrides: object) -> dict[str, object]:
+        runtime_tag = current_python_tag()
+        runtime_version = f"{sys.version_info.major}.{sys.version_info.minor}"
         meta: dict[str, object] = {
             "report_schema_version": REPORT_SCHEMA_VERSION,
             "codeclone_version": "1.4.0",
-            "python_version": "3.13",
-            "python_tag": "cp313",
+            "python_version": runtime_version,
+            "python_tag": runtime_tag,
             "baseline_path": "/repo/codeclone.baseline.json",
             "baseline_fingerprint_version": "1",
             "baseline_schema_version": "1.0",
-            "baseline_python_tag": "cp313",
+            "baseline_python_tag": runtime_tag,
             "baseline_generator_name": "codeclone",
             "baseline_generator_version": "1.4.0",
             "baseline_payload_sha256": "a" * 64,
diff --git a/tests/fixtures/contract_snapshots/cli_help.txt b/tests/fixtures/contract_snapshots/cli_help.txt
new file mode 100644
index 0000000..5f5de73
--- /dev/null
+++ b/tests/fixtures/contract_snapshots/cli_help.txt
@@ -0,0 +1,197 @@
+usage: codeclone [--min-loc MIN_LOC] [--min-stmt MIN_STMT]
+                 [--processes PROCESSES] [--changed-only | --no-changed-only]
+                 [--diff-against GIT_REF] [--paths-from-git-diff GIT_REF]
+                 [--cache-path [FILE]] [--cache-dir [FILE]]
+                 [--max-cache-size-mb MB] [--baseline [FILE]]
+                 [--max-baseline-size-mb MB]
+                 [--update-baseline | --no-update-baseline]
+                 [--metrics-baseline [FILE]]
+                 [--update-metrics-baseline | --no-update-metrics-baseline]
+                 [--ci | --no-ci] [--api-surface | --no-api-surface]
+                 [--coverage FILE] [--fail-on-new | --no-fail-on-new]
+                 [--fail-on-new-metrics | --no-fail-on-new-metrics]
+                 [--fail-threshold MAX_CLONES] [--fail-complexity [CC_MAX]]
+                 [--fail-coupling [CBO_MAX]] [--fail-cohesion [LCOM4_MAX]]
+                 [--fail-cycles | --no-fail-cycles]
+                 [--fail-dead-code | --no-fail-dead-code]
+                 [--fail-health [SCORE_MIN]]
+                 [--fail-on-typing-regression | --no-fail-on-typing-regression]
+                 [--fail-on-docstring-regression | --no-fail-on-docstring-regression]
+                 [--fail-on-api-break | --no-fail-on-api-break]
+                 [--fail-on-untested-hotspots | --no-fail-on-untested-hotspots]
+                 [--min-typing-coverage PERCENT]
+                 [--min-docstring-coverage PERCENT] [--coverage-min PERCENT]
+                 [--skip-metrics | --no-skip-metrics]
+                 [--skip-dead-code | --no-skip-dead-code]
+                 [--skip-dependencies | --no-skip-dependencies]
+                 [--html [FILE]] [--json [FILE]] [--md [FILE]]
+                 [--sarif [FILE]] [--text [FILE]]
+                 [--timestamped-report-paths | --no-timestamped-report-paths]
+                 [--open-html-report | --no-open-html-report] [--no-progress]
+                 [--progress] [--no-color] [--color] [--quiet | --no-quiet]
+                 [--verbose | --no-verbose] [--debug | --no-debug] [-h]
+                 [--version]
+                 [root]
+
+Structural code quality analysis for Python.
+
+Target:
+  root                  Project root directory to scan.
+                        Defaults to the current directory.
+
+Analysis:
+  --min-loc MIN_LOC     Minimum Lines of Code (LOC) required for clone analysis.
+                        Default: 10.
+  --min-stmt MIN_STMT   Minimum AST statement count required for clone analysis.
+                        Default: 6.
+  --processes PROCESSES
+                        Number of parallel worker processes.
+                        Default: 4.
+  --changed-only, --no-changed-only
+                        Limit clone gating and changed-scope summaries to findings that touch
+                        files from a git diff selection.
+  --diff-against GIT_REF
+                        Resolve changed files from `git diff --name-only <GIT_REF>`.
+                        Use together with --changed-only.
+  --paths-from-git-diff GIT_REF
+                        Shorthand for --changed-only using `git diff --name-only <GIT_REF>`.
+                        Useful for PR and CI review flows.
+  --cache-path [FILE]   Path to the cache file.
+                        If FILE is omitted, uses <root>/.cache/codeclone/cache.json.
+  --cache-dir [FILE]    Legacy alias for --cache-path.
+                        Prefer --cache-path in new configurations.
+  --max-cache-size-mb MB
+                        Maximum cache file size in MB.
+                        Default: 50.
+
+Baselines and CI:
+  --baseline [FILE]     Path to the clone baseline.
+                        If FILE is omitted, uses codeclone.baseline.json.
+  --max-baseline-size-mb MB
+                        Maximum allowed baseline size in MB.
+                        Default: 5.
+  --update-baseline, --no-update-baseline
+                        Overwrite the clone baseline with current results.
+                        Disabled by default.
+  --metrics-baseline [FILE]
+                        Path to the metrics baseline.
+                        If FILE is omitted, uses codeclone.baseline.json.
+  --update-metrics-baseline, --no-update-metrics-baseline
+                        Overwrite the metrics baseline with current metrics.
+                        Disabled by default.
+  --ci, --no-ci         Enable CI preset.
+                        Equivalent to: --fail-on-new --no-color --quiet.
+                        When a trusted metrics baseline is available, CI mode also enables
+                        metrics regression gating.
+  --api-surface, --no-api-surface
+                        Collect public API surface facts for baseline-aware compatibility review.
+                        Disabled by default.
+  --coverage FILE       Join external Cobertura XML line coverage to function spans.
+                        Pass a `coverage xml` report path.
+
+Quality gates:
+  --fail-on-new, --no-fail-on-new
+                        Exit with code 3 if NEW clone findings not present in the baseline
+                        are detected.
+  --fail-on-new-metrics, --no-fail-on-new-metrics
+                        Exit with code 3 if new metrics violations appear relative to the
+                        metrics baseline.
+  --fail-threshold MAX_CLONES
+                        Exit with code 3 if the total number of function + block clone groups
+                        exceeds this value.
+                        Disabled unless set.
+  --fail-complexity [CC_MAX]
+                        Exit with code 3 if any function exceeds the cyclomatic complexity
+                        threshold.
+                        If enabled without a value, uses 20.
+  --fail-coupling [CBO_MAX]
+                        Exit with code 3 if any class exceeds the coupling threshold.
+                        If enabled without a value, uses 10.
+  --fail-cohesion [LCOM4_MAX]
+                        Exit with code 3 if any class exceeds the cohesion threshold.
+                        If enabled without a value, uses 4.
+  --fail-cycles, --no-fail-cycles
+                        Exit with code 3 if circular module dependencies are detected.
+  --fail-dead-code, --no-fail-dead-code
+                        Exit with code 3 if high-confidence dead code is detected.
+  --fail-health [SCORE_MIN]
+                        Exit with code 3 if the overall health score falls below the threshold.
+                        If enabled without a value, uses 60.
+  --fail-on-typing-regression, --no-fail-on-typing-regression
+                        Exit with code 3 if typing adoption coverage regresses relative to the
+                        metrics baseline.
+  --fail-on-docstring-regression, --no-fail-on-docstring-regression
+                        Exit with code 3 if public docstring coverage regresses relative to the
+                        metrics baseline.
+  --fail-on-api-break, --no-fail-on-api-break
+                        Exit with code 3 if public API removals or signature breaks are detected
+                        relative to the metrics baseline.
+  --fail-on-untested-hotspots, --no-fail-on-untested-hotspots
+                        Exit with code 3 if medium/high-risk functions measured by Coverage Join
+                        fall below the joined coverage threshold.
+                        Requires --coverage.
+  --min-typing-coverage PERCENT
+                        Exit with code 3 if parameter typing coverage falls below the threshold.
+                        Threshold is a whole percent from 0 to 100.
+  --min-docstring-coverage PERCENT
+                        Exit with code 3 if public docstring coverage falls below the threshold.
+ Threshold is a whole percent from 0 to 100. + --coverage-min PERCENT + Coverage threshold for untested hotspot detection. + Threshold is a whole percent from 0 to 100. + Default: 50. + +Analysis stages: + --skip-metrics, --no-skip-metrics + Skip full metrics analysis and run in clone-only mode. + --skip-dead-code, --no-skip-dead-code + Skip dead code detection. + --skip-dependencies, --no-skip-dependencies + Skip dependency graph analysis. + +Reporting: + --html [FILE] Generate an HTML report. + If FILE is omitted, writes to .cache/codeclone/report.html. + --json [FILE] Generate the canonical JSON report. + If FILE is omitted, writes to .cache/codeclone/report.json. + --md [FILE] Generate a Markdown report. + If FILE is omitted, writes to .cache/codeclone/report.md. + --sarif [FILE] Generate a SARIF 2.1.0 report. + If FILE is omitted, writes to .cache/codeclone/report.sarif. + --text [FILE] Generate a plain-text report. + If FILE is omitted, writes to .cache/codeclone/report.txt. + --timestamped-report-paths, --no-timestamped-report-paths + Append a UTC timestamp to default report filenames. + Applies only to report flags passed without FILE. + +Output and UI: + --open-html-report, --no-open-html-report + Open the generated HTML report in the default browser. + Requires --html. + --no-progress Disable progress output. + Recommended for CI logs. + --progress Force-enable progress output. + --no-color Disable ANSI colors. + --color Force-enable ANSI colors. + --quiet, --no-quiet Reduce output to warnings, errors, and essential summaries. + --verbose, --no-verbose + Include detailed identifiers for NEW clone findings. + --debug, --no-debug Print debug details for internal errors, including traceback and + environment information. + +General: + -h, --help Show this help message and exit. + --version Print the CodeClone version and exit. + +Exit codes: + 0 Success. + 2 Contract error: untrusted or invalid baseline, invalid output + configuration, incompatible versions, or unreadable sources in + CI/gating mode. + 3 Gating failure: new clones, threshold violations, or metrics + quality gate failures. + 5 Internal error: unexpected exception. 
+ +Repository: https://github.com/orenlab/codeclone +Issues: https://github.com/orenlab/codeclone/issues +Docs: https://orenlab.github.io/codeclone/ diff --git a/tests/fixtures/contract_snapshots/mcp_tool_schemas.json b/tests/fixtures/contract_snapshots/mcp_tool_schemas.json new file mode 100644 index 0000000..4cf9e3b --- /dev/null +++ b/tests/fixtures/contract_snapshots/mcp_tool_schemas.json @@ -0,0 +1,1541 @@ +[ + { + "input_schema": { + "properties": { + "analysis_mode": { + "default": "full", + "title": "Analysis Mode", + "type": "string" + }, + "api_surface": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Api Surface" + }, + "baseline_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Baseline Path" + }, + "block_min_loc": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Block Min Loc" + }, + "block_min_stmt": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Block Min Stmt" + }, + "cache_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Cache Path" + }, + "cache_policy": { + "default": "reuse", + "title": "Cache Policy", + "type": "string" + }, + "changed_paths": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Changed Paths" + }, + "cohesion_threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Cohesion Threshold" + }, + "complexity_threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Complexity Threshold" + }, + "coupling_threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Coupling Threshold" + }, + "coverage_min": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Coverage Min" + }, + "coverage_xml": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Coverage Xml" + }, + "git_diff_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Git Diff Ref" + }, + "max_baseline_size_mb": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Baseline Size Mb" + }, + "max_cache_size_mb": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Cache Size Mb" + }, + "metrics_baseline_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Metrics Baseline Path" + }, + "min_loc": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min Loc" + }, + "min_stmt": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min Stmt" + }, + "processes": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Processes" + }, + "respect_pyproject": { + "default": true, + "title": "Respect Pyproject", + "type": "boolean" + }, + "root": { + "title": "Root", + "type": "string" + }, + "segment_min_loc": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + 
"default": null, + "title": "Segment Min Loc" + }, + "segment_min_stmt": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Segment Min Stmt" + } + }, + "required": [ + "root" + ], + "title": "analyze_changed_pathsArguments", + "type": "object" + }, + "name": "analyze_changed_paths" + }, + { + "input_schema": { + "properties": { + "analysis_mode": { + "default": "full", + "title": "Analysis Mode", + "type": "string" + }, + "api_surface": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Api Surface" + }, + "baseline_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Baseline Path" + }, + "block_min_loc": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Block Min Loc" + }, + "block_min_stmt": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Block Min Stmt" + }, + "cache_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Cache Path" + }, + "cache_policy": { + "default": "reuse", + "title": "Cache Policy", + "type": "string" + }, + "changed_paths": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Changed Paths" + }, + "cohesion_threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Cohesion Threshold" + }, + "complexity_threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Complexity Threshold" + }, + "coupling_threshold": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Coupling Threshold" + }, + "coverage_min": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Coverage Min" + }, + "coverage_xml": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Coverage Xml" + }, + "git_diff_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Git Diff Ref" + }, + "max_baseline_size_mb": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Baseline Size Mb" + }, + "max_cache_size_mb": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Cache Size Mb" + }, + "metrics_baseline_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Metrics Baseline Path" + }, + "min_loc": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min Loc" + }, + "min_stmt": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min Stmt" + }, + "processes": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Processes" + }, + "respect_pyproject": { + "default": true, + "title": "Respect Pyproject", + "type": "boolean" + }, + "root": { + "title": "Root", + "type": "string" + }, + "segment_min_loc": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Segment Min Loc" + }, + "segment_min_stmt": { + 
"anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Segment Min Stmt" + } + }, + "required": [ + "root" + ], + "title": "analyze_repositoryArguments", + "type": "object" + }, + "name": "analyze_repository" + }, + { + "input_schema": { + "properties": { + "clone_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Clone Type" + }, + "detail_level": { + "default": "summary", + "title": "Detail Level", + "type": "string" + }, + "max_results": { + "default": 10, + "title": "Max Results", + "type": "integer" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path" + }, + "root": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Root" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + }, + "source_kind": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Source Kind" + } + }, + "title": "check_clonesArguments", + "type": "object" + }, + "name": "check_clones" + }, + { + "input_schema": { + "properties": { + "detail_level": { + "default": "summary", + "title": "Detail Level", + "type": "string" + }, + "max_results": { + "default": 10, + "title": "Max Results", + "type": "integer" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path" + }, + "root": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Root" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "check_cohesionArguments", + "type": "object" + }, + "name": "check_cohesion" + }, + { + "input_schema": { + "properties": { + "detail_level": { + "default": "summary", + "title": "Detail Level", + "type": "string" + }, + "max_results": { + "default": 10, + "title": "Max Results", + "type": "integer" + }, + "min_complexity": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min Complexity" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path" + }, + "root": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Root" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "check_complexityArguments", + "type": "object" + }, + "name": "check_complexity" + }, + { + "input_schema": { + "properties": { + "detail_level": { + "default": "summary", + "title": "Detail Level", + "type": "string" + }, + "max_results": { + "default": 10, + "title": "Max Results", + "type": "integer" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path" + }, + "root": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Root" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "check_couplingArguments", + "type": "object" + }, + "name": "check_coupling" + }, + { + "input_schema": { + "properties": { + "detail_level": { + 
"default": "normal", + "title": "Detail Level", + "type": "string" + }, + "max_results": { + "default": 10, + "title": "Max Results", + "type": "integer" + }, + "min_severity": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Min Severity" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path" + }, + "root": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Root" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "check_dead_codeArguments", + "type": "object" + }, + "name": "check_dead_code" + }, + { + "input_schema": { + "properties": {}, + "title": "clear_session_runsArguments", + "type": "object" + }, + "name": "clear_session_runs" + }, + { + "input_schema": { + "properties": { + "focus": { + "default": "all", + "title": "Focus", + "type": "string" + }, + "run_id_after": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id After" + }, + "run_id_before": { + "title": "Run Id Before", + "type": "string" + } + }, + "required": [ + "run_id_before" + ], + "title": "compare_runsArguments", + "type": "object" + }, + "name": "compare_runs" + }, + { + "input_schema": { + "properties": { + "coverage_min": { + "default": 50, + "title": "Coverage Min", + "type": "integer" + }, + "fail_cohesion": { + "default": -1, + "title": "Fail Cohesion", + "type": "integer" + }, + "fail_complexity": { + "default": -1, + "title": "Fail Complexity", + "type": "integer" + }, + "fail_coupling": { + "default": -1, + "title": "Fail Coupling", + "type": "integer" + }, + "fail_cycles": { + "default": false, + "title": "Fail Cycles", + "type": "boolean" + }, + "fail_dead_code": { + "default": false, + "title": "Fail Dead Code", + "type": "boolean" + }, + "fail_health": { + "default": -1, + "title": "Fail Health", + "type": "integer" + }, + "fail_on_api_break": { + "default": false, + "title": "Fail On Api Break", + "type": "boolean" + }, + "fail_on_docstring_regression": { + "default": false, + "title": "Fail On Docstring Regression", + "type": "boolean" + }, + "fail_on_new": { + "default": false, + "title": "Fail On New", + "type": "boolean" + }, + "fail_on_new_metrics": { + "default": false, + "title": "Fail On New Metrics", + "type": "boolean" + }, + "fail_on_typing_regression": { + "default": false, + "title": "Fail On Typing Regression", + "type": "boolean" + }, + "fail_on_untested_hotspots": { + "default": false, + "title": "Fail On Untested Hotspots", + "type": "boolean" + }, + "fail_threshold": { + "default": -1, + "title": "Fail Threshold", + "type": "integer" + }, + "min_docstring_coverage": { + "default": -1, + "title": "Min Docstring Coverage", + "type": "integer" + }, + "min_typing_coverage": { + "default": -1, + "title": "Min Typing Coverage", + "type": "integer" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "evaluate_gatesArguments", + "type": "object" + }, + "name": "evaluate_gates" + }, + { + "input_schema": { + "properties": { + "changed_paths": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Changed Paths" + }, + "format": { + "default": "markdown", + "title": "Format", + "type": 
"string" + }, + "git_diff_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Git Diff Ref" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "generate_pr_summaryArguments", + "type": "object" + }, + "name": "generate_pr_summary" + }, + { + "input_schema": { + "properties": { + "detail_level": { + "default": "normal", + "title": "Detail Level", + "type": "string" + }, + "finding_id": { + "title": "Finding Id", + "type": "string" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "required": [ + "finding_id" + ], + "title": "get_findingArguments", + "type": "object" + }, + "name": "get_finding" + }, + { + "input_schema": { + "properties": { + "max_hotspots": { + "default": 3, + "title": "Max Hotspots", + "type": "integer" + }, + "max_suggestions": { + "default": 3, + "title": "Max Suggestions", + "type": "integer" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "get_production_triageArguments", + "type": "object" + }, + "name": "get_production_triage" + }, + { + "input_schema": { + "properties": { + "detail_level": { + "default": "normal", + "title": "Detail Level", + "type": "string" + }, + "finding_id": { + "title": "Finding Id", + "type": "string" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "required": [ + "finding_id" + ], + "title": "get_remediationArguments", + "type": "object" + }, + "name": "get_remediation" + }, + { + "input_schema": { + "properties": { + "family": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Family" + }, + "limit": { + "default": 50, + "title": "Limit", + "type": "integer" + }, + "offset": { + "default": 0, + "title": "Offset", + "type": "integer" + }, + "path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Path" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + }, + "section": { + "default": "all", + "title": "Section", + "type": "string" + } + }, + "title": "get_report_sectionArguments", + "type": "object" + }, + "name": "get_report_section" + }, + { + "input_schema": { + "properties": { + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "get_run_summaryArguments", + "type": "object" + }, + "name": "get_run_summary" + }, + { + "input_schema": { + "properties": { + "detail": { + "default": "compact", + "title": "Detail", + "type": "string" + }, + "topic": { + "title": "Topic", + "type": "string" + } + }, + "required": [ + "topic" + ], + "title": "helpArguments", + "type": "object" + }, + "name": "help" + }, + { + "input_schema": { + "properties": { + "category": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Category" + }, + "changed_paths": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Changed Paths" + }, + "detail_level": { + "default": "summary", + "title": "Detail Level", + "type": "string" 
+ }, + "exclude_reviewed": { + "default": false, + "title": "Exclude Reviewed", + "type": "boolean" + }, + "family": { + "default": "all", + "title": "Family", + "type": "string" + }, + "git_diff_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Git Diff Ref" + }, + "limit": { + "default": 50, + "title": "Limit", + "type": "integer" + }, + "max_results": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Results" + }, + "novelty": { + "default": "all", + "title": "Novelty", + "type": "string" + }, + "offset": { + "default": 0, + "title": "Offset", + "type": "integer" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + }, + "severity": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Severity" + }, + "sort_by": { + "default": "default", + "title": "Sort By", + "type": "string" + }, + "source_kind": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Source Kind" + } + }, + "title": "list_findingsArguments", + "type": "object" + }, + "name": "list_findings" + }, + { + "input_schema": { + "properties": { + "changed_paths": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Changed Paths" + }, + "detail_level": { + "default": "summary", + "title": "Detail Level", + "type": "string" + }, + "exclude_reviewed": { + "default": false, + "title": "Exclude Reviewed", + "type": "boolean" + }, + "git_diff_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Git Diff Ref" + }, + "kind": { + "title": "Kind", + "type": "string" + }, + "limit": { + "default": 10, + "title": "Limit", + "type": "integer" + }, + "max_results": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Max Results" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "required": [ + "kind" + ], + "title": "list_hotspotsArguments", + "type": "object" + }, + "name": "list_hotspots" + }, + { + "input_schema": { + "properties": { + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "title": "list_reviewed_findingsArguments", + "type": "object" + }, + "name": "list_reviewed_findings" + }, + { + "input_schema": { + "properties": { + "finding_id": { + "title": "Finding Id", + "type": "string" + }, + "note": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Note" + }, + "run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Run Id" + } + }, + "required": [ + "finding_id" + ], + "title": "mark_finding_reviewedArguments", + "type": "object" + }, + "name": "mark_finding_reviewed" + } +] diff --git a/tests/fixtures/contract_snapshots/public_api_surface.json b/tests/fixtures/contract_snapshots/public_api_surface.json new file mode 100644 index 0000000..4cb866f --- /dev/null +++ b/tests/fixtures/contract_snapshots/public_api_surface.json @@ -0,0 +1,99 @@ +{ + "main_exports": [ + "main" + ], + "main_signature": "() -> 'None'", + "codeclone_exports": [ + "__version__" + ], + 
"mcp_service_public_methods": [ + { + "name": "analyze_changed_paths", + "signature": "(self, request: 'MCPAnalysisRequest') -> 'dict[str, object]'" + }, + { + "name": "analyze_repository", + "signature": "(self, request: 'MCPAnalysisRequest') -> 'dict[str, object]'" + }, + { + "name": "check_clones", + "signature": "(self, *, run_id: 'str | None' = None, root: 'str | None' = None, path: 'str | None' = None, clone_type: 'str | None' = None, source_kind: 'str | None' = None, max_results: 'int' = 10, detail_level: 'DetailLevel' = 'summary') -> 'dict[str, object]'" + }, + { + "name": "check_cohesion", + "signature": "(self, *, run_id: 'str | None' = None, root: 'str | None' = None, path: 'str | None' = None, max_results: 'int' = 10, detail_level: 'DetailLevel' = 'summary') -> 'dict[str, object]'" + }, + { + "name": "check_complexity", + "signature": "(self, *, run_id: 'str | None' = None, root: 'str | None' = None, path: 'str | None' = None, min_complexity: 'int | None' = None, max_results: 'int' = 10, detail_level: 'DetailLevel' = 'summary') -> 'dict[str, object]'" + }, + { + "name": "check_coupling", + "signature": "(self, *, run_id: 'str | None' = None, root: 'str | None' = None, path: 'str | None' = None, max_results: 'int' = 10, detail_level: 'DetailLevel' = 'summary') -> 'dict[str, object]'" + }, + { + "name": "check_dead_code", + "signature": "(self, *, run_id: 'str | None' = None, root: 'str | None' = None, path: 'str | None' = None, min_severity: 'str | None' = None, max_results: 'int' = 10, detail_level: 'DetailLevel' = 'summary') -> 'dict[str, object]'" + }, + { + "name": "clear_session_runs", + "signature": "(self) -> 'dict[str, object]'" + }, + { + "name": "compare_runs", + "signature": "(self, *, run_id_before: 'str', run_id_after: 'str | None' = None, focus: 'ComparisonFocus' = 'all') -> 'dict[str, object]'" + }, + { + "name": "evaluate_gates", + "signature": "(self, request: 'MCPGateRequest') -> 'dict[str, object]'" + }, + { + "name": "generate_pr_summary", + "signature": "(self, *, run_id: 'str | None' = None, changed_paths: 'Sequence[str]' = (), git_diff_ref: 'str | None' = None, format: 'PRSummaryFormat' = 'markdown') -> 'dict[str, object]'" + }, + { + "name": "get_finding", + "signature": "(self, *, finding_id: 'str', run_id: 'str | None' = None, detail_level: 'DetailLevel' = 'normal') -> 'dict[str, object]'" + }, + { + "name": "get_help", + "signature": "(self, *, topic: 'HelpTopic', detail: 'HelpDetail' = 'compact') -> 'dict[str, object]'" + }, + { + "name": "get_production_triage", + "signature": "(self, *, run_id: 'str | None' = None, max_hotspots: 'int' = 3, max_suggestions: 'int' = 3) -> 'dict[str, object]'" + }, + { + "name": "get_remediation", + "signature": "(self, *, finding_id: 'str', run_id: 'str | None' = None, detail_level: 'DetailLevel' = 'normal') -> 'dict[str, object]'" + }, + { + "name": "get_report_section", + "signature": "(self, *, run_id: 'str | None' = None, section: 'ReportSection' = 'all', family: 'MetricsDetailFamily | None' = None, path: 'str | None' = None, offset: 'int' = 0, limit: 'int' = 50) -> 'dict[str, object]'" + }, + { + "name": "get_run_summary", + "signature": "(self, run_id: 'str | None' = None) -> 'dict[str, object]'" + }, + { + "name": "list_findings", + "signature": "(self, *, run_id: 'str | None' = None, family: 'FindingFamilyFilter' = 'all', category: 'str | None' = None, severity: 'str | None' = None, source_kind: 'str | None' = None, novelty: 'FindingNoveltyFilter' = 'all', sort_by: 'FindingSort' = 'default', detail_level: 
'DetailLevel' = 'summary', changed_paths: 'Sequence[str]' = (), git_diff_ref: 'str | None' = None, exclude_reviewed: 'bool' = False, offset: 'int' = 0, limit: 'int' = 50, max_results: 'int | None' = None) -> 'dict[str, object]'" + }, + { + "name": "list_hotspots", + "signature": "(self, *, kind: 'HotlistKind', run_id: 'str | None' = None, detail_level: 'DetailLevel' = 'summary', changed_paths: 'Sequence[str]' = (), git_diff_ref: 'str | None' = None, exclude_reviewed: 'bool' = False, limit: 'int' = 10, max_results: 'int | None' = None) -> 'dict[str, object]'" + }, + { + "name": "list_reviewed_findings", + "signature": "(self, *, run_id: 'str | None' = None) -> 'dict[str, object]'" + }, + { + "name": "mark_finding_reviewed", + "signature": "(self, *, finding_id: 'str', run_id: 'str | None' = None, note: 'str | None' = None) -> 'dict[str, object]'" + }, + { + "name": "read_resource", + "signature": "(self, uri: 'str') -> 'str'" + } + ] +} diff --git a/tests/fixtures/golden_project/golden_expected_ids.json b/tests/fixtures/golden_project/golden_expected_ids.json index a7bbbd1..7748b03 100644 --- a/tests/fixtures/golden_project/golden_expected_ids.json +++ b/tests/fixtures/golden_project/golden_expected_ids.json @@ -1,6 +1,6 @@ { "meta": { - "python_tag": "cp313" + "python_tag": "cp314" }, "function_group_keys": [ "efc8465229b381a3a50502d59d9539c0be3efe86|20-49" diff --git a/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json b/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json index 40ac43e..a311d83 100644 --- a/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json +++ b/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json @@ -30,7 +30,7 @@ ] }, "meta": { - "python_tag": "cp313" + "python_tag": "cp314" }, "metrics": { "cohesion_max": 2, diff --git a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json index ab5236f..be5e481 100644 --- a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json +++ b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json @@ -1,8 +1,8 @@ { "meta": { - "python_tag": "cp313" + "python_tag": "cp314" }, - "report_schema_version": "2.8", + "report_schema_version": "2.10", "project_name": "pyproject_defaults", "scan_root": ".", "baseline_status": "missing", diff --git a/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json b/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json index 6357123..c8478a8 100644 --- a/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json +++ b/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json @@ -17,7 +17,7 @@ "segment_keys": [] }, "meta": { - "python_tag": "cp313" + "python_tag": "cp314" }, "metrics": { "cohesion_max": 0, diff --git a/tests/test_architecture.py b/tests/test_architecture.py index fc71506..48b8a6a 100644 --- a/tests/test_architecture.py +++ b/tests/test_architecture.py @@ -6,24 +6,9 @@ from __future__ import annotations -import ast from pathlib import Path - -def _module_name_from_path(path: Path) -> str: - parts = list(path.with_suffix("").parts) - return ".".join(parts) - - -def _resolve_import(module_name: str, node: ast.ImportFrom) -> str: - if node.level == 0: - return node.module or "" - - parts = module_name.split(".") - prefix_parts = parts[: -node.level] - if node.module: - return ".".join([*prefix_parts, 
node.module]) - return ".".join(prefix_parts) +from tests._import_graph import _iter_local_imports, _module_name_from_path def _iter_codeclone_modules(root: Path) -> list[tuple[str, Path]]: @@ -33,19 +18,6 @@ def _iter_codeclone_modules(root: Path) -> list[tuple[str, Path]]: ] -def _iter_local_imports(module_name: str, source: str) -> list[str]: - tree = ast.parse(source) - imports: list[str] = [] - for node in ast.walk(tree): - if isinstance(node, ast.Import): - imports.extend(alias.name for alias in node.names) - elif isinstance(node, ast.ImportFrom): - imports.append(_resolve_import(module_name, node)) - return [ - import_name for import_name in imports if import_name.startswith("codeclone") - ] - - def _violates(import_name: str, forbidden_prefixes: tuple[str, ...]) -> bool: return any( import_name == prefix or import_name.startswith(prefix + ".") @@ -73,51 +45,100 @@ def test_architecture_layer_violations() -> None: "codeclone.report.", ( "codeclone.ui_messages", - "codeclone.html_report", - "codeclone.cli", + "codeclone.report.html", + "codeclone.surfaces.cli", "codeclone._html_", - "codeclone._html_report", ), ), ( "codeclone.extractor", ( "codeclone.report", - "codeclone.cli", + "codeclone.surfaces.cli", "codeclone.baseline", ), ), ( "codeclone.grouping", ( - "codeclone.cli", + "codeclone.surfaces.cli", "codeclone.baseline", - "codeclone.html_report", + "codeclone.report.html", ), ), ( "codeclone.baseline", ( - "codeclone.cli", + "codeclone.surfaces.cli", "codeclone.ui_messages", - "codeclone.html_report", + "codeclone.report.html", ), ), ( "codeclone.cache", ( - "codeclone.cli", + "codeclone.surfaces.cli", "codeclone.ui_messages", - "codeclone.html_report", + "codeclone.report.html", + ), + ), + ( + "codeclone.core", + ( + "codeclone.surfaces", + "codeclone.config", + ), + ), + ( + "codeclone.analysis", + ( + "codeclone.report", + "codeclone.surfaces", + "codeclone.config", + ), + ), + ( + "codeclone.metrics", + ( + "codeclone.report.document", + "codeclone.report.renderers", + "codeclone.surfaces", + "codeclone.config", + ), + ), + ( + "codeclone.findings", + ( + "codeclone.report", + "codeclone.surfaces", + "codeclone.config", + ), + ), + ( + "codeclone.report.document", + ( + "codeclone.surfaces", + "codeclone.config", + ), + ), + ( + "codeclone.report.renderers", + ( + "codeclone.core", + "codeclone.analysis", + "codeclone.metrics", + "codeclone.findings", + "codeclone.surfaces", + "codeclone.config", ), ), ( "codeclone.domain.", ( - "codeclone.cli", + "codeclone.surfaces.cli", "codeclone.pipeline", "codeclone.report", - "codeclone.html_report", + "codeclone.report.html", "codeclone.ui_messages", "codeclone.baseline", "codeclone.cache", @@ -130,6 +152,10 @@ def test_architecture_layer_violations() -> None: for module_prefix, forbidden_prefixes in forbidden_by_module_prefix: if _matches_module_prefix(module_name, module_prefix): + if module_prefix == "codeclone.report." 
and module_name.startswith( + "codeclone.report.html" + ): + continue violations.extend( [ ( @@ -142,7 +168,7 @@ def test_architecture_layer_violations() -> None: ) if module_name == "codeclone.models": - allowed_prefixes = ("codeclone.contracts", "codeclone.errors") + allowed_prefixes = ("codeclone.contracts",) unexpected_imports = [ import_name for import_name in imports diff --git a/tests/test_baseline.py b/tests/test_baseline.py index 859b8c1..04bee4c 100644 --- a/tests/test_baseline.py +++ b/tests/test_baseline.py @@ -12,9 +12,11 @@ import pytest import codeclone.baseline as baseline_mod +import codeclone.baseline.clone_baseline as clone_baseline_mod +import codeclone.baseline.trust as baseline_trust_mod from codeclone.baseline import Baseline, BaselineStatus, coerce_baseline_status from codeclone.contracts import BASELINE_FINGERPRINT_VERSION, BASELINE_SCHEMA_VERSION -from codeclone.errors import BaselineValidationError +from codeclone.contracts.errors import BaselineValidationError def _python_tag() -> str: @@ -47,7 +49,7 @@ def _trusted_payload( created_at: str | None = "2026-02-08T11:43:16Z", generator_version: str = "1.4.0", ) -> dict[str, object]: - payload = baseline_mod._baseline_payload( + payload = clone_baseline_mod._baseline_payload( functions=set(functions or [_func_id()]), blocks=set(blocks or [_block_id()]), generator="codeclone", @@ -164,7 +166,7 @@ def test_baseline_load_too_large( ) -> None: baseline_path = tmp_path / "baseline.json" _write_payload(baseline_path, _trusted_payload()) - monkeypatch.setattr(baseline_mod, "MAX_BASELINE_SIZE_BYTES", 1) + monkeypatch.setattr(baseline_trust_mod, "MAX_BASELINE_SIZE_BYTES", 1) baseline = Baseline(baseline_path) with pytest.raises(BaselineValidationError, match="too large") as exc: baseline.load() @@ -576,13 +578,13 @@ def test_baseline_payload_fields_contract_invariant(tmp_path: Path) -> None: def test_baseline_hash_canonical_determinism() -> None: - hash_a = baseline_mod._compute_payload_sha256( + hash_a = baseline_trust_mod._compute_payload_sha256( functions={"a" * 40 + "|0-19", "b" * 40 + "|0-19"}, blocks={_block_id()}, fingerprint_version="1", python_tag="cp313", ) - hash_b = baseline_mod._compute_payload_sha256( + hash_b = baseline_trust_mod._compute_payload_sha256( functions={"b" * 40 + "|0-19", "a" * 40 + "|0-19"}, blocks={_block_id()}, fingerprint_version="1", @@ -803,7 +805,7 @@ def _boom_stat(self: Path) -> object: with pytest.raises( BaselineValidationError, match="Cannot stat baseline file" ) as exc: - baseline_mod._safe_stat_size(path) + baseline_trust_mod._safe_stat_size(path) assert exc.value.status == "invalid_type" @@ -818,10 +820,10 @@ def _boom_replace(src: str | Path, dst: str | Path) -> None: temp_holder["path"] = Path(src) raise OSError("replace failed") - monkeypatch.setattr("codeclone._json_io.os.replace", _boom_replace) + monkeypatch.setattr("codeclone.utils.json_io.os.replace", _boom_replace) with pytest.raises(OSError, match="replace failed"): - baseline_mod._atomic_write_json(path, _trusted_payload()) + clone_baseline_mod._atomic_write_json(path, _trusted_payload()) assert temp_holder["path"].exists() is False @@ -841,18 +843,18 @@ def _boom_read(self: Path, *_args: object, **_kwargs: object) -> str: with pytest.raises( BaselineValidationError, match="Cannot read baseline file" ) as exc: - baseline_mod._load_json_object(path) + baseline_trust_mod._load_json_object(path) assert exc.value.status == "invalid_json" def test_baseline_optional_str_paths(tmp_path: Path) -> None: path = tmp_path / 
"baseline.json" - assert baseline_mod._optional_str({}, "generator_version", path=path) is None + assert baseline_trust_mod._optional_str({}, "generator_version", path=path) is None with pytest.raises( BaselineValidationError, match="'generator_version' must be string", ) as exc: - baseline_mod._optional_str( + baseline_trust_mod._optional_str( {"generator_version": 1}, "generator_version", path=path, @@ -868,7 +870,7 @@ def test_baseline_require_utc_iso8601_z_rejects_invalid_calendar_date( BaselineValidationError, match="'created_at' must be UTC ISO-8601 with Z", ) as exc: - baseline_mod._require_utc_iso8601_z( + baseline_trust_mod._require_utc_iso8601_z( {"created_at": "2026-02-31T00:00:00Z"}, "created_at", path=path, @@ -894,7 +896,7 @@ def test_baseline_load_legacy_codeclone_version_alias(tmp_path: Path) -> None: def test_parse_generator_meta_string_legacy_alias(tmp_path: Path) -> None: path = tmp_path / "baseline.json" - name, version = baseline_mod._parse_generator_meta( + name, version = baseline_trust_mod._parse_generator_meta( { "generator": "codeclone", "codeclone_version": "1.4.0", @@ -907,7 +909,7 @@ def test_parse_generator_meta_string_legacy_alias(tmp_path: Path) -> None: def test_parse_generator_meta_string_prefers_generator_version(tmp_path: Path) -> None: path = tmp_path / "baseline.json" - name, version = baseline_mod._parse_generator_meta( + name, version = baseline_trust_mod._parse_generator_meta( { "generator": "codeclone", "generator_version": "1.4.2", @@ -921,7 +923,7 @@ def test_parse_generator_meta_string_prefers_generator_version(tmp_path: Path) - def test_parse_generator_meta_object_top_level_fallback(tmp_path: Path) -> None: path = tmp_path / "baseline.json" - name, version = baseline_mod._parse_generator_meta( + name, version = baseline_trust_mod._parse_generator_meta( { "generator": {"name": "codeclone"}, "generator_version": "1.4.1", @@ -937,7 +939,7 @@ def test_parse_generator_meta_rejects_extra_generator_keys(tmp_path: Path) -> No with pytest.raises( BaselineValidationError, match="unexpected generator keys" ) as exc: - baseline_mod._parse_generator_meta( + baseline_trust_mod._parse_generator_meta( {"generator": {"name": "codeclone", "version": "1.4.0", "extra": "x"}}, path=path, ) @@ -946,7 +948,11 @@ def test_parse_generator_meta_rejects_extra_generator_keys(tmp_path: Path) -> No def test_baseline_parse_semver_three_parts(tmp_path: Path) -> None: path = tmp_path / "baseline.json" - assert baseline_mod._parse_semver("1.2.3", key="schema_version", path=path) == ( + assert baseline_trust_mod._parse_semver( + "1.2.3", + key="schema_version", + path=path, + ) == ( 1, 2, 3, @@ -959,10 +965,10 @@ def test_baseline_require_sorted_unique_ids_non_string(tmp_path: Path) -> None: BaselineValidationError, match="'functions' must be list\\[str\\]", ) as exc: - baseline_mod._require_sorted_unique_ids( + baseline_trust_mod._require_sorted_unique_ids( {"functions": [1]}, "functions", - pattern=baseline_mod._FUNCTION_ID_RE, + pattern=clone_baseline_mod._FUNCTION_ID_RE, path=path, ) assert exc.value.status == "invalid_type" @@ -1050,7 +1056,12 @@ def test_baseline_save_preserves_embedded_metrics_without_hash(tmp_path: Path) - def test_preserve_embedded_metrics_variants(tmp_path: Path) -> None: path = tmp_path / "baseline.json" _write_payload(path, {"meta": {}, "clones": {"functions": [], "blocks": []}}) - assert baseline_mod._preserve_embedded_metrics(path) == (None, None, None, None) + assert clone_baseline_mod._preserve_embedded_metrics(path) == ( + None, + None, + None, 
+ None, + ) _write_payload( path, @@ -1060,7 +1071,7 @@ def test_preserve_embedded_metrics_variants(tmp_path: Path) -> None: "metrics": {"x": 1}, }, ) - assert baseline_mod._preserve_embedded_metrics(path) == ( + assert clone_baseline_mod._preserve_embedded_metrics(path) == ( {"x": 1}, None, None, @@ -1075,7 +1086,7 @@ def test_preserve_embedded_metrics_variants(tmp_path: Path) -> None: "metrics": {"x": 2}, }, ) - assert baseline_mod._preserve_embedded_metrics(path) == ( + assert clone_baseline_mod._preserve_embedded_metrics(path) == ( {"x": 2}, None, None, @@ -1090,7 +1101,7 @@ def test_preserve_embedded_metrics_variants(tmp_path: Path) -> None: "metrics": {"x": 3}, }, ) - assert baseline_mod._preserve_embedded_metrics(path) == ( + assert clone_baseline_mod._preserve_embedded_metrics(path) == ( {"x": 3}, "a" * 64, None, @@ -1109,7 +1120,7 @@ def test_preserve_embedded_metrics_variants(tmp_path: Path) -> None: "api_surface": {"modules": [{"module": "pkg.mod"}]}, }, ) - assert baseline_mod._preserve_embedded_metrics(path) == ( + assert clone_baseline_mod._preserve_embedded_metrics(path) == ( {"x": 3}, "a" * 64, {"modules": [{"module": "pkg.mod"}]}, @@ -1133,9 +1144,9 @@ def _payload(**_kwargs: object) -> dict[str, object]: "clones": {"functions": [], "blocks": []}, } - monkeypatch.setattr(baseline_mod, "_baseline_payload", _payload) + monkeypatch.setattr(clone_baseline_mod, "_baseline_payload", _payload) monkeypatch.setattr( - baseline_mod, + clone_baseline_mod, "_preserve_embedded_metrics", lambda _path: ({"health_score": 1}, "a" * 64, None, None), ) @@ -1167,7 +1178,7 @@ def _payload(**_kwargs: object) -> dict[str, object]: "clones": {"functions": [], "blocks": []}, } - monkeypatch.setattr(baseline_mod, "_baseline_payload", _payload) + monkeypatch.setattr(clone_baseline_mod, "_baseline_payload", _payload) baseline.save() _assert_baseline_runtime_meta( @@ -1207,7 +1218,7 @@ def _payload(**_kwargs: object) -> dict[str, object]: "clones": {"functions": [], "blocks": []}, } - monkeypatch.setattr(baseline_mod, "_baseline_payload", _payload) + monkeypatch.setattr(clone_baseline_mod, "_baseline_payload", _payload) baseline.save() _assert_baseline_runtime_meta( @@ -1263,7 +1274,7 @@ def _payload(**_kwargs: object) -> dict[str, object]: "clones": {"functions": [], "blocks": []}, } - monkeypatch.setattr(baseline_mod, "_baseline_payload", _payload) + monkeypatch.setattr(clone_baseline_mod, "_baseline_payload", _payload) baseline.save() assert baseline.generator == "keep-generator" diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index c921cb9..05e9010 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -6,12 +6,18 @@ from __future__ import annotations +from pathlib import Path +from subprocess import CompletedProcess + import pytest from benchmarks.run_benchmark import ( + BENCHMARK_CLI_MODULE, BENCHMARK_NEUTRAL_ARGS, RunMeasurement, Scenario, + _run_cli_once, + _timing_regressions, _validate_inventory_sample, ) @@ -33,6 +39,27 @@ def _measurement( ) +def _benchmark_payload( + *, + cold_full: float, + warm_full: float, + warm_clones_only: float, +) -> dict[str, object]: + def _scenario(name: str, median: float) -> dict[str, object]: + return { + "name": name, + "stats_seconds": {"median": median}, + } + + return { + "scenarios": [ + _scenario("cold_full", cold_full), + _scenario("warm_full", warm_full), + _scenario("warm_clones_only", warm_clones_only), + ] + } + + def test_benchmark_inventory_validation_accepts_valid_cold_and_warm_samples() -> None: 
_validate_inventory_sample( scenario=Scenario(name="cold_full", mode="cold", extra_args=()), @@ -49,12 +76,68 @@ def test_benchmark_neutral_args_disable_repo_quality_gates() -> None: assert "--no-fail-on-new-metrics" in BENCHMARK_NEUTRAL_ARGS assert "--no-fail-cycles" in BENCHMARK_NEUTRAL_ARGS assert "--no-fail-dead-code" in BENCHMARK_NEUTRAL_ARGS + assert "--no-api-surface" in BENCHMARK_NEUTRAL_ARGS + assert "--no-update-metrics-baseline" in BENCHMARK_NEUTRAL_ARGS assert "--fail-health" in BENCHMARK_NEUTRAL_ARGS assert "--min-typing-coverage" in BENCHMARK_NEUTRAL_ARGS assert "--min-docstring-coverage" in BENCHMARK_NEUTRAL_ARGS assert "--skip-metrics" not in BENCHMARK_NEUTRAL_ARGS +def test_benchmark_runner_invokes_canonical_main_entrypoint( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + captured: dict[str, object] = {} + + def fake_run( + cmd: list[str], + *, + check: bool, + capture_output: bool, + text: bool, + env: dict[str, str], + ) -> CompletedProcess[str]: + captured["cmd"] = cmd + captured["check"] = check + captured["capture_output"] = capture_output + captured["text"] = text + captured["env"] = env + return CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr("benchmarks.run_benchmark.subprocess.run", fake_run) + monkeypatch.setattr( + "benchmarks.run_benchmark._read_report", + lambda _report_path: ( + "digest", + {"found": 10, "analyzed": 10, "cached": 0, "skipped": 0}, + ), + ) + + _run_cli_once( + target=tmp_path, + python_executable="python3", + cache_path=tmp_path / "cache.json", + report_path=tmp_path / "report.json", + extra_args=("--skip-metrics",), + ) + + assert captured["cmd"] == [ + "python3", + "-m", + BENCHMARK_CLI_MODULE, + str(tmp_path), + *BENCHMARK_NEUTRAL_ARGS, + "--json", + str(tmp_path / "report.json"), + "--cache-path", + str(tmp_path / "cache.json"), + "--no-progress", + "--quiet", + "--skip-metrics", + ] + + @pytest.mark.parametrize( ("scenario", "measurement", "message"), ( @@ -90,3 +173,53 @@ def test_benchmark_inventory_validation_rejects_invalid_samples( scenario=scenario, measurement=measurement, ) + + +def test_benchmark_timing_regressions_accept_within_tolerance() -> None: + baseline = _benchmark_payload( + cold_full=1.0, + warm_full=0.30, + warm_clones_only=0.25, + ) + current = _benchmark_payload( + cold_full=1.04, + warm_full=0.31, + warm_clones_only=0.24, + ) + + assert ( + _timing_regressions( + current_payload=current, + baseline_payload=baseline, + max_regression_pct=5.0, + ) + == [] + ) + + +def test_benchmark_timing_regressions_report_excess_slowdown() -> None: + baseline = _benchmark_payload( + cold_full=1.0, + warm_full=0.30, + warm_clones_only=0.25, + ) + current = _benchmark_payload( + cold_full=1.07, + warm_full=0.32, + warm_clones_only=0.27, + ) + + regressions = _timing_regressions( + current_payload=current, + baseline_payload=baseline, + max_regression_pct=5.0, + ) + + assert regressions == [ + "cold_full: median 1.0700s exceeds baseline 1.0000s by 7.00% (allowed 5.00%)", + ( + "warm_clones_only: median 0.2700s exceeds baseline 0.2500s " + "by 8.00% (allowed 5.00%)" + ), + "warm_full: median 0.3200s exceeds baseline 0.3000s by 6.67% (allowed 5.00%)", + ] diff --git a/tests/test_blocks.py b/tests/test_blocks.py index a875635..64379d8 100644 --- a/tests/test_blocks.py +++ b/tests/test_blocks.py @@ -6,8 +6,8 @@ import ast +from codeclone.analysis.normalizer import NormalizationConfig from codeclone.blocks import extract_blocks -from codeclone.normalize import NormalizationConfig def 
test_extracts_non_overlapping_blocks() -> None: diff --git a/tests/test_cache.py b/tests/test_cache.py index b449f48..23a048a 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -14,14 +14,75 @@ import pytest -import codeclone.cache as cache_mod -from codeclone.blocks import BlockUnit, SegmentUnit -from codeclone.cache import Cache, CacheStatus -from codeclone.cache_io import sign_cache_payload -from codeclone.cache_paths import runtime_filepath_from_wire, wire_filepath_from_runtime -from codeclone.errors import CacheError -from codeclone.extractor import Unit -from codeclone.models import ApiParamSpec, FileMetrics, ModuleApiSurface, PublicSymbol +import codeclone.cache.store as cache_store +from codeclone.cache._canonicalize import ( + _as_module_api_surface_dict, + _as_module_docstring_coverage_dict, + _as_module_typing_coverage_dict, + _canonicalize_cache_entry, + _has_cache_entry_container_shape, +) +from codeclone.cache._validators import ( + _is_api_param_spec_dict, + _is_class_metrics_dict, + _is_dead_candidate_dict, + _is_module_api_surface_dict, + _is_module_dep_dict, + _is_public_symbol_dict, + _is_security_surface_dict, +) +from codeclone.cache._wire_decode import ( + _decode_optional_wire_api_surface, + _decode_optional_wire_module_ints, + _decode_optional_wire_source_stats, + _decode_wire_api_param_spec, + _decode_wire_api_surface_symbol, + _decode_wire_block, + _decode_wire_class_metric, + _decode_wire_dead_candidate, + _decode_wire_file_entry, + _decode_wire_file_sections, + _decode_wire_module_dep, + _decode_wire_name_sections, + _decode_wire_security_surface, + _decode_wire_segment, + _decode_wire_unit, +) +from codeclone.cache._wire_encode import _encode_wire_file_entry +from codeclone.cache._wire_helpers import ( + _decode_optional_wire_coupled_classes, + _decode_wire_int_fields, + _decode_wire_qualname_span_size, +) +from codeclone.cache.entries import ( + CacheEntry, + _as_security_surface_category, + _as_security_surface_classification_mode, + _as_security_surface_evidence_kind, + _as_security_surface_location_scope, + _block_dict_from_model, + _segment_dict_from_model, + _unit_dict_from_model, +) +from codeclone.cache.integrity import as_str_dict as _as_str_dict +from codeclone.cache.integrity import sign_cache_payload +from codeclone.cache.projection import ( + runtime_filepath_from_wire, + wire_filepath_from_runtime, +) +from codeclone.cache.store import Cache, file_stat_signature +from codeclone.cache.versioning import CacheStatus, _as_analysis_profile, _resolve_root +from codeclone.contracts.errors import CacheError +from codeclone.models import ( + ApiParamSpec, + BlockUnit, + FileMetrics, + ModuleApiSurface, + PublicSymbol, + SecuritySurface, + SegmentUnit, + Unit, +) def _make_unit(filepath: str) -> Unit: @@ -69,6 +130,30 @@ def _analysis_payload(cache: Cache, *, files: object) -> dict[str, object]: } +def _roundtrip_cache_entry_with_metrics( + tmp_path: Path, + *, + file_metrics: FileMetrics, +) -> CacheEntry: + cache_path = tmp_path / "cache.json" + cache = Cache(cache_path) + cache.put_file_entry( + "x.py", + {"mtime_ns": 1, "size": 10}, + [], + [], + [], + file_metrics=file_metrics, + ) + cache.save() + + loaded = Cache(cache_path) + loaded.load() + entry = loaded.get_file_entry("x.py") + assert entry is not None + return entry + + def test_cache_roundtrip(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) @@ -90,6 +175,47 @@ def test_cache_roundtrip(tmp_path: Path) -> None: assert loaded.cache_schema_version 
== Cache._CACHE_VERSION +def test_cache_prune_file_entries_removes_stale_paths(tmp_path: Path) -> None: + root = tmp_path.resolve() + cache_path = root / "cache.json" + live = root / "live.py" + stale = root / "stale.py" + live.write_text("def live():\n return 1\n", "utf-8") + + cache = Cache(cache_path, root=root) + cache.put_file_entry( + str(live), + file_stat_signature(str(live)), + [], + [], + [], + ) + cache.put_file_entry( + str(stale), + {"mtime_ns": 1, "size": 1}, + [], + [], + [], + ) + cache.save() + + loaded = Cache(cache_path, root=root) + loaded.load() + + removed = loaded.prune_file_entries((str(live),)) + + assert removed == 1 + assert str(live) in loaded.data["files"] + assert str(stale) not in loaded.data["files"] + + loaded.save() + + reloaded = Cache(cache_path, root=root) + reloaded.load() + assert reloaded.get_file_entry(str(live)) is not None + assert reloaded.get_file_entry(str(stale)) is None + + def test_cache_roundtrip_preserves_empty_structural_findings(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) @@ -114,14 +240,8 @@ def test_cache_roundtrip_preserves_empty_structural_findings(tmp_path: Path) -> def test_cache_roundtrip_preserves_api_surface_parameter_order( tmp_path: Path, ) -> None: - cache_path = tmp_path / "cache.json" - cache = Cache(cache_path) - cache.put_file_entry( - "x.py", - {"mtime_ns": 1, "size": 10}, - [], - [], - [], + entry = _roundtrip_cache_entry_with_metrics( + tmp_path, file_metrics=FileMetrics( class_metrics=(), module_deps=(), @@ -156,16 +276,163 @@ def test_cache_roundtrip_preserves_api_surface_parameter_order( ), ), ) - cache.save() - - loaded = Cache(cache_path) - loaded.load() - entry = loaded.get_file_entry("x.py") - assert entry is not None params = entry["api_surface"]["symbols"][0]["params"] assert [param["name"] for param in params] == ["beta", "alpha"] +def test_cache_roundtrip_preserves_security_surfaces(tmp_path: Path) -> None: + entry = _roundtrip_cache_entry_with_metrics( + tmp_path, + file_metrics=FileMetrics( + class_metrics=(), + module_deps=(), + dead_candidates=(), + referenced_names=frozenset(), + import_names=frozenset(), + class_names=frozenset(), + security_surfaces=( + SecuritySurface( + category="process_boundary", + capability="subprocess_run", + module="pkg.runner", + filepath="x.py", + qualname="pkg.runner:run_command", + start_line=10, + end_line=10, + location_scope="callable", + classification_mode="exact_call", + evidence_kind="call", + evidence_symbol="subprocess.run", + ), + ), + ), + ) + assert entry["security_surfaces"] == [ + { + "category": "process_boundary", + "capability": "subprocess_run", + "module": "pkg.runner", + "filepath": "x.py", + "qualname": "pkg.runner:run_command", + "start_line": 10, + "end_line": 10, + "location_scope": "callable", + "classification_mode": "exact_call", + "evidence_kind": "call", + "evidence_symbol": "subprocess.run", + } + ] + + +def test_security_surface_cache_helpers_reject_invalid_values() -> None: + assert _as_security_surface_category("process_boundary") == "process_boundary" + assert _as_security_surface_category("broken") is None + assert _as_security_surface_location_scope("callable") == "callable" + assert _as_security_surface_location_scope("broken") is None + assert _as_security_surface_classification_mode("exact_call") == "exact_call" + assert _as_security_surface_classification_mode("broken") is None + assert _as_security_surface_evidence_kind("call") == "call" + assert 
_as_security_surface_evidence_kind("broken") is None + assert ( + _is_module_api_surface_dict( + { + "module": "pkg.mod", + "filepath": "pkg/mod.py", + "all_declared": ["run"], + "symbols": "bad", + } + ) + is False + ) + assert _is_security_surface_dict(object()) is False + + +def test_decode_wire_security_surface_covers_valid_and_invalid_rows() -> None: + assert _decode_wire_security_surface(object(), "pkg/mod.py") is None + assert ( + _decode_wire_security_surface( + [ + "broken", + "subprocess_run", + "pkg.mod", + "pkg.mod:run", + 10, + 12, + "callable", + "exact_call", + "call", + "subprocess.run", + ], + "pkg/mod.py", + ) + is None + ) + assert ( + _decode_wire_security_surface( + [ + "process_boundary", + "subprocess_run", + "pkg.mod", + "pkg.mod:run", + "10", + 12, + "callable", + "exact_call", + "call", + "subprocess.run", + ], + "pkg/mod.py", + ) + is None + ) + assert ( + _decode_wire_security_surface( + [ + "process_boundary", + "subprocess_run", + "pkg.mod", + "pkg.mod:run", + 10, + 12, + "broken", + "exact_call", + "call", + "subprocess.run", + ], + "pkg/mod.py", + ) + is None + ) + decoded = _decode_wire_security_surface( + [ + "process_boundary", + "subprocess_run", + "pkg.mod", + "pkg.mod:run", + 10, + 12, + "callable", + "exact_call", + "call", + "subprocess.run", + ], + "pkg/mod.py", + ) + assert decoded == { + "category": "process_boundary", + "capability": "subprocess_run", + "module": "pkg.mod", + "filepath": "pkg/mod.py", + "qualname": "pkg.mod:run", + "start_line": 10, + "end_line": 12, + "location_scope": "callable", + "classification_mode": "exact_call", + "evidence_kind": "call", + "evidence_symbol": "subprocess.run", + } + + def test_cache_load_normalizes_stale_structural_findings(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) @@ -223,7 +490,7 @@ def test_cache_load_normalizes_stale_structural_findings(tmp_path: Path) -> None ) payload = _analysis_payload( cache, - files={"x.py": cache_mod._encode_wire_file_entry(entry)}, + files={"x.py": _encode_wire_file_entry(entry)}, ) signature = sign_cache_payload(payload) cache_path.write_text( @@ -284,7 +551,7 @@ def test_store_canonical_file_entry_marks_dirty_only_when_entry_changes( cache = Cache(tmp_path / "cache.json") canonical_entry = cast( Any, - cache_mod._canonicalize_cache_entry( + _canonicalize_cache_entry( { "stat": {"mtime_ns": 1, "size": 1}, "units": [], @@ -319,11 +586,11 @@ def test_store_canonical_file_entry_marks_dirty_only_when_entry_changes( def test_cache_helper_type_guards_and_wire_api_decoders_cover_invalid_inputs() -> None: - assert cache_mod._as_module_typing_coverage_dict({"module": "pkg"}) is None - assert cache_mod._as_module_docstring_coverage_dict({"module": "pkg"}) is None - assert cache_mod._as_module_api_surface_dict({"module": "pkg"}) is None + assert _as_module_typing_coverage_dict({"module": "pkg"}) is None + assert _as_module_docstring_coverage_dict({"module": "pkg"}) is None + assert _as_module_api_surface_dict({"module": "pkg"}) is None assert ( - cache_mod._has_cache_entry_container_shape( + _has_cache_entry_container_shape( { "stat": {"mtime_ns": 1, "size": 1}, "units": [], @@ -335,7 +602,7 @@ def test_cache_helper_type_guards_and_wire_api_decoders_cover_invalid_inputs() - is False ) assert ( - cache_mod._has_cache_entry_container_shape( + _has_cache_entry_container_shape( { "stat": {"mtime_ns": 1, "size": 1}, "units": [], @@ -347,7 +614,7 @@ def test_cache_helper_type_guards_and_wire_api_decoders_cover_invalid_inputs() - is False ) assert 
( - cache_mod._has_cache_entry_container_shape( + _has_cache_entry_container_shape( { "stat": {"mtime_ns": 1, "size": 1}, "units": [], @@ -359,14 +626,14 @@ def test_cache_helper_type_guards_and_wire_api_decoders_cover_invalid_inputs() - is False ) assert ( - cache_mod._decode_optional_wire_api_surface( + _decode_optional_wire_api_surface( obj={"as": ["pkg.mod", ["run"], [None]]}, filepath="pkg/mod.py", ) is None ) assert ( - cache_mod._decode_optional_wire_module_ints( + _decode_optional_wire_module_ints( obj={"tc": ["pkg.mod", "bad"]}, key="tc", expected_len=2, @@ -374,18 +641,18 @@ def test_cache_helper_type_guards_and_wire_api_decoders_cover_invalid_inputs() - ) is None ) - assert cache_mod._decode_wire_api_surface_symbol(["pkg.mod:run"]) is None + assert _decode_wire_api_surface_symbol(["pkg.mod:run"]) is None assert ( - cache_mod._decode_wire_api_surface_symbol( + _decode_wire_api_surface_symbol( ["pkg.mod:run", "function", 1, 2, "name", "", [None]] ) is None ) - assert cache_mod._decode_wire_api_param_spec(["value"]) is None - assert cache_mod._is_api_param_spec_dict([]) is False - assert cache_mod._is_public_symbol_dict([]) is False + assert _decode_wire_api_param_spec(["value"]) is None + assert _is_api_param_spec_dict([]) is False + assert _is_public_symbol_dict([]) is False assert ( - cache_mod._is_public_symbol_dict( + _is_public_symbol_dict( { "qualname": "pkg.mod:run", "kind": "function", @@ -464,14 +731,12 @@ def test_cache_signature_validation_ignores_json_whitespace(tmp_path: Path) -> N def test_decode_wire_file_and_name_section_helpers_cover_valid_and_invalid() -> None: - encoded = cache_mod._encode_wire_file_entry( + encoded = _encode_wire_file_entry( { "stat": {"mtime_ns": 1, "size": 10}, - "units": [cache_mod._unit_dict_from_model(_make_unit("x.py"), "x.py")], - "blocks": [cache_mod._block_dict_from_model(_make_block("x.py"), "x.py")], - "segments": [ - cache_mod._segment_dict_from_model(_make_segment("x.py"), "x.py") - ], + "units": [_unit_dict_from_model(_make_unit("x.py"), "x.py")], + "blocks": [_block_dict_from_model(_make_block("x.py"), "x.py")], + "segments": [_segment_dict_from_model(_make_segment("x.py"), "x.py")], "class_metrics": [], "module_deps": [], "dead_candidates": [], @@ -483,7 +748,7 @@ def test_decode_wire_file_and_name_section_helpers_cover_valid_and_invalid() -> ) assert isinstance(encoded, dict) - file_sections = cache_mod._decode_wire_file_sections(obj=encoded, filepath="x.py") + file_sections = _decode_wire_file_sections(obj=encoded, filepath="x.py") assert file_sections is not None units, blocks, segments, class_metrics, module_deps, dead_candidates = file_sections assert units[0]["qualname"] == "mod:func" @@ -493,7 +758,7 @@ def test_decode_wire_file_and_name_section_helpers_cover_valid_and_invalid() -> assert module_deps == [] assert dead_candidates == [] - name_sections = cache_mod._decode_wire_name_sections(obj=encoded) + name_sections = _decode_wire_name_sections(obj=encoded) assert name_sections == ( ["used"], ["pkg.mod:used"], @@ -504,7 +769,7 @@ def test_decode_wire_file_and_name_section_helpers_cover_valid_and_invalid() -> invalid_sections = dict(encoded) invalid_sections["u"] = "bad" assert ( - cache_mod._decode_wire_file_sections( + _decode_wire_file_sections( obj=invalid_sections, filepath="x.py", ) @@ -513,7 +778,7 @@ def test_decode_wire_file_and_name_section_helpers_cover_valid_and_invalid() -> invalid_names = dict(encoded) invalid_names["rn"] = 1 - assert cache_mod._decode_wire_name_sections(obj=invalid_names) is None + assert 
_decode_wire_name_sections(obj=invalid_names) is None def test_cache_signature_mismatch_warns(tmp_path: Path) -> None: @@ -577,7 +842,7 @@ def test_cache_v_field_version_mismatch_warns(tmp_path: Path, version: str) -> N def test_cache_too_large_warns(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: cache_path = tmp_path / "cache.json" cache_path.write_text(json.dumps({"version": Cache._CACHE_VERSION, "files": {}})) - monkeypatch.setattr(cache_mod, "MAX_CACHE_SIZE_BYTES", 1) + monkeypatch.setattr(cache_store, "MAX_CACHE_SIZE_BYTES", 1) cache = Cache(cache_path) cache.load() assert cache.load_warning is not None @@ -814,7 +1079,7 @@ def test_cache_entry_not_dict(tmp_path: Path) -> None: def test_file_stat_signature(tmp_path: Path) -> None: file_path = tmp_path / "x.py" file_path.write_text("print('x')\n", "utf-8") - stat = cache_mod.file_stat_signature(str(file_path)) + stat = file_stat_signature(str(file_path)) assert stat["size"] == file_path.stat().st_size assert isinstance(stat["mtime_ns"], int) @@ -1258,7 +1523,7 @@ def _resolve_with_error(self: Path, *, strict: bool = False) -> Path: def test_as_str_dict_rejects_non_string_keys() -> None: - assert cache_mod._as_str_dict({1: "x"}) is None + assert _as_str_dict({1: "x"}) is None @pytest.mark.parametrize( @@ -1277,31 +1542,29 @@ def test_as_str_dict_rejects_non_string_keys() -> None: ], ) def test_decode_wire_file_entry_invalid_variants(entry: object, filepath: str) -> None: - assert cache_mod._decode_wire_file_entry(entry, filepath) is None + assert _decode_wire_file_entry(entry, filepath) is None def test_decode_wire_item_type_failures() -> None: - assert cache_mod._decode_wire_unit(["q", 1, 2, 3, 4, "fp"], "x.py") is None - assert cache_mod._decode_wire_unit(["q", 1, 2, 3, 4, "fp", "0-19"], "x.py") is None - assert ( - cache_mod._decode_wire_unit(["q", "1", 2, 3, 4, "fp", "0-19"], "x.py") is None - ) - assert cache_mod._decode_wire_block(["q", 1, 2, 3], "x.py") is None - assert cache_mod._decode_wire_block(["q", 1, 2, "4", "hash"], "x.py") is None - assert cache_mod._decode_wire_segment(["q", 1, 2, 3, "h"], "x.py") is None - assert cache_mod._decode_wire_segment(["q", 1, 2, "3", "h", "sig"], "x.py") is None + assert _decode_wire_unit(["q", 1, 2, 3, 4, "fp"], "x.py") is None + assert _decode_wire_unit(["q", 1, 2, 3, 4, "fp", "0-19"], "x.py") is None + assert _decode_wire_unit(["q", "1", 2, 3, 4, "fp", "0-19"], "x.py") is None + assert _decode_wire_block(["q", 1, 2, 3], "x.py") is None + assert _decode_wire_block(["q", 1, 2, "4", "hash"], "x.py") is None + assert _decode_wire_segment(["q", 1, 2, 3, "h"], "x.py") is None + assert _decode_wire_segment(["q", 1, 2, "3", "h", "sig"], "x.py") is None def test_decode_wire_item_rejects_invalid_risk_fields() -> None: assert ( - cache_mod._decode_wire_unit( + _decode_wire_unit( ["q", 1, 2, 3, 4, "fp", "0-19", 2, 1, "critical", "raw"], "x.py", ) is None ) assert ( - cache_mod._decode_wire_class_metric( + _decode_wire_class_metric( ["pkg.mod:Service", 1, 10, 3, 2, 4, 1, 7, 8], "x.py", ) @@ -1320,7 +1583,7 @@ def _resolve_with_error(self: Path, *, strict: bool = False) -> Path: return original_resolve(self, strict=strict) monkeypatch.setattr(Path, "resolve", _resolve_with_error) - assert cache_mod._resolve_root(tmp_path) is None + assert _resolve_root(tmp_path) is None def test_cache_entry_rejects_invalid_metrics_sections(tmp_path: Path) -> None: @@ -1344,46 +1607,36 @@ def test_cache_entry_rejects_invalid_metrics_sections(tmp_path: Path) -> None: def 
test_decode_wire_file_entry_rejects_metrics_related_invalid_sections() -> None: + assert _decode_wire_file_entry({"st": [1, 2], "cm": "bad"}, "x.py") is None assert ( - cache_mod._decode_wire_file_entry({"st": [1, 2], "cm": "bad"}, "x.py") is None - ) - assert ( - cache_mod._decode_wire_file_entry( + _decode_wire_file_entry( {"st": [1, 2], "cm": [["Q", 1, 2, 3, 4, 5, 6, "low"]]}, "x.py", ) is None ) + assert _decode_wire_file_entry({"st": [1, 2], "md": "bad"}, "x.py") is None assert ( - cache_mod._decode_wire_file_entry({"st": [1, 2], "md": "bad"}, "x.py") is None - ) - assert ( - cache_mod._decode_wire_file_entry( + _decode_wire_file_entry( {"st": [1, 2], "md": [["source", "target", "import"]]}, "x.py", ) is None ) - assert ( - cache_mod._decode_wire_file_entry({"st": [1, 2], "dc": "bad"}, "x.py") is None - ) - decoded = cache_mod._decode_wire_file_entry( + assert _decode_wire_file_entry({"st": [1, 2], "dc": "bad"}, "x.py") is None + decoded = _decode_wire_file_entry( {"st": [1, 2], "dc": [["q", "n", 1, 2, "function"]]}, "x.py", ) assert decoded is not None assert decoded["dead_candidates"][0]["filepath"] == "x.py" - assert cache_mod._decode_wire_file_entry({"st": [1, 2], "rn": [1]}, "x.py") is None - assert cache_mod._decode_wire_file_entry({"st": [1, 2], "in": [1]}, "x.py") is None - assert cache_mod._decode_wire_file_entry({"st": [1, 2], "cn": [1]}, "x.py") is None - assert ( - cache_mod._decode_wire_file_entry({"st": [1, 2], "cc": "bad"}, "x.py") is None - ) - assert ( - cache_mod._decode_wire_file_entry({"st": [1, 2], "cc": [["Q"]]}, "x.py") is None - ) + assert _decode_wire_file_entry({"st": [1, 2], "rn": [1]}, "x.py") is None + assert _decode_wire_file_entry({"st": [1, 2], "in": [1]}, "x.py") is None + assert _decode_wire_file_entry({"st": [1, 2], "cn": [1]}, "x.py") is None + assert _decode_wire_file_entry({"st": [1, 2], "cc": "bad"}, "x.py") is None + assert _decode_wire_file_entry({"st": [1, 2], "cc": [["Q"]]}, "x.py") is None assert ( - cache_mod._decode_wire_file_entry( + _decode_wire_file_entry( {"st": [1, 2], "cc": [["Q", ["A", 1]]]}, "x.py", ) @@ -1392,7 +1645,7 @@ def test_decode_wire_file_entry_rejects_metrics_related_invalid_sections() -> No def test_decode_wire_file_entry_accepts_metrics_sections() -> None: - decoded = cache_mod._decode_wire_file_entry( + decoded = _decode_wire_file_entry( { "st": [1, 2], "cm": [["pkg.mod:Service", 1, 10, 3, 2, 4, 1, "low", "medium"]], @@ -1415,7 +1668,7 @@ def test_decode_wire_file_entry_accepts_metrics_sections() -> None: def test_decode_wire_file_entry_optional_source_stats() -> None: - decoded = cache_mod._decode_wire_file_entry( + decoded = _decode_wire_file_entry( {"st": [1, 2], "ss": [10, 3, 1, 1]}, "x.py", ) @@ -1427,20 +1680,16 @@ def test_decode_wire_file_entry_optional_source_stats() -> None: "classes": 1, } - assert cache_mod._decode_optional_wire_source_stats(obj={"ss": "bad"}) is None - assert cache_mod._decode_optional_wire_source_stats(obj={"ss": [1, 2, 3]}) is None - assert ( - cache_mod._decode_optional_wire_source_stats(obj={"ss": [1, 2, -1, 0]}) is None - ) + assert _decode_optional_wire_source_stats(obj={"ss": "bad"}) is None + assert _decode_optional_wire_source_stats(obj={"ss": [1, 2, 3]}) is None + assert _decode_optional_wire_source_stats(obj={"ss": [1, 2, -1, 0]}) is None def test_cache_helpers_cover_invalid_analysis_profile_and_source_stats_shapes() -> None: + assert _decode_wire_qualname_span_size(["pkg.mod:fn", 1, 2, "bad"]) is None + assert _decode_wire_qualname_span_size([None, 1, 2, 4]) is None assert ( - 
cache_mod._decode_wire_qualname_span_size(["pkg.mod:fn", 1, 2, "bad"]) is None - ) - assert cache_mod._decode_wire_qualname_span_size([None, 1, 2, 4]) is None - assert ( - cache_mod._as_analysis_profile( + _as_analysis_profile( { "min_loc": 1, "min_stmt": 1, @@ -1452,16 +1701,13 @@ def test_cache_helpers_cover_invalid_analysis_profile_and_source_stats_shapes() ) is None ) - assert ( - cache_mod._decode_optional_wire_source_stats(obj={"ss": [1, 2, "bad", 0]}) - is None - ) + assert _decode_optional_wire_source_stats(obj={"ss": [1, 2, "bad", 0]}) is None def test_canonicalize_cache_entry_skips_invalid_dead_candidate_suppression_shape() -> ( None ): - normalized = cache_mod._canonicalize_cache_entry( + normalized = _canonicalize_cache_entry( cast( Any, { @@ -1503,7 +1749,7 @@ def test_canonicalize_cache_entry_skips_invalid_dead_candidate_suppression_shape def test_decode_optional_wire_coupled_classes_rejects_non_string_qualname() -> None: assert ( - cache_mod._decode_optional_wire_coupled_classes( + _decode_optional_wire_coupled_classes( obj={"cc": [[1, ["A"]]]}, key="cc", ) @@ -1512,7 +1758,7 @@ def test_decode_optional_wire_coupled_classes_rejects_non_string_qualname() -> N def test_decode_wire_file_entry_skips_empty_coupled_classes_mapping() -> None: - decoded = cache_mod._decode_wire_file_entry( + decoded = _decode_wire_file_entry( { "st": [1, 2], "cm": [["pkg.mod:Service", 1, 10, 3, 2, 4, 1, "low", "medium"]], @@ -1525,46 +1771,46 @@ def test_decode_wire_file_entry_skips_empty_coupled_classes_mapping() -> None: def test_decode_wire_metrics_items_and_deps_roundtrip_shape() -> None: - class_metric = cache_mod._decode_wire_class_metric( + class_metric = _decode_wire_class_metric( ["pkg.mod:Service", 1, 10, 3, 2, 4, 1, "low", "medium"], "x.py", ) assert class_metric is not None assert class_metric["filepath"] == "x.py" assert ( - cache_mod._decode_wire_class_metric( + _decode_wire_class_metric( ["pkg.mod:Service", "1", 10, 3, 2, 4, 1, "low", "medium"], "x.py", ) is None ) - module_dep = cache_mod._decode_wire_module_dep(["a", "b", "import", 1]) + module_dep = _decode_wire_module_dep(["a", "b", "import", 1]) assert module_dep is not None assert module_dep["source"] == "a" - assert cache_mod._decode_wire_module_dep(["a", "b", "import", "1"]) is None + assert _decode_wire_module_dep(["a", "b", "import", "1"]) is None - dead_candidate = cache_mod._decode_wire_dead_candidate( + dead_candidate = _decode_wire_dead_candidate( ["pkg.mod:unused", "unused", 1, 2, "function"], "fallback.py", ) assert dead_candidate is not None assert dead_candidate["filepath"] == "fallback.py" assert ( - cache_mod._decode_wire_dead_candidate( + _decode_wire_dead_candidate( ["pkg.mod:unused", "unused", "1", 2, "function"], "fallback.py", ) is None ) assert ( - cache_mod._decode_wire_dead_candidate( + _decode_wire_dead_candidate( ["pkg.mod:unused", "unused", 1, 2, "function", "legacy.py"], "fallback.py", ) is None ) - dead_candidate_with_suppression = cache_mod._decode_wire_dead_candidate( + dead_candidate_with_suppression = _decode_wire_dead_candidate( ["pkg.mod:unused", "unused", 1, 2, "function", ["dead-code", "dead-code"]], "fallback.py", ) @@ -1573,7 +1819,7 @@ def test_decode_wire_metrics_items_and_deps_roundtrip_shape() -> None: def test_encode_wire_file_entry_includes_optional_metrics_sections() -> None: - entry: cache_mod.CacheEntry = { + entry: CacheEntry = { "stat": {"mtime_ns": 1, "size": 2}, "units": [], "blocks": [], @@ -1601,7 +1847,7 @@ def test_encode_wire_file_entry_includes_optional_metrics_sections() 
-> None: "import_names": ["z", "a"], "class_names": ["B", "A"], } - wire = cache_mod._encode_wire_file_entry(entry) + wire = _encode_wire_file_entry(entry) assert "cm" in wire assert "cc" in wire assert "md" in wire @@ -1611,7 +1857,7 @@ def test_encode_wire_file_entry_includes_optional_metrics_sections() -> None: def test_encode_wire_file_entry_compacts_dead_candidate_filepaths() -> None: - entry: cache_mod.CacheEntry = { + entry: CacheEntry = { "stat": {"mtime_ns": 1, "size": 2}, "units": [], "blocks": [], @@ -1632,12 +1878,12 @@ def test_encode_wire_file_entry_compacts_dead_candidate_filepaths() -> None: "import_names": [], "class_names": [], } - wire = cache_mod._encode_wire_file_entry(entry) + wire = _encode_wire_file_entry(entry) assert wire["dc"] == [["pkg.mod:unused", "unused", 3, 4, "function"]] def test_encode_wire_file_entry_encodes_dead_candidate_suppressions() -> None: - entry: cache_mod.CacheEntry = { + entry: CacheEntry = { "stat": {"mtime_ns": 1, "size": 2}, "units": [], "blocks": [], @@ -1659,12 +1905,12 @@ def test_encode_wire_file_entry_encodes_dead_candidate_suppressions() -> None: "import_names": [], "class_names": [], } - wire = cache_mod._encode_wire_file_entry(entry) + wire = _encode_wire_file_entry(entry) assert wire["dc"] == [["pkg.mod:unused", "unused", 3, 4, "function", ["dead-code"]]] def test_encode_wire_file_entry_skips_empty_or_invalid_coupled_classes() -> None: - entry: cache_mod.CacheEntry = { + entry: CacheEntry = { "stat": {"mtime_ns": 1, "size": 2}, "units": [], "blocks": [], @@ -1703,7 +1949,7 @@ def test_encode_wire_file_entry_skips_empty_or_invalid_coupled_classes() -> None "import_names": [], "class_names": [], } - wire = cache_mod._encode_wire_file_entry(entry) + wire = _encode_wire_file_entry(entry) assert "cc" not in wire @@ -1764,7 +2010,7 @@ def test_get_file_entry_sorts_coupled_classes_in_runtime_payload( def test_cache_entry_container_shape_rejects_invalid_source_stats() -> None: assert ( - cache_mod._has_cache_entry_container_shape( + _has_cache_entry_container_shape( { "stat": {"mtime_ns": 1, "size": 1}, "source_stats": { @@ -1783,11 +2029,11 @@ def test_cache_entry_container_shape_rejects_invalid_source_stats() -> None: def test_cache_type_predicates_reject_non_dict_variants() -> None: - assert cache_mod._is_class_metrics_dict([]) is False - assert cache_mod._is_module_dep_dict([]) is False - assert cache_mod._is_dead_candidate_dict([]) is False + assert _is_class_metrics_dict([]) is False + assert _is_module_dep_dict([]) is False + assert _is_dead_candidate_dict([]) is False assert ( - cache_mod._is_dead_candidate_dict( + _is_dead_candidate_dict( { "qualname": "pkg.mod:broken", "local_name": "broken", @@ -1799,7 +2045,7 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: is False ) assert ( - cache_mod._is_dead_candidate_dict( + _is_dead_candidate_dict( { "qualname": "pkg.mod:unused", "local_name": "unused", @@ -1813,7 +2059,7 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: is True ) assert ( - cache_mod._is_dead_candidate_dict( + _is_dead_candidate_dict( { "qualname": "pkg.mod:unused", "local_name": "unused", @@ -1827,7 +2073,7 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: is False ) assert ( - cache_mod._is_class_metrics_dict( + _is_class_metrics_dict( { "qualname": "pkg.mod:Service", "filepath": "x.py", @@ -1844,7 +2090,7 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: is True ) assert ( - cache_mod._is_class_metrics_dict( + _is_class_metrics_dict( { 
"qualname": "pkg.mod:Service", "filepath": "x.py", @@ -1862,7 +2108,7 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: is True ) assert ( - cache_mod._is_class_metrics_dict( + _is_class_metrics_dict( { "qualname": "pkg.mod:Service", "filepath": "x.py", @@ -1879,9 +2125,9 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: ) is False ) - assert cache_mod._is_class_metrics_dict({"qualname": "pkg.mod:Service"}) is False + assert _is_class_metrics_dict({"qualname": "pkg.mod:Service"}) is False assert ( - cache_mod._is_module_dep_dict( + _is_module_dep_dict( { "source": "a", "target": "b", @@ -1894,12 +2140,12 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: def test_decode_wire_int_fields_rejects_non_int_values() -> None: - assert cache_mod._decode_wire_int_fields(["x", "nope"], 1) is None + assert _decode_wire_int_fields(["x", "nope"], 1) is None def test_decode_wire_block_rejects_missing_block_hash() -> None: assert ( - cache_mod._decode_wire_block( + _decode_wire_block( ["pkg.mod:func", 10, 12, 4, None], "pkg/mod.py", ) @@ -1909,7 +2155,7 @@ def test_decode_wire_block_rejects_missing_block_hash() -> None: def test_decode_wire_segment_rejects_missing_segment_signature() -> None: assert ( - cache_mod._decode_wire_segment( + _decode_wire_segment( ["pkg.mod:func", 10, 12, 4, "seg-hash", None], "pkg/mod.py", ) @@ -1918,4 +2164,4 @@ def test_decode_wire_segment_rejects_missing_segment_signature() -> None: def test_decode_wire_dead_candidate_rejects_invalid_rows() -> None: - assert cache_mod._decode_wire_dead_candidate(object(), "pkg/mod.py") is None + assert _decode_wire_dead_candidate(object(), "pkg/mod.py") is None diff --git a/tests/test_cfg.py b/tests/test_cfg.py index 28a423e..7ea6c74 100644 --- a/tests/test_cfg.py +++ b/tests/test_cfg.py @@ -9,12 +9,12 @@ import pytest -from codeclone.cfg import CFG, CFGBuilder -from codeclone.cfg_model import CFG as CFGModel -from codeclone.cfg_model import Block -from codeclone.extractor import _cfg_fingerprint_and_complexity +from codeclone.analysis.cfg import CFG, CFGBuilder +from codeclone.analysis.cfg_model import CFG as CFGModel +from codeclone.analysis.cfg_model import Block +from codeclone.analysis.fingerprint import _cfg_fingerprint_and_complexity +from codeclone.analysis.normalizer import NormalizationConfig from codeclone.meta_markers import CFG_META_PREFIX -from codeclone.normalize import NormalizationConfig from tests._ast_helpers import fix_missing_single_function diff --git a/tests/test_cfg_model.py b/tests/test_cfg_model.py index 36c4eee..f6cb564 100644 --- a/tests/test_cfg_model.py +++ b/tests/test_cfg_model.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -from codeclone.cfg_model import CFG, Block +from codeclone.analysis.cfg_model import CFG, Block def test_block_hash_and_eq() -> None: diff --git a/tests/test_cli_config.py b/tests/test_cli_config.py index 365d5f1..790f7e6 100644 --- a/tests/test_cli_config.py +++ b/tests/test_cli_config.py @@ -13,8 +13,10 @@ import pytest -import codeclone._cli_config as cfg_mod -from codeclone._cli_config import ConfigValidationError +import codeclone.config.pyproject_loader as loader_mod +import codeclone.config.resolver as resolver_mod +import codeclone.config.spec as spec_mod +from codeclone.config.pyproject_loader import ConfigValidationError def _write_pyproject(path: Path, content: str) -> None: @@ -26,7 +28,7 @@ def test_collect_explicit_cli_dests_stops_on_double_dash() -> None: 
parser.add_argument("--min-loc", dest="min_loc", type=int, default=20) parser.add_argument("--quiet", action="store_true") parser.add_argument("--json", dest="json_out") - explicit = cfg_mod.collect_explicit_cli_dests( + explicit = resolver_mod.collect_explicit_cli_dests( parser, argv=("--min-loc=10", "--quiet", "--", "--json", "report.json"), ) @@ -34,7 +36,7 @@ def test_collect_explicit_cli_dests_stops_on_double_dash() -> None: def test_load_pyproject_config_missing_file_returns_empty(tmp_path: Path) -> None: - assert cfg_mod.load_pyproject_config(tmp_path) == {} + assert loader_mod.load_pyproject_config(tmp_path) == {} def test_load_pyproject_config_raises_on_loader_errors( @@ -46,19 +48,19 @@ def test_load_pyproject_config_raises_on_loader_errors( def _raise_oserror(_path: Path) -> object: raise OSError("denied") - monkeypatch.setattr(cfg_mod, "_load_toml", _raise_oserror) + monkeypatch.setattr(loader_mod, "_load_toml", _raise_oserror) with pytest.raises( ConfigValidationError, match=r"Cannot read pyproject\.toml", ): - cfg_mod.load_pyproject_config(tmp_path) + loader_mod.load_pyproject_config(tmp_path) def _raise_value_error(_path: Path) -> object: raise ValueError("broken") - monkeypatch.setattr(cfg_mod, "_load_toml", _raise_value_error) + monkeypatch.setattr(loader_mod, "_load_toml", _raise_value_error) with pytest.raises(ConfigValidationError, match="Invalid TOML"): - cfg_mod.load_pyproject_config(tmp_path) + loader_mod.load_pyproject_config(tmp_path) def test_load_pyproject_config_validates_tool_structure( @@ -67,31 +69,37 @@ def test_load_pyproject_config_validates_tool_structure( pyproject = tmp_path / "pyproject.toml" _write_pyproject(pyproject, "[tool]\n") - monkeypatch.setattr(cfg_mod, "_load_toml", lambda _path: []) + monkeypatch.setattr(loader_mod, "_load_toml", lambda _path: []) with pytest.raises(ConfigValidationError, match="root must be object"): - cfg_mod.load_pyproject_config(tmp_path) + loader_mod.load_pyproject_config(tmp_path) - monkeypatch.setattr(cfg_mod, "_load_toml", lambda _path: {"tool": "bad"}) + monkeypatch.setattr(loader_mod, "_load_toml", lambda _path: {"tool": "bad"}) with pytest.raises(ConfigValidationError, match="'tool' must be object"): - cfg_mod.load_pyproject_config(tmp_path) + loader_mod.load_pyproject_config(tmp_path) monkeypatch.setattr( - cfg_mod, "_load_toml", lambda _path: {"tool": {"codeclone": []}} + loader_mod, + "_load_toml", + lambda _path: {"tool": {"codeclone": []}}, ) with pytest.raises( ConfigValidationError, match=r"'tool\.codeclone' must be object", ): - cfg_mod.load_pyproject_config(tmp_path) + loader_mod.load_pyproject_config(tmp_path) - monkeypatch.setattr(cfg_mod, "_load_toml", lambda _path: {"tool": {}}) - assert cfg_mod.load_pyproject_config(tmp_path) == {} + monkeypatch.setattr(loader_mod, "_load_toml", lambda _path: {"tool": {}}) + assert loader_mod.load_pyproject_config(tmp_path) == {} - monkeypatch.setattr(cfg_mod, "_load_toml", lambda _path: {"tool": None}) - assert cfg_mod.load_pyproject_config(tmp_path) == {} + monkeypatch.setattr(loader_mod, "_load_toml", lambda _path: {"tool": None}) + assert loader_mod.load_pyproject_config(tmp_path) == {} - monkeypatch.setattr(cfg_mod, "_load_toml", lambda _path: {"tool": {"other": {}}}) - assert cfg_mod.load_pyproject_config(tmp_path) == {} + monkeypatch.setattr( + loader_mod, + "_load_toml", + lambda _path: {"tool": {"other": {}}}, + ) + assert loader_mod.load_pyproject_config(tmp_path) == {} def test_load_pyproject_config_unknown_key_rejected( @@ -100,12 +108,12 @@ def 
test_load_pyproject_config_unknown_key_rejected( pyproject = tmp_path / "pyproject.toml" _write_pyproject(pyproject, "[tool]\n") monkeypatch.setattr( - cfg_mod, + loader_mod, "_load_toml", lambda _path: {"tool": {"codeclone": {"unknown_option": 1}}}, ) with pytest.raises(ConfigValidationError, match="Unknown key\\(s\\)"): - cfg_mod.load_pyproject_config(tmp_path) + loader_mod.load_pyproject_config(tmp_path) def test_load_pyproject_config_normalizes_relative_and_absolute_paths( @@ -122,7 +130,7 @@ def test_load_pyproject_config_normalizes_relative_and_absolute_paths( sarif_out = "reports/report.sarif" """.strip(), ) - loaded = cfg_mod.load_pyproject_config(tmp_path) + loaded = loader_mod.load_pyproject_config(tmp_path) assert loaded["min_loc"] == 5 assert loaded["cache_path"] == str(tmp_path / ".cache/codeclone/cache.json") assert loaded["json_out"] == "/tmp/report.json" @@ -132,7 +140,7 @@ def test_load_pyproject_config_normalizes_relative_and_absolute_paths( def test_apply_pyproject_config_overrides_respects_explicit_cli_flags() -> None: args = argparse.Namespace(min_loc=10, quiet=False) - cfg_mod.apply_pyproject_config_overrides( + resolver_mod.apply_pyproject_config_overrides( args=args, config_values={"min_loc": 42, "quiet": True}, explicit_cli_dests={"quiet"}, @@ -158,7 +166,7 @@ def test_apply_pyproject_config_overrides_respects_explicit_cli_flags() -> None: def test_validate_config_value_accepts_expected_types( key: str, value: object, expected: object ) -> None: - assert cfg_mod._validate_config_value(key=key, value=value) == expected + assert loader_mod.validate_config_value(key=key, value=value) == expected @pytest.mark.parametrize( @@ -168,7 +176,11 @@ def test_validate_config_value_accepts_expected_types( ("update_baseline", "yes", "expected bool"), ("min_loc", True, "expected int"), ("baseline", 1, "expected str"), - ("golden_fixture_paths", "tests/fixtures/golden_*", "expected list\\[str\\]"), + ( + "golden_fixture_paths", + "tests/fixtures/golden_*", + "expected list\\[str\\]", + ), ( "golden_fixture_paths", ["tests/fixtures/golden_*", 1], @@ -181,24 +193,24 @@ def test_validate_config_value_rejects_invalid_types( key: str, value: object, error_fragment: str ) -> None: with pytest.raises(ConfigValidationError, match=error_fragment): - cfg_mod._validate_config_value(key=key, value=value) + loader_mod.validate_config_value(key=key, value=value) def test_validate_config_value_unsupported_spec_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setitem( - cfg_mod._CONFIG_KEY_SPECS, + loader_mod.CONFIG_KEY_SPECS, "_unsupported", - cfg_mod._ConfigKeySpec(tuple), + spec_mod.ConfigKeySpec(tuple), ) with pytest.raises(ConfigValidationError, match="Unsupported config key spec"): - cfg_mod._validate_config_value(key="_unsupported", value=("x",)) + loader_mod.validate_config_value(key="_unsupported", value=("x",)) def test_normalize_path_config_value_behaviour(tmp_path: Path) -> None: assert ( - cfg_mod._normalize_path_config_value( + loader_mod.normalize_path_config_value( key="min_loc", value=10, root_path=tmp_path, @@ -206,20 +218,20 @@ def test_normalize_path_config_value_behaviour(tmp_path: Path) -> None: == 10 ) assert ( - cfg_mod._normalize_path_config_value( + loader_mod.normalize_path_config_value( key="cache_path", value=123, root_path=tmp_path, ) == 123 ) - assert cfg_mod._normalize_path_config_value( + assert loader_mod.normalize_path_config_value( key="cache_path", value="relative/cache.json", root_path=tmp_path, ) == str(tmp_path / "relative/cache.json") assert 
( - cfg_mod._normalize_path_config_value( + loader_mod.normalize_path_config_value( key="cache_path", value="/tmp/absolute-cache.json", root_path=tmp_path, @@ -228,7 +240,7 @@ def test_normalize_path_config_value_behaviour(tmp_path: Path) -> None: ) patterns = ("tests/fixtures/golden_*",) assert ( - cfg_mod._normalize_path_config_value( + loader_mod.normalize_path_config_value( key="golden_fixture_paths", value=patterns, root_path=tmp_path, @@ -248,7 +260,7 @@ def test_load_pyproject_config_accepts_golden_fixture_paths(tmp_path: Path) -> N ] """.strip(), ) - loaded = cfg_mod.load_pyproject_config(tmp_path) + loaded = loader_mod.load_pyproject_config(tmp_path) assert loaded["golden_fixture_paths"] == ("tests/fixtures/golden_*",) @@ -257,18 +269,18 @@ def test_load_toml_py310_missing_tomli_raises( ) -> None: toml_path = tmp_path / "pyproject.toml" _write_pyproject(toml_path, "[tool]\n") - monkeypatch.setattr(cfg_mod, "sys", SimpleNamespace(version_info=(3, 10, 14))) + monkeypatch.setattr(loader_mod, "sys", SimpleNamespace(version_info=(3, 10, 14))) def _raise_module_not_found(_name: str) -> object: raise ModuleNotFoundError("tomli") monkeypatch.setattr( - cfg_mod, + loader_mod, "importlib", SimpleNamespace(import_module=_raise_module_not_found), ) with pytest.raises(ConfigValidationError, match="requires dependency 'tomli'"): - cfg_mod._load_toml(toml_path) + loader_mod._load_toml(toml_path) def test_load_toml_py310_invalid_tomli_module_raises( @@ -276,14 +288,14 @@ def test_load_toml_py310_invalid_tomli_module_raises( ) -> None: toml_path = tmp_path / "pyproject.toml" _write_pyproject(toml_path, "[tool]\n") - monkeypatch.setattr(cfg_mod, "sys", SimpleNamespace(version_info=(3, 10, 14))) + monkeypatch.setattr(loader_mod, "sys", SimpleNamespace(version_info=(3, 10, 14))) monkeypatch.setattr( - cfg_mod, + loader_mod, "importlib", SimpleNamespace(import_module=lambda _name: object()), ) with pytest.raises(ConfigValidationError, match="missing callable 'load'"): - cfg_mod._load_toml(toml_path) + loader_mod._load_toml(toml_path) def test_load_toml_py310_uses_tomli_load( @@ -291,7 +303,7 @@ def test_load_toml_py310_uses_tomli_load( ) -> None: toml_path = tmp_path / "pyproject.toml" _write_pyproject(toml_path, "[tool]\n") - monkeypatch.setattr(cfg_mod, "sys", SimpleNamespace(version_info=(3, 10, 14))) + monkeypatch.setattr(loader_mod, "sys", SimpleNamespace(version_info=(3, 10, 14))) class _FakeTomli: @staticmethod @@ -301,8 +313,8 @@ def load(file_obj: Any) -> dict[str, object]: return {"tool": {}} monkeypatch.setattr( - cfg_mod, + loader_mod, "importlib", SimpleNamespace(import_module=lambda _name: _FakeTomli), ) - assert cfg_mod._load_toml(toml_path) == {"tool": {}} + assert loader_mod._load_toml(toml_path) == {"tool": {}} diff --git a/tests/test_cli_help_snapshot.py b/tests/test_cli_help_snapshot.py new file mode 100644 index 0000000..ce9e818 --- /dev/null +++ b/tests/test_cli_help_snapshot.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path + +from tests._contract_snapshots import load_text_snapshot + + +def test_cli_help_snapshot() -> None: + root_dir = Path(__file__).resolve().parents[1] + env = os.environ.copy() + env["PYTHONPATH"] = str(root_dir) + os.pathsep + env.get("PYTHONPATH", "") + result = subprocess.run( + [sys.executable, "-m", "codeclone.main", "--help"], + capture_output=True, + text=True, + env=env, + check=False, + ) + + assert result.returncode == 0 + assert result.stderr == "" + assert 
result.stdout.replace("\r\n", "\n") == load_text_snapshot("cli_help.txt") diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index c7f2f30..9ceaab2 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -16,21 +16,29 @@ import pytest -import codeclone._cli_meta as cli_meta -import codeclone._cli_reports as cli_reports import codeclone.baseline as baseline -import codeclone.pipeline as pipeline -from codeclone import __version__, cli -from codeclone._cli_gating import parse_metric_reason_entry -from codeclone.cache import Cache, file_stat_signature +import codeclone.baseline.trust as baseline_trust +import codeclone.core.discovery as core_discovery +import codeclone.core.parallelism as core_parallelism +import codeclone.core.pipeline as core_pipeline +import codeclone.core.worker as core_worker +import codeclone.surfaces.cli.report_meta as cli_meta +import codeclone.surfaces.cli.reports_output as cli_reports +import codeclone.surfaces.cli.tips as cli_tips +import codeclone.surfaces.cli.workflow as cli +from codeclone import __version__ +from codeclone.cache.store import Cache, file_stat_signature from codeclone.contracts import ( BASELINE_FINGERPRINT_VERSION, BASELINE_SCHEMA_VERSION, CACHE_VERSION, REPORT_SCHEMA_VERSION, ) -from codeclone.errors import CacheError +from codeclone.contracts.errors import CacheError +from codeclone.core._types import FileProcessResult as CliFileProcessResult +from codeclone.core.parallelism import _parallel_min_files from codeclone.models import Unit +from codeclone.report.gates.reasons import parse_metric_reason_entry from tests._assertions import ( assert_contains_all, assert_mapping_entries, @@ -167,8 +175,8 @@ def _patch_dummy_progress(monkeypatch: pytest.MonkeyPatch) -> None: def _patch_parallel(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _DummyExecutor) - monkeypatch.setattr(pipeline, "as_completed", lambda futures: futures) + monkeypatch.setattr(core_parallelism, "ProcessPoolExecutor", _DummyExecutor) + monkeypatch.setattr(core_parallelism, "as_completed", lambda futures: futures) def _run_main(monkeypatch: pytest.MonkeyPatch, args: Iterable[str]) -> None: @@ -261,11 +269,11 @@ def _patch_fixed_executor( monkeypatch: pytest.MonkeyPatch, future: _FixedFuture ) -> None: monkeypatch.setattr( - pipeline, + core_parallelism, "ProcessPoolExecutor", lambda *args, **kwargs: _FixedExecutor(future), ) - monkeypatch.setattr(pipeline, "as_completed", lambda futures: futures) + monkeypatch.setattr(core_parallelism, "as_completed", lambda futures: futures) def _baseline_payload( @@ -309,7 +317,7 @@ def _baseline_payload( and isinstance(meta_python_tag, str) and payload_sha256 is None ): - hash_value = baseline._compute_payload_sha256( + hash_value = baseline_trust._compute_payload_sha256( functions=set(function_list), blocks=set(block_list), fingerprint_version=meta_fingerprint, @@ -519,6 +527,9 @@ def put_file_entry( def save(self) -> None: return None + def prune_file_entries(self, existing_filepaths: object) -> int: + return 0 + monkeypatch.setattr(cli, "Cache", _CacheStub) _write_default_source(tmp_path) _run_parallel_main(monkeypatch, [str(tmp_path), *extra_args, "--no-progress"]) @@ -534,7 +545,7 @@ def _assert_worker_failure_internal_error( ) -> None: _write_default_source(tmp_path) - def _boom(*_args: object, **_kwargs: object) -> cli.ProcessingResult: + def _boom(*_args: object, **_kwargs: object) -> CliFileProcessResult: raise RuntimeError("boom") class 
_FailExec: @@ -554,8 +565,8 @@ def __exit__( if not no_progress: _patch_dummy_progress(monkeypatch) - monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailExec) - monkeypatch.setattr(pipeline, "process_file", _boom) + monkeypatch.setattr(core_parallelism, "ProcessPoolExecutor", _FailExec) + monkeypatch.setattr(core_worker, "process_file", _boom) args = [str(tmp_path)] if no_progress: args.append("--no-progress") @@ -686,8 +697,8 @@ def _prepare_single_source_cache(tmp_path: Path) -> tuple[Path, Path, Cache]: return src, cache_path, Cache(cache_path) -def _source_read_error_result(filepath: str) -> cli.ProcessingResult: - return cli.ProcessingResult( +def _source_read_error_result(filepath: str) -> CliFileProcessResult: + return CliFileProcessResult( filepath=filepath, success=False, error="Cannot read file: [Errno 13] Permission denied", @@ -801,6 +812,9 @@ def put_file_entry( def save(self) -> None: return None + def prune_file_entries(self, existing_filepaths: object) -> int: + return 0 + monkeypatch.setattr(cli, "Cache", _CacheStub) _patch_parallel(monkeypatch) _run_main(monkeypatch, [str(root1), "--no-progress"]) @@ -821,7 +835,7 @@ def test_cli_cache_not_shared_between_projects( legacy_cache.parent.mkdir(parents=True, exist_ok=True) legacy_cache.write_text("{}", "utf-8") - monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: []) + monkeypatch.setattr(core_discovery, "iter_py_files", lambda _root: []) _patch_parallel(monkeypatch) _run_main(monkeypatch, [str(root2), "--no-progress"]) out = capsys.readouterr().out @@ -966,6 +980,9 @@ def put_file_entry( def save(self) -> None: return None + def prune_file_entries(self, existing_filepaths: object) -> int: + return 0 + monkeypatch.setattr(cli, "LEGACY_CACHE_PATH", _LegacyPathSame(cache_path)) monkeypatch.setattr(cli, "Cache", _CacheStub) _patch_parallel(monkeypatch) @@ -1015,6 +1032,9 @@ def put_file_entry( def save(self) -> None: return None + def prune_file_entries(self, existing_filepaths: object) -> int: + return 0 + monkeypatch.setattr(cli, "Cache", _CacheStub) _patch_parallel(monkeypatch) _run_main( @@ -1037,10 +1057,10 @@ def test_cli_main_progress_fallback( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - for idx in range(pipeline._parallel_min_files(2) + 1): + for idx in range(_parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") - monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailingExecutor) + monkeypatch.setattr(core_parallelism, "ProcessPoolExecutor", _FailingExecutor) _run_main(monkeypatch, [str(tmp_path), "--processes", "2"]) out = capsys.readouterr().out assert "falling back to sequential" in out @@ -1051,10 +1071,10 @@ def test_cli_main_no_progress_fallback( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - for idx in range(pipeline._parallel_min_files(2) + 1): + for idx in range(_parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") - monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailingExecutor) + monkeypatch.setattr(core_parallelism, "ProcessPoolExecutor", _FailingExecutor) _run_main(monkeypatch, [str(tmp_path), "--processes", "2", "--no-progress"]) out = capsys.readouterr().out assert "falling back to sequential" in out @@ -1071,7 +1091,7 @@ def test_cli_main_no_progress_fallback_quiet( tmp_path / "baseline.json", python_version=f"{sys.version_info.major}.{sys.version_info.minor}", ) - monkeypatch.setattr(pipeline, 
"ProcessPoolExecutor", _FailingExecutor) + monkeypatch.setattr(core_parallelism, "ProcessPoolExecutor", _FailingExecutor) _run_main( monkeypatch, [ @@ -1129,7 +1149,7 @@ def _boom(*_args: object, **_kwargs: object) -> object: raise RuntimeError("boom") _patch_parallel(monkeypatch) - monkeypatch.setattr(pipeline, "build_groups", _boom) + monkeypatch.setattr(core_pipeline, "build_groups", _boom) with pytest.raises(SystemExit) as exc: _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) assert exc.value.code == 5 @@ -1598,11 +1618,49 @@ def test_cli_legacy_baseline_fail_on_new_fails_fast_exit_2( out, "legacy (<=1.3.x)", "Invalid baseline file", - "CI requires a trusted baseline", + "Baseline-aware gates require a trusted baseline", "Run: codeclone . --update-baseline", ) +def test_cli_shared_baseline_mismatch_is_reported_once_without_ci_label( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + _write_default_source(tmp_path) + mismatch_tag = "cp313" if baseline.current_python_tag() != "cp313" else "cp314" + _write_baseline( + tmp_path / "codeclone.baseline.json", + python_version="3.13" if mismatch_tag == "cp313" else "3.14", + python_tag=mismatch_tag, + ) + (tmp_path / "pyproject.toml").write_text( + """ +[tool.codeclone] +fail_on_new = true +fail_on_new_metrics = true +""".strip() + + "\n", + "utf-8", + ) + + _assert_parallel_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--html", + "--no-progress", + ], + expected_code=2, + ) + + out = capsys.readouterr().out + assert out.count("Invalid baseline file") == 1 + assert "CI requires a trusted baseline" not in out + assert "Baseline-aware gates require a trusted baseline" in out + + def test_cli_reports_include_audit_metadata_integrity_failed( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -2242,6 +2300,35 @@ def test_cli_outputs_quiet_no_print( assert "report saved" not in out +def test_cli_shows_vscode_extension_tip_once_per_version( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + _write_default_source(tmp_path) + tips_path = tmp_path / ".cache" / "codeclone" / "tips.json" + + monkeypatch.setenv("TERM_PROGRAM", "vscode") + monkeypatch.delenv("CI", raising=False) + monkeypatch.delenv("GITHUB_ACTIONS", raising=False) + monkeypatch.setattr(cli_tips, "_stream_is_tty", lambda _stream: True) + + _run_parallel_main(monkeypatch, [str(tmp_path), "--no-progress", "--no-color"]) + first_out = capsys.readouterr().out + + assert "VS Code detected" in first_out + assert "marketplace.visualstudio.com" in first_out + assert first_out.index("Summary") < first_out.index("Tip:") + + state = json.loads(tips_path.read_text("utf-8")) + assert state["tips"]["vscode_extension"]["last_shown_version"] == __version__ + + _run_parallel_main(monkeypatch, [str(tmp_path), "--no-progress", "--no-color"]) + second_out = capsys.readouterr().out + + assert "VS Code detected" not in second_out + + def test_cli_update_baseline_skips_version_check( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, @@ -3067,7 +3154,7 @@ def test_cli_discovery_skip_oserror( def _bad_stat(_path: str) -> dict[str, int]: raise OSError("nope") - monkeypatch.setattr(pipeline, "file_stat_signature", _bad_stat) + monkeypatch.setattr(core_discovery, "file_stat_signature", _bad_stat) _patch_parallel(monkeypatch) args = [str(tmp_path), *extra_args] if "--ci" in extra_args: @@ -3103,10 +3190,10 @@ def test_cli_unreadable_source_normal_mode_warns_and_continues( def _source_read_error( fp: 
str, *_args: object, **_kwargs: object - ) -> cli.ProcessingResult: + ) -> CliFileProcessResult: return _source_read_error_result(fp) - monkeypatch.setattr(pipeline, "process_file", _source_read_error) + monkeypatch.setattr(core_worker, "process_file", _source_read_error) _run_parallel_main( monkeypatch, [ @@ -3137,10 +3224,10 @@ def test_cli_unreadable_source_fails_in_ci_with_contract_error( def _source_read_error( fp: str, *_args: object, **_kwargs: object - ) -> cli.ProcessingResult: + ) -> CliFileProcessResult: return _source_read_error_result(fp) - monkeypatch.setattr(pipeline, "process_file", _source_read_error) + monkeypatch.setattr(core_worker, "process_file", _source_read_error) _patch_parallel(monkeypatch) with pytest.raises(SystemExit) as exc: _run_main( @@ -3196,7 +3283,7 @@ def test_cli_contract_error_priority_over_gating_failure_for_unreadable_source( def _source_read_error( fp: str, *_args: object, **_kwargs: object - ) -> cli.ProcessingResult: + ) -> CliFileProcessResult: return _source_read_error_result(fp) def _diff( @@ -3204,7 +3291,7 @@ def _diff( ) -> tuple[set[str], set[str]]: return {"f1"}, set() - monkeypatch.setattr(pipeline, "process_file", _source_read_error) + monkeypatch.setattr(core_worker, "process_file", _source_read_error) monkeypatch.setattr(baseline.Baseline, "diff", _diff) _patch_parallel(monkeypatch) with pytest.raises(SystemExit) as exc: @@ -3241,10 +3328,10 @@ def test_cli_unreadable_source_ci_shows_overflow_summary( def _source_read_error( fp: str, *_args: object, **_kwargs: object - ) -> cli.ProcessingResult: + ) -> CliFileProcessResult: return _source_read_error_result(fp) - monkeypatch.setattr(pipeline, "process_file", _source_read_error) + monkeypatch.setattr(core_worker, "process_file", _source_read_error) _patch_parallel(monkeypatch) with pytest.raises(SystemExit) as exc: _run_main( @@ -3425,7 +3512,16 @@ def test_cli_summary_with_metrics_baseline_shows_metrics_section( ], ) out = capsys.readouterr().out - assert_contains_all(out, "Metrics", "Adoption", "Overloaded") + assert_contains_all( + out, + "Metrics", + "Dependencies", + "avg", + "p95", + "max", + "Adoption", + "Overloaded", + ) def test_cli_summary_with_api_surface_shows_public_api_line( @@ -3481,7 +3577,17 @@ def test_cli_ci_summary_includes_adoption_and_public_api_lines( ], ) out = capsys.readouterr().out - assert_contains_all(out, "Adoption", "Public API", "symbols=", "docstrings=") + assert_contains_all( + out, + "Dependencies", + "avg=", + "p95=", + "max=", + "Adoption", + "Public API", + "symbols=", + "docstrings=", + ) def test_cli_pyproject_golden_fixture_paths_exclude_fixture_clone_groups( @@ -3664,7 +3770,7 @@ def test_cli_scan_failed_is_internal_error( def _boom(_root: str) -> Iterable[str]: raise RuntimeError("scan failed") - monkeypatch.setattr(pipeline, "iter_py_files", _boom) + monkeypatch.setattr(core_discovery, "iter_py_files", _boom) with pytest.raises(SystemExit) as exc: _run_main(monkeypatch, [str(tmp_path)]) assert exc.value.code == 5 @@ -3680,7 +3786,7 @@ def test_cli_scan_oserror_is_contract_error( def _boom(_root: str) -> Iterable[str]: raise OSError("scan denied") - monkeypatch.setattr(pipeline, "iter_py_files", _boom) + monkeypatch.setattr(core_discovery, "iter_py_files", _boom) with pytest.raises(SystemExit) as exc: _run_main(monkeypatch, [str(tmp_path)]) assert exc.value.code == 2 @@ -3699,10 +3805,10 @@ def test_cli_failed_files_report( def _bad_process( _fp: str, *_args: object, **_kwargs: object - ) -> cli.ProcessingResult: - return 
cli.ProcessingResult(filepath=_fp, success=False, error="bad") + ) -> CliFileProcessResult: + return CliFileProcessResult(filepath=_fp, success=False, error="bad") - monkeypatch.setattr(pipeline, "process_file", _bad_process) + monkeypatch.setattr(core_worker, "process_file", _bad_process) _patch_parallel(monkeypatch) _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) out = capsys.readouterr().out @@ -3720,10 +3826,10 @@ def test_cli_failed_files_report_single( def _bad_process( _fp: str, *_args: object, **_kwargs: object - ) -> cli.ProcessingResult: - return cli.ProcessingResult(filepath=_fp, success=False, error="bad") + ) -> CliFileProcessResult: + return CliFileProcessResult(filepath=_fp, success=False, error="bad") - monkeypatch.setattr(pipeline, "process_file", _bad_process) + monkeypatch.setattr(core_worker, "process_file", _bad_process) _patch_parallel(monkeypatch) _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) out = capsys.readouterr().out @@ -3739,10 +3845,10 @@ def test_cli_worker_failed( src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") - def _boom(*_args: object, **_kwargs: object) -> cli.ProcessingResult: + def _boom(*_args: object, **_kwargs: object) -> CliFileProcessResult: raise RuntimeError("boom") - monkeypatch.setattr(pipeline, "process_file", _boom) + monkeypatch.setattr(core_worker, "process_file", _boom) _patch_parallel(monkeypatch) with pytest.raises(SystemExit) as exc: _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) @@ -3971,7 +4077,7 @@ def test_cli_batch_result_none_no_progress( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - for idx in range(pipeline._parallel_min_files(2) + 1): + for idx in range(_parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") _patch_fixed_executor(monkeypatch, _FixedFuture(value=None)) @@ -3985,7 +4091,7 @@ def test_cli_batch_result_none_progress( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - for idx in range(pipeline._parallel_min_files(2) + 1): + for idx in range(_parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") _patch_dummy_progress(monkeypatch) @@ -4000,7 +4106,7 @@ def test_cli_failed_batch_item_no_progress( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - for idx in range(pipeline._parallel_min_files(2) + 1): + for idx in range(_parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") _patch_fixed_executor(monkeypatch, _FixedFuture(error=RuntimeError("boom"))) @@ -4014,7 +4120,7 @@ def test_cli_failed_batch_item_progress( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - for idx in range(pipeline._parallel_min_files(2) + 1): + for idx in range(_parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") _patch_dummy_progress(monkeypatch) diff --git a/tests/test_cli_smoke.py b/tests/test_cli_smoke.py index ea3399f..77ec213 100644 --- a/tests/test_cli_smoke.py +++ b/tests/test_cli_smoke.py @@ -33,7 +33,7 @@ def run_cli( [ executable, "-m", - "codeclone.cli", + "codeclone.main", *args, "--processes", "1", diff --git a/tests/test_cli_unit.py b/tests/test_cli_unit.py index 112c528..a84e97e 100644 --- a/tests/test_cli_unit.py +++ b/tests/test_cli_unit.py @@ -11,30 +11,43 @@ import webbrowser from argparse import Namespace from collections.abc import 
Callable +from io import StringIO from pathlib import Path from types import SimpleNamespace -from typing import Any, cast +from typing import Any, TextIO, cast import pytest -import codeclone._cli_baselines as cli_baselines_mod -import codeclone._cli_meta as cli_meta_mod -import codeclone._cli_reports as cli_reports -import codeclone._cli_summary as cli_summary import codeclone.baseline as baseline_mod -import codeclone.cli as cli -import codeclone.metrics_baseline as metrics_baseline_mod -import codeclone.pipeline as pipeline +import codeclone.baseline.metrics_baseline as metrics_baseline_mod +import codeclone.core.worker as core_worker +import codeclone.surfaces.cli.attrs as cli_attrs +import codeclone.surfaces.cli.baseline_state as cli_baselines_mod +import codeclone.surfaces.cli.changed_scope as cli_changed_scope +import codeclone.surfaces.cli.console as cli_console +import codeclone.surfaces.cli.report_meta as cli_meta_mod +import codeclone.surfaces.cli.reports_output as cli_reports +import codeclone.surfaces.cli.runtime as cli_runtime +import codeclone.surfaces.cli.summary as cli_summary +import codeclone.surfaces.cli.tips as cli_tips +import codeclone.surfaces.cli.workflow as cli from codeclone import __version__ from codeclone import ui_messages as ui -from codeclone._cli_args import build_parser -from codeclone._cli_config import ConfigValidationError -from codeclone.cache import Cache -from codeclone.cli import process_file +from codeclone.analysis.normalizer import NormalizationConfig +from codeclone.cache.store import Cache +from codeclone.config.argparse_builder import build_parser +from codeclone.config.pyproject_loader import ConfigValidationError from codeclone.contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL -from codeclone.errors import BaselineValidationError +from codeclone.contracts.errors import BaselineValidationError +from codeclone.core._types import ( + AnalysisResult, + BootstrapResult, + DiscoveryResult, + ProcessingResult, +) +from codeclone.core.reporting import GatingResult +from codeclone.core.worker import process_file from codeclone.models import HealthScore, ProjectMetrics -from codeclone.normalize import NormalizationConfig from tests._assertions import assert_contains_all @@ -46,6 +59,15 @@ def print(self, *objects: object, **kwargs: object) -> None: self.lines.append(" ".join(str(obj) for obj in objects)) +class _TTYStream(StringIO): + def __init__(self, *, is_tty: bool) -> None: + super().__init__() + self._is_tty = is_tty + + def isatty(self) -> bool: + return self._is_tty + + def _metrics_baseline_runtime_for_gate_checks() -> ( cli_baselines_mod._MetricsBaselineRuntime ): @@ -134,7 +156,7 @@ def test_process_file_unexpected_error( def _boom(*_args: object, **_kwargs: object) -> object: raise RuntimeError("boom") - monkeypatch.setattr(pipeline, "extract_units_and_stats_from_source", _boom) + monkeypatch.setattr(core_worker, "extract_units_and_stats_from_source", _boom) result = process_file(str(src), str(tmp_path), NormalizationConfig(), 1, 1) assert result.success is False assert result.error is not None @@ -149,6 +171,174 @@ def test_process_file_success(tmp_path: Path) -> None: assert result.stat is not None +def test_cli_attr_helpers_handle_bool_int_and_path_edges(tmp_path: Path) -> None: + args = SimpleNamespace( + flag="yes", + numeric=True, + broken=3.14, + path_value=tmp_path / "report.json", + invalid_text=123, + ) + + assert cli_attrs.bool_attr(args, "flag") is True + assert cli_attrs.int_attr(args, "numeric", default=7) == 7 + assert 
cli_attrs.int_attr(args, "broken", default=9) == 9 + assert cli_attrs.optional_text_attr(args, "path_value") == str( + tmp_path / "report.json" + ) + assert cli_attrs.optional_text_attr(args, "invalid_text") is None + + +def test_cli_tips_detect_vscode_environment_signals() -> None: + assert cli_tips._is_vscode_environment({"TERM_PROGRAM": "vscode"}) is True + assert cli_tips._is_vscode_environment({"VSCODE_PID": "123"}) is True + assert cli_tips._is_vscode_environment({"TERM_PROGRAM": "xterm-256color"}) is False + + +def test_cli_stream_is_tty_handles_oserror() -> None: + class _BrokenTTY: + def isatty(self) -> bool: + raise OSError("tty unavailable") + + assert cli_tips._stream_is_tty(cast("TextIO", _BrokenTTY())) is False + + +def test_cli_load_tips_state_rejects_invalid_tip_shapes( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + monkeypatch.setattr(cli_tips, "read_json_object", lambda _path: {"tips": []}) + assert cli_tips._load_tips_state(tmp_path / "tips.json") == { + "schema_version": 1, + "tips": {}, + } + + +def test_cli_tip_last_shown_version_rejects_invalid_shapes() -> None: + assert ( + cli_tips._tip_last_shown_version({"tips": []}, tip_key="vscode_extension") == "" + ) + assert ( + cli_tips._tip_last_shown_version( + {"tips": {"vscode_extension": {"last_shown_version": 7}}}, + tip_key="vscode_extension", + ) + == "" + ) + + +def test_cli_vscode_extension_tip_uses_versioned_cache( + tmp_path: Path, +) -> None: + printer = _RecordingPrinter() + args = SimpleNamespace(quiet=False, ci=False) + env = {"TERM_PROGRAM": "vscode"} + cache_path = tmp_path / ".cache" / "codeclone" / "cache.json" + + cli_tips.maybe_print_vscode_extension_tip( + args=args, + console=printer, + codeclone_version=__version__, + cache_path=cache_path, + environ=env, + stream=_TTYStream(is_tty=True), + ) + + assert len(printer.lines) == 1 + assert "VS Code detected" in printer.lines[0] + assert "marketplace.visualstudio.com" in printer.lines[0] + + tips_path = cache_path.parent / "tips.json" + state = json.loads(tips_path.read_text("utf-8")) + assert state["tips"]["vscode_extension"]["last_shown_version"] == __version__ + + shown_again = cli_tips.maybe_print_vscode_extension_tip( + args=args, + console=printer, + codeclone_version=__version__, + cache_path=cache_path, + environ=env, + stream=_TTYStream(is_tty=True), + ) + assert shown_again is False + assert len(printer.lines) == 1 + + shown_for_new_version = cli_tips.maybe_print_vscode_extension_tip( + args=args, + console=printer, + codeclone_version=f"{__version__}.post1", + cache_path=cache_path, + environ=env, + stream=_TTYStream(is_tty=True), + ) + assert shown_for_new_version is True + assert len(printer.lines) == 2 + + +def test_cli_vscode_extension_tip_tolerates_state_write_failure( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + printer = _RecordingPrinter() + args = SimpleNamespace(quiet=False, ci=False) + + def _fail_remember(**_kwargs: object) -> None: + raise OSError("read-only cache") + + monkeypatch.setattr(cli_tips, "_remember_tip_version", _fail_remember) + + shown = cli_tips.maybe_print_vscode_extension_tip( + args=args, + console=printer, + codeclone_version=__version__, + cache_path=tmp_path / ".cache" / "codeclone" / "cache.json", + environ={"TERM_PROGRAM": "vscode"}, + stream=_TTYStream(is_tty=True), + ) + + assert shown is True + assert len(printer.lines) == 1 + + +@pytest.mark.parametrize( + ("args", "env", "isatty"), + [ + (SimpleNamespace(quiet=True, ci=False), {"TERM_PROGRAM": "vscode"}, True), + 
(SimpleNamespace(quiet=False, ci=True), {"TERM_PROGRAM": "vscode"}, True), + ( + SimpleNamespace(quiet=False, ci=False), + {"TERM_PROGRAM": "vscode", "CI": "1"}, + True, + ), + (SimpleNamespace(quiet=False, ci=False), {"TERM_PROGRAM": "vscode"}, False), + ( + SimpleNamespace(quiet=False, ci=False), + {"TERM_PROGRAM": "xterm-256color"}, + True, + ), + ], +) +def test_cli_vscode_extension_tip_respects_context_gates( + tmp_path: Path, + args: SimpleNamespace, + env: dict[str, str], + isatty: bool, +) -> None: + printer = _RecordingPrinter() + effective_env = dict(env) + + shown = cli_tips.maybe_print_vscode_extension_tip( + args=args, + console=printer, + codeclone_version=__version__, + cache_path=tmp_path / ".cache" / "codeclone" / "cache.json", + environ=effective_env, + stream=_TTYStream(is_tty=isatty), + ) + + assert shown is False + assert printer.lines == [] + + def test_cli_module_main_guard(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(sys, "argv", ["codeclone", "--help"]) with pytest.raises(SystemExit) as exc: @@ -230,7 +420,7 @@ def test_cli_help_text_consistency( def test_report_path_origins_distinguish_bare_and_explicit_flags() -> None: - assert cli._report_path_origins( + assert cli_reports._report_path_origins( ( "--html", "--json", @@ -249,7 +439,7 @@ def test_report_path_origins_distinguish_bare_and_explicit_flags() -> None: def test_report_path_origins_stops_at_double_dash() -> None: - assert cli._report_path_origins(("--json=out.json", "--", "--html")) == { + assert cli_reports._report_path_origins(("--json=out.json", "--", "--html")) == { "html": None, "json": "explicit", "md": None, @@ -260,7 +450,7 @@ def test_report_path_origins_stops_at_double_dash() -> None: def test_timestamped_report_path_appends_utc_slug() -> None: path = Path("/tmp/report.html") - assert cli._timestamped_report_path( + assert cli_reports._timestamped_report_path( path, report_generated_at_utc="2026-03-22T21:30:45Z", ) == Path("/tmp/report-20260322T213045Z.html") @@ -400,7 +590,7 @@ def test_validate_changed_scope_args_rejects_invalid_combinations( ) -> None: cli.console = cli._make_console(no_color=True) with pytest.raises(SystemExit) as exc: - cli._validate_changed_scope_args(args=args) + cli_changed_scope._validate_changed_scope_args(args=args) assert exc.value.code == 2 @@ -410,7 +600,7 @@ def test_validate_changed_scope_args_promotes_paths_from_git_diff() -> None: diff_against=None, paths_from_git_diff="HEAD~1", ) - assert cli._validate_changed_scope_args(args=args) == "HEAD~1" + assert cli_changed_scope._validate_changed_scope_args(args=args) == "HEAD~1" assert args.changed_only is True @@ -423,7 +613,7 @@ def test_normalize_changed_paths_relativizes_dedupes_and_sorts(tmp_path: Path) - first.write_text("pass\n", "utf-8") second.write_text("pass\n", "utf-8") - assert cli._normalize_changed_paths( + assert cli_changed_scope._normalize_changed_paths( root_path=root_path, paths=("pkg/b.py", str(second), " pkg/b.py ", ""), ) == ("pkg/a.py", "pkg/b.py") @@ -445,7 +635,11 @@ def _fake_relative_to(self: Path, *other: str | Path) -> Path: monkeypatch.setattr(Path, "relative_to", _fake_relative_to) assert ( - cli._normalize_changed_paths(root_path=root_path, paths=(str(candidate),)) == () + cli_changed_scope._normalize_changed_paths( + root_path=root_path, + paths=(str(candidate),), + ) + == () ) @@ -463,7 +657,10 @@ def _broken_resolve(self: Path, strict: bool = False) -> Path: monkeypatch.setattr(Path, "resolve", _broken_resolve) with pytest.raises(SystemExit) as exc: - 
cli._normalize_changed_paths(root_path=root_path, paths=("broken.py",)) + cli_changed_scope._normalize_changed_paths( + root_path=root_path, + paths=("broken.py",), + ) assert exc.value.code == 2 @@ -476,7 +673,10 @@ def test_normalize_changed_paths_rejects_outside_root(tmp_path: Path) -> None: outside_path.write_text("pass\n", "utf-8") with pytest.raises(SystemExit) as exc: - cli._normalize_changed_paths(root_path=root_path, paths=(str(outside_path),)) + cli_changed_scope._normalize_changed_paths( + root_path=root_path, + paths=(str(outside_path),), + ) assert exc.value.code == 2 @@ -498,7 +698,10 @@ def _run(*args: object, **kwargs: object) -> subprocess.CompletedProcess[str]: ) monkeypatch.setattr(subprocess, "run", _run) - assert cli._git_diff_changed_paths(root_path=root_path, git_diff_ref="HEAD~1") == ( + assert cli_changed_scope._git_diff_changed_paths( + root_path=root_path, + git_diff_ref="HEAD~1", + ) == ( "pkg/a.py", "pkg/b.py", ) @@ -514,14 +717,17 @@ def _run(*args: object, **kwargs: object) -> subprocess.CompletedProcess[str]: monkeypatch.setattr(subprocess, "run", _run) with pytest.raises(SystemExit) as exc: - cli._git_diff_changed_paths(root_path=tmp_path.resolve(), git_diff_ref="HEAD~1") + cli_changed_scope._git_diff_changed_paths( + root_path=tmp_path.resolve(), + git_diff_ref="HEAD~1", + ) assert exc.value.code == 2 def test_git_diff_changed_paths_rejects_option_like_ref(tmp_path: Path) -> None: cli.console = cli._make_console(no_color=True) with pytest.raises(SystemExit) as exc: - cli._git_diff_changed_paths( + cli_changed_scope._git_diff_changed_paths( root_path=tmp_path.resolve(), git_diff_ref="--cached" ) assert exc.value.code == 2 @@ -543,7 +749,7 @@ def test_git_diff_changed_paths_rejects_unsafe_ref_syntax( ) -> None: cli.console = cli._make_console(no_color=True) with pytest.raises(SystemExit) as exc: - cli._git_diff_changed_paths( + cli_changed_scope._git_diff_changed_paths( root_path=tmp_path.resolve(), git_diff_ref=git_diff_ref, ) @@ -551,7 +757,7 @@ def test_git_diff_changed_paths_rejects_unsafe_ref_syntax( def test_report_path_origins_ignores_unrelated_equals_tokens() -> None: - assert cli._report_path_origins(("--unknown=value", "--json=out.json")) == { + assert cli_reports._report_path_origins(("--unknown=value", "--json=out.json")) == { "html": None, "json": "explicit", "md": None, @@ -561,7 +767,7 @@ def test_report_path_origins_ignores_unrelated_equals_tokens() -> None: def test_changed_clone_gate_from_report_filters_changed_scope() -> None: - gate = cli._changed_clone_gate_from_report( + gate = cli_changed_scope._changed_clone_gate_from_report( { "findings": { "groups": { @@ -686,15 +892,32 @@ def test_enforce_gating_rewrites_clone_threshold_for_changed_scope( ) -> None: cli.console = cli._make_console(no_color=True) observed: dict[str, object] = {} + analysis = AnalysisResult( + func_groups={}, + block_groups={}, + block_groups_report={}, + segment_groups={}, + suppressed_segment_groups=0, + block_group_facts={}, + func_clones_count=8, + block_clones_count=0, + segment_clones_count=0, + files_analyzed_or_cached=0, + project_metrics=None, + metrics_payload=None, + suggestions=(), + segment_groups_raw_digest="", + ) - monkeypatch.setattr( - cli, - "gate", - lambda **_kwargs: pipeline.GatingResult( + def _fake_gate(**kwargs: object) -> GatingResult: + gate_analysis = cast("AnalysisResult", kwargs["analysis"]) + observed["clone_threshold_total"] = gate_analysis.func_clones_count + return GatingResult( exit_code=3, - reasons=("clone:threshold:8:1",), - ), - 
) + reasons=("clone:threshold:2:1",), + ) + + monkeypatch.setattr(cli, "gate", _fake_gate) monkeypatch.setattr( cli, "_print_gating_failure_block", @@ -706,8 +929,8 @@ def test_enforce_gating_rewrites_clone_threshold_for_changed_scope( with pytest.raises(SystemExit) as exc: cli._enforce_gating( args=Namespace(fail_threshold=1, verbose=False), - boot=cast("pipeline.BootstrapResult", object()), - analysis=cast("pipeline.AnalysisResult", object()), + boot=cast("BootstrapResult", object()), + analysis=analysis, processing=cast(Any, Namespace(source_read_failures=[])), source_read_contract_failure=False, baseline_failure_code=None, @@ -720,6 +943,7 @@ def test_enforce_gating_rewrites_clone_threshold_for_changed_scope( ) assert exc.value.code == 3 + assert observed["clone_threshold_total"] == 2 assert observed["code"] == "threshold" assert observed["entries"] == ( ("clone_groups_total", 2), @@ -732,15 +956,29 @@ def test_enforce_gating_drops_rewritten_threshold_when_changed_scope_is_within_l ) -> None: cli.console = cli._make_console(no_color=True) observed: dict[str, object] = {} - - monkeypatch.setattr( - cli, - "gate", - lambda **_kwargs: pipeline.GatingResult( - exit_code=3, - reasons=("clone:threshold:8:1",), - ), + analysis = AnalysisResult( + func_groups={}, + block_groups={}, + block_groups_report={}, + segment_groups={}, + suppressed_segment_groups=0, + block_group_facts={}, + func_clones_count=8, + block_clones_count=0, + segment_clones_count=0, + files_analyzed_or_cached=0, + project_metrics=None, + metrics_payload=None, + suggestions=(), + segment_groups_raw_digest="", ) + + def _fake_gate(**kwargs: object) -> GatingResult: + gate_analysis = cast("AnalysisResult", kwargs["analysis"]) + observed["clone_threshold_total"] = gate_analysis.func_clones_count + return GatingResult(exit_code=0, reasons=()) + + monkeypatch.setattr(cli, "gate", _fake_gate) monkeypatch.setattr( cli, "_print_gating_failure_block", @@ -749,8 +987,8 @@ def test_enforce_gating_drops_rewritten_threshold_when_changed_scope_is_within_l cli._enforce_gating( args=Namespace(fail_threshold=5, verbose=False), - boot=cast("pipeline.BootstrapResult", object()), - analysis=cast("pipeline.AnalysisResult", object()), + boot=cast("BootstrapResult", object()), + analysis=analysis, processing=cast(Any, Namespace(source_read_failures=[])), source_read_contract_failure=False, baseline_failure_code=None, @@ -762,7 +1000,7 @@ def test_enforce_gating_drops_rewritten_threshold_when_changed_scope_is_within_l clone_threshold_total=2, ) - assert observed == {} + assert observed == {"clone_threshold_total": 2} def test_main_impl_prints_changed_scope_when_changed_projection_is_available( @@ -861,7 +1099,7 @@ def test_main_impl_prints_changed_scope_when_changed_projection_is_available( monkeypatch.setattr( cli, "_changed_clone_gate_from_report", - lambda _report, changed_paths: cli.ChangedCloneGate( + lambda _report, changed_paths: cli_changed_scope.ChangedCloneGate( changed_paths=tuple(changed_paths), new_func=frozenset(), new_block=frozenset(), @@ -970,6 +1208,23 @@ def test_compact_summary_labels_use_machine_scannable_keys() -> None: == "Metrics cc=2.8/21 cbo=0.6/8 lcom4=1.2/4" " cycles=0 dead_code=1 health=85(B) overloaded_modules=3" ) + assert ( + ui.fmt_summary_compact_dependencies( + avg_depth=4.0, + p95_depth=13, + max_depth=16, + ) + == "Dependencies avg=4.0 p95=13 max=16" + ) + assert ( + ui.fmt_summary_compact_security_surfaces( + items=5, + categories=3, + production=4, + tests=1, + ) + == "Security items=5 categories=3 
production=4 tests=1" + ) assert ( ui.fmt_summary_compact_adoption( param_permille=750, @@ -1015,6 +1270,16 @@ def test_compact_summary_labels_use_machine_scannable_keys() -> None: ui.fmt_coverage_join_ignored("bad xml") == "[warning]Coverage join ignored: bad xml[/warning]" ) + assert ui.fmt_cli_runtime_warning( + "Cache analysis profile mismatch (found min_loc=6, min_stmt=4, " + "collect_api_surface=false; expected min_loc=6, min_stmt=4, " + "collect_api_surface=true); ignoring cache." + ) == ( + " [warning]Cache[/warning] analysis profile mismatch\n" + " [dim]found min_loc=6, min_stmt=4, collect_api_surface=false[/dim]\n" + " [dim]expected min_loc=6, min_stmt=4, collect_api_surface=true[/dim]\n" + " [dim]ignoring cache[/dim]" + ) def test_ui_summary_formatters_cover_optional_branches() -> None: @@ -1039,6 +1304,27 @@ def test_ui_summary_formatters_cover_optional_branches() -> None: assert "[yellow]2[/yellow] fixtures" in clones assert "5 detected" in ui.fmt_metrics_cycles(5) + dependencies = ui.fmt_metrics_dependencies( + avg_depth=4.0, + p95_depth=13, + max_depth=16, + ) + assert_contains_all(dependencies, "avg 4.0", "p95 13", "max 16") + security_surfaces = ui.fmt_metrics_security_surfaces( + items=5, + categories=3, + production=4, + tests=1, + ) + assert_contains_all( + security_surfaces, + "5", + "surfaces", + "3", + "categories", + "production 4", + "tests 1", + ) dead_with_suppressed = ui.fmt_metrics_dead_code(447, suppressed=9) assert "447 found" in dead_with_suppressed assert "(9 suppressed)" in dead_with_suppressed @@ -1200,6 +1486,42 @@ def test_print_metrics_in_quiet_mode_includes_overloaded_modules( assert "Public API" not in out +def test_print_metrics_in_quiet_mode_includes_security_surfaces( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) + cli_summary._print_metrics( + console=cast("cli_summary._Printer", cli.console), + quiet=True, + metrics=cli_summary.MetricsSnapshot( + complexity_avg=2.8, + complexity_max=20, + high_risk_count=0, + coupling_avg=0.5, + coupling_max=9, + cohesion_avg=1.2, + cohesion_max=4, + cycles_count=0, + dead_code_count=0, + health_total=85, + health_grade="B", + security_surfaces_items=5, + security_surfaces_category_count=3, + security_surfaces_production=4, + security_surfaces_tests=1, + ), + ) + out = capsys.readouterr().out + assert_contains_all( + out, + "Security", + "items=5", + "categories=3", + "production=4", + "tests=1", + ) + + def test_print_metrics_in_quiet_mode_includes_adoption_public_api_and_coverage( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: @@ -1219,6 +1541,10 @@ def test_print_metrics_in_quiet_mode_includes_adoption_public_api_and_coverage( dead_code_count=0, health_total=85, health_grade="B", + security_surfaces_items=5, + security_surfaces_category_count=3, + security_surfaces_production=4, + security_surfaces_tests=1, adoption_param_permille=750, adoption_return_permille=500, adoption_docstring_permille=667, @@ -1243,6 +1569,9 @@ def test_print_metrics_in_quiet_mode_includes_adoption_public_api_and_coverage( out = capsys.readouterr().out assert_contains_all( out, + "Security", + "items=5", + "production=4", "Adoption", "params=75.0%", "Public API", @@ -1272,6 +1601,10 @@ def test_print_metrics_in_normal_mode_includes_adoption_public_api_and_coverage( dead_code_count=0, health_total=85, health_grade="B", + security_surfaces_items=5, + security_surfaces_category_count=3, + 
security_surfaces_production=4, + security_surfaces_tests=1, adoption_param_permille=750, adoption_return_permille=500, adoption_docstring_permille=667, @@ -1296,6 +1629,8 @@ def test_print_metrics_in_normal_mode_includes_adoption_public_api_and_coverage( out = capsys.readouterr().out assert_contains_all( out, + "Security", + "5 surfaces", "Adoption", "params 75.0%", "docstrings 66.7%", @@ -1326,7 +1661,7 @@ def test_configure_metrics_mode_rejects_skip_metrics_with_metrics_flags( skip_dependencies=False, ) with pytest.raises(SystemExit) as exc: - cli._configure_metrics_mode(args=args, metrics_baseline_exists=False) + cli_runtime._configure_metrics_mode(args=args, metrics_baseline_exists=False) assert exc.value.code == 2 @@ -1344,7 +1679,7 @@ def test_configure_metrics_mode_forces_dependency_and_dead_code_when_gated() -> skip_dead_code=True, skip_dependencies=True, ) - cli._configure_metrics_mode(args=args, metrics_baseline_exists=True) + cli_runtime._configure_metrics_mode(args=args, metrics_baseline_exists=True) assert args.skip_dead_code is False assert args.skip_dependencies is False @@ -1374,7 +1709,7 @@ def test_configure_metrics_mode_does_not_force_api_surface_for_baseline_update() coverage_xml=None, ) - cli._configure_metrics_mode(args=args, metrics_baseline_exists=True) + cli_runtime._configure_metrics_mode(args=args, metrics_baseline_exists=True) assert args.api_surface is False @@ -1402,7 +1737,7 @@ def test_configure_metrics_mode_forces_api_surface_for_api_break_gate() -> None: coverage_xml=None, ) - cli._configure_metrics_mode(args=args, metrics_baseline_exists=True) + cli_runtime._configure_metrics_mode(args=args, metrics_baseline_exists=True) assert args.api_surface is True @@ -1410,20 +1745,20 @@ def test_configure_metrics_mode_forces_api_surface_for_api_break_gate() -> None: def test_probe_metrics_baseline_section_for_non_object_payload(tmp_path: Path) -> None: path = tmp_path / "baseline.json" path.write_text("[]", "utf-8") - probe = cli._probe_metrics_baseline_section(path) + probe = cli_baselines_mod._probe_metrics_baseline_section(path) assert probe.has_metrics_section is True assert probe.payload is None def test_metrics_computed_respects_skip_switches() -> None: - assert cli._metrics_computed( + assert cli_runtime._metrics_computed( Namespace( skip_metrics=False, skip_dependencies=True, skip_dead_code=True, ) ) == ("complexity", "coupling", "cohesion", "health", "coverage_adoption") - assert cli._metrics_computed( + assert cli_runtime._metrics_computed( Namespace( skip_metrics=False, skip_dependencies=False, @@ -1441,7 +1776,7 @@ def test_metrics_computed_respects_skip_switches() -> None: def test_metrics_computed_includes_api_surface_only_when_enabled() -> None: - assert cli._metrics_computed( + assert cli_runtime._metrics_computed( Namespace( skip_metrics=False, skip_dependencies=True, @@ -1449,7 +1784,7 @@ def test_metrics_computed_includes_api_surface_only_when_enabled() -> None: api_surface=False, ) ) == ("complexity", "coupling", "cohesion", "health", "coverage_adoption") - assert cli._metrics_computed( + assert cli_runtime._metrics_computed( Namespace( skip_metrics=False, skip_dependencies=True, @@ -1467,7 +1802,7 @@ def test_metrics_computed_includes_api_surface_only_when_enabled() -> None: def test_metrics_computed_includes_coverage_join_only_with_xml() -> None: - assert cli._metrics_computed( + assert cli_runtime._metrics_computed( Namespace( skip_metrics=False, skip_dependencies=True, @@ -1476,7 +1811,7 @@ def 
test_metrics_computed_includes_coverage_join_only_with_xml() -> None: coverage_xml=None, ) ) == ("complexity", "coupling", "cohesion", "health", "coverage_adoption") - assert cli._metrics_computed( + assert cli_runtime._metrics_computed( Namespace( skip_metrics=False, skip_dependencies=True, @@ -1498,7 +1833,7 @@ def test_enforce_gating_requires_coverage_input_for_hotspot_gate( monkeypatch: pytest.MonkeyPatch, ) -> None: cli.console = cli._make_console(no_color=True) - monkeypatch.setattr(cli, "gate", lambda **_kwargs: pipeline.GatingResult(0, ())) + monkeypatch.setattr(cli, "gate", lambda **_kwargs: GatingResult(0, ())) with pytest.raises(SystemExit) as exc: cli._enforce_gating( args=Namespace( @@ -1506,7 +1841,7 @@ def test_enforce_gating_requires_coverage_input_for_hotspot_gate( fail_threshold=-1, verbose=False, ), - boot=cast("pipeline.BootstrapResult", object()), + boot=cast("BootstrapResult", object()), analysis=cast(Any, SimpleNamespace(coverage_join=None)), processing=cast(Any, Namespace(source_read_failures=[])), source_read_contract_failure=False, @@ -1524,7 +1859,7 @@ def test_enforce_gating_requires_valid_coverage_input_for_hotspot_gate( monkeypatch: pytest.MonkeyPatch, ) -> None: cli.console = cli._make_console(no_color=True) - monkeypatch.setattr(cli, "gate", lambda **_kwargs: pipeline.GatingResult(0, ())) + monkeypatch.setattr(cli, "gate", lambda **_kwargs: GatingResult(0, ())) with pytest.raises(SystemExit) as exc: cli._enforce_gating( args=Namespace( @@ -1532,7 +1867,7 @@ def test_enforce_gating_requires_valid_coverage_input_for_hotspot_gate( fail_threshold=-1, verbose=False, ), - boot=cast("pipeline.BootstrapResult", object()), + boot=cast("BootstrapResult", object()), analysis=cast( Any, SimpleNamespace( @@ -1601,8 +1936,8 @@ def _resolve(self: Path, *, strict: bool = False) -> Path: assert os.environ.get("CODECLONE_DEBUG") == "1" -def _stub_discovery_result() -> pipeline.DiscoveryResult: - return pipeline.DiscoveryResult( +def _stub_discovery_result() -> DiscoveryResult: + return DiscoveryResult( files_found=0, cache_hits=0, files_skipped=0, @@ -1619,8 +1954,8 @@ def _stub_discovery_result() -> pipeline.DiscoveryResult: ) -def _stub_processing_result() -> pipeline.ProcessingResult: - return pipeline.ProcessingResult( +def _stub_processing_result() -> ProcessingResult: + return ProcessingResult( units=(), blocks=(), segments=(), @@ -1642,8 +1977,8 @@ def _stub_processing_result() -> pipeline.ProcessingResult: def _stub_analysis_result( *, project_metrics: ProjectMetrics | None = None, -) -> pipeline.AnalysisResult: - return pipeline.AnalysisResult( +) -> AnalysisResult: + return AnalysisResult( func_groups={}, block_groups={}, block_groups_report={}, @@ -1808,7 +2143,7 @@ def test_main_impl_prints_metric_gate_reasons_and_exits_gating_failure( monkeypatch.setattr( cli, "gate", - lambda **_kwargs: pipeline.GatingResult( + lambda **_kwargs: GatingResult( exit_code=3, reasons=( "metric:Health score regressed vs metrics baseline: delta=-1.", @@ -2091,11 +2426,11 @@ def test_main_impl_ci_enables_fail_on_new_metrics_when_metrics_baseline_loaded( observed: dict[str, bool] = {} - def _capture_gate(**kwargs: object) -> pipeline.GatingResult: + def _capture_gate(**kwargs: object) -> GatingResult: boot = kwargs["boot"] - assert isinstance(boot, pipeline.BootstrapResult) + assert isinstance(boot, BootstrapResult) observed["fail_on_new_metrics"] = bool(boot.args.fail_on_new_metrics) - return pipeline.GatingResult(exit_code=0, reasons=()) + return GatingResult(exit_code=0, reasons=()) 
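+    # _capture_gate stands in for cli.gate: it only records the effective
+    # fail_on_new_metrics flag from the bootstrap args and returns a passing
+    # GatingResult, so the test can assert that --ci enabled the flag once a
+    # metrics baseline was loaded, without running the real gating logic.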
monkeypatch.setattr(cli, "gate", _capture_gate) monkeypatch.setattr( @@ -2119,7 +2454,7 @@ def _capture_gate(**kwargs: object) -> pipeline.GatingResult: def test_print_verbose_clone_hashes_noop_on_empty() -> None: printer = _RecordingPrinter() - cli._print_verbose_clone_hashes( + cli_console._print_verbose_clone_hashes( printer, label="Function clone hashes", clone_hashes=set(), @@ -2129,7 +2464,7 @@ def test_print_verbose_clone_hashes_noop_on_empty() -> None: def test_print_verbose_clone_hashes_prints_sorted_values() -> None: printer = _RecordingPrinter() - cli._print_verbose_clone_hashes( + cli_console._print_verbose_clone_hashes( printer, label="Block clone hashes", clone_hashes={"b-hash", "a-hash"}, diff --git a/tests/test_codex_plugin.py b/tests/test_codex_plugin.py index 806878e..a525cee 100644 --- a/tests/test_codex_plugin.py +++ b/tests/test_codex_plugin.py @@ -12,20 +12,30 @@ def test_codex_plugin_manifest_is_consistent() -> None: root = Path(__file__).resolve().parents[1] plugin_root = root / "plugins" / "codeclone" manifest = _load_json(plugin_root / ".codex-plugin" / "plugin.json") + marketplace = _load_json(root / ".agents" / "plugins" / "marketplace.json") assert isinstance(manifest, dict) + assert manifest["name"] == plugin_root.name assert manifest["name"] == "codeclone" - assert manifest["version"] == "2.0.0-b5.0" + assert manifest["version"] == "2.0.0-b6.0" assert manifest["skills"] == "./skills/" assert manifest["mcpServers"] == "./.mcp.json" assert manifest["license"] == "MPL-2.0" assert manifest["homepage"] == "https://orenlab.github.io/codeclone/codex-plugin/" + assert isinstance(marketplace, dict) + assert marketplace["plugins"][0]["name"] == manifest["name"] interface = manifest["interface"] assert isinstance(interface, dict) - assert interface["displayName"] == "CodeClone" - assert interface["category"] == "Developer Tools" - assert interface["websiteURL"] == manifest["homepage"] + assert { + "displayName": interface["displayName"], + "category": interface["category"], + "websiteURL": interface["websiteURL"], + } == { + "displayName": "CodeClone", + "category": "Developer Tools", + "websiteURL": manifest["homepage"], + } assert ( interface["privacyPolicyURL"] == "https://orenlab.github.io/codeclone/privacy-policy/" @@ -71,13 +81,11 @@ def test_codex_plugin_marketplace_and_mcp_config_are_aligned() -> None: assert isinstance(mcp_config, dict) server = mcp_config["mcpServers"]["codeclone"] - assert server["command"] == "sh" - assert server["args"][0] == "-lc" - launcher = server["args"][1] - assert "$PWD/.venv/bin/codeclone-mcp" in launcher - assert "poetry env info -p" in launcher - assert "exec codeclone-mcp --transport stdio" in launcher - assert "PATH entry" in launcher + assert server["command"] == "python3" + assert server["args"] == ["./scripts/launch_mcp"] + assert (plugin_root / "scripts" / "launch_mcp").is_file() + assert (plugin_root / "scripts" / "launch_mcp.py").is_file() + assert (root / "scripts" / "launch_mcp").is_file() def test_codex_plugin_skill_exists() -> None: @@ -126,7 +134,8 @@ def test_codex_plugin_readme_and_docs_exist() -> None: assert "does not rewrite `~/.codex/config.toml`" in readme_text assert "The plugin prefers a workspace launcher first" in readme_text assert "the current Poetry environment launcher" in readme_text - assert 'uv tool install "codeclone[mcp]>=2.0.0b4"' in readme_text + assert "without relying on `sh -lc`" in readme_text + assert 'uv tool install --pre "codeclone[mcp]"' in readme_text assert (root / "docs" / 
"codex-plugin.md").is_file() assert (root / "docs" / "terms-of-use.md").is_file() diff --git a/tests/test_codex_plugin_launcher.py b/tests/test_codex_plugin_launcher.py new file mode 100644 index 0000000..fbe0ebd --- /dev/null +++ b/tests/test_codex_plugin_launcher.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +import importlib.util +import subprocess +import sys +from pathlib import Path +from types import ModuleType + + +def _load_launcher_module() -> ModuleType: + root = Path(__file__).resolve().parents[1] + path = root / "plugins" / "codeclone" / "scripts" / "launch_mcp.py" + spec = importlib.util.spec_from_file_location( + "codeclone_codex_plugin_launcher", + path, + ) + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +launcher_mod = _load_launcher_module() + + +def test_workspace_roots_keep_workspace_root_first() -> None: + repo_root = Path("/repo") + roots = launcher_mod.workspace_roots( + env={ + "CODECLONE_WORKSPACE_ROOT": "/workspace/current", + "PWD": "/workspace/current", + }, + cwd="/workspace/plugin", + repo_root=repo_root, + ) + assert roots == ( + Path("/workspace/current"), + Path("/workspace/plugin"), + repo_root, + ) + + +def test_resolve_launch_target_prefers_workspace_local_launcher(tmp_path: Path) -> None: + workspace_root = tmp_path / "workspace" + launcher_path = launcher_mod.workspace_local_launcher_candidates(workspace_root)[0] + launcher_path.parent.mkdir(parents=True, exist_ok=True) + launcher_path.write_text("", encoding="utf-8") + + target = launcher_mod.resolve_launch_target( + env={"PWD": str(workspace_root)}, + cwd=str(workspace_root), + repo_root=workspace_root, + which=lambda _name: "/usr/local/bin/codeclone-mcp", + ) + + assert target == launcher_mod.LaunchTarget( + command=str(launcher_path), + source="workspaceLocal", + workspace_root=workspace_root, + ) + + +def test_resolve_launch_target_prefers_poetry_before_path(tmp_path: Path) -> None: + workspace_root = tmp_path / "workspace" + poetry_root = tmp_path / "poetry-env" + poetry_launcher = poetry_root / "bin" / "codeclone-mcp" + poetry_launcher.parent.mkdir(parents=True, exist_ok=True) + poetry_launcher.write_text("", encoding="utf-8") + (workspace_root / "pyproject.toml").parent.mkdir(parents=True, exist_ok=True) + (workspace_root / "pyproject.toml").write_text( + "[project]\nname='demo'\n", encoding="utf-8" + ) + + def fake_run(*_args: object, **kwargs: object) -> subprocess.CompletedProcess[str]: + assert kwargs["cwd"] == str(workspace_root) + return subprocess.CompletedProcess( + args=["poetry", "env", "info", "-p"], + returncode=0, + stdout=str(poetry_root), + stderr="", + ) + + target = launcher_mod.resolve_launch_target( + env={"PWD": str(workspace_root)}, + cwd=str(workspace_root), + repo_root=workspace_root, + run_cmd=fake_run, + which=lambda name: ( + "/usr/local/bin/poetry" + if name == "poetry" + else "/usr/local/bin/codeclone-mcp" + ), + ) + + assert target == launcher_mod.LaunchTarget( + command=str(poetry_launcher), + source="poetryEnv", + workspace_root=workspace_root, + ) + + +def test_build_setup_message_is_actionable() -> None: + assert "workspace .venv launcher" in launcher_mod.build_setup_message() + assert "Poetry environment launcher" in launcher_mod.build_setup_message() + assert "PATH entry" in launcher_mod.build_setup_message() diff --git a/tests/test_coerce.py b/tests/test_coerce.py index 9b7b0c0..d8d034a 100644 --- 
a/tests/test_coerce.py +++ b/tests/test_coerce.py @@ -8,7 +8,7 @@ from collections.abc import Mapping, Sequence -from codeclone import _coerce +from codeclone.utils import coerce as _coerce def test_as_int_handles_bool_int_str_and_default() -> None: diff --git a/tests/test_core_branch_coverage.py b/tests/test_core_branch_coverage.py index b26fdb6..77a3c8a 100644 --- a/tests/test_core_branch_coverage.py +++ b/tests/test_core_branch_coverage.py @@ -14,29 +14,49 @@ import orjson import pytest -import codeclone.cli as cli -import codeclone.pipeline as pipeline -from codeclone._cli_gating import policy_context -from codeclone.cache import ( - Cache, - CacheEntry, - SegmentReportProjection, +import codeclone.core.discovery as core_discovery +import codeclone.core.pipeline as core_pipeline +import codeclone.surfaces.cli.console as cli_console +import codeclone.surfaces.cli.workflow as cli +from codeclone.analysis.normalizer import NormalizationConfig +from codeclone.cache._canonicalize import ( _as_file_stat_dict, - _as_risk_literal, + _has_cache_entry_container_shape, +) +from codeclone.cache._validators import _is_dead_candidate_dict +from codeclone.cache._wire_decode import ( _decode_wire_file_entry, _decode_wire_structural_findings_optional, _decode_wire_structural_group, _decode_wire_structural_occurrence, _decode_wire_structural_signature, _decode_wire_unit, - _encode_wire_file_entry, - _has_cache_entry_container_shape, - _is_dead_candidate_dict, +) +from codeclone.cache._wire_encode import _encode_wire_file_entry +from codeclone.cache.entries import CacheEntry, SourceStatsDict, _as_risk_literal +from codeclone.cache.projection import ( + SegmentReportProjection, build_segment_report_projection, + decode_segment_report_projection, +) +from codeclone.cache.store import Cache, file_stat_signature +from codeclone.contracts.errors import CacheError +from codeclone.core._types import ( + AnalysisResult, + BootstrapResult, + DiscoveryResult, + OutputPaths, + ProcessingResult, + _coerce_segment_report_projection, + _segment_groups_digest, ) -from codeclone.cache_segments import decode_segment_report_projection -from codeclone.errors import CacheError -from codeclone.grouping import build_segment_groups +from codeclone.core.discovery import discover +from codeclone.core.discovery_cache import ( + _cache_entry_source_stats, + decode_cached_structural_finding_group, +) +from codeclone.core.pipeline import analyze +from codeclone.findings.clones.grouping import build_segment_groups from codeclone.models import ( BlockUnit, ClassMetrics, @@ -45,7 +65,7 @@ ModuleDep, SegmentUnit, ) -from codeclone.normalize import NormalizationConfig +from codeclone.report.gates.reasons import policy_context from tests._assertions import assert_contains_all @@ -484,7 +504,7 @@ def test_pipeline_analyze_uses_cached_segment_projection( "size": 6, } raw_groups = build_segment_groups((seg_item_a, seg_item_b)) - digest = pipeline._segment_groups_digest(raw_groups) + digest = _segment_groups_digest(raw_groups) cached_projection = { "digest": digest, "suppressed": 7, @@ -522,9 +542,9 @@ def _must_not_run( ) -> tuple[dict[str, list[dict[str, object]]], int]: raise AssertionError("prepare_segment_report_groups must not be called") - monkeypatch.setattr(pipeline, "prepare_segment_report_groups", _must_not_run) + monkeypatch.setattr(core_pipeline, "prepare_segment_report_groups", _must_not_run) - boot = pipeline.BootstrapResult( + boot = BootstrapResult( root=Path("."), config=NormalizationConfig(), args=Namespace( @@ -535,10 
+555,10 @@ def _must_not_run( min_stmt=1, processes=1, ), - output_paths=pipeline.OutputPaths(), + output_paths=OutputPaths(), cache_path=Path("cache.json"), ) - discovery = pipeline.DiscoveryResult( + discovery = DiscoveryResult( files_found=0, cache_hits=0, files_skipped=0, @@ -553,11 +573,10 @@ def _must_not_run( files_to_process=(), skipped_warnings=(), cached_segment_report_projection=cast( - "SegmentReportProjection", - cached_projection, + SegmentReportProjection, cached_projection ), ) - processing = pipeline.ProcessingResult( + processing = ProcessingResult( units=(), blocks=(), segments=(seg_item_a, seg_item_b), @@ -575,30 +594,90 @@ def _must_not_run( source_read_failures=(), ) - result = pipeline.analyze(boot=boot, discovery=discovery, processing=processing) + result = analyze(boot=boot, discovery=discovery, processing=processing) assert result.suppressed_segment_groups == 7 assert result.segment_groups == cached_projection["groups"] assert result.segment_groups_raw_digest == digest def test_pipeline_coerce_segment_projection_invalid_shapes() -> None: - assert pipeline._coerce_segment_report_projection("bad") is None + assert _coerce_segment_report_projection("bad") is None assert ( - pipeline._coerce_segment_report_projection( - {"digest": 1, "suppressed": 0, "groups": {}} - ) + _coerce_segment_report_projection({"digest": 1, "suppressed": 0, "groups": {}}) is None ) assert ( - pipeline._coerce_segment_report_projection( + _coerce_segment_report_projection( {"digest": "d", "suppressed": 0, "groups": {"k": "bad"}} ) is None ) + assert ( + _coerce_segment_report_projection( + { + "digest": "d", + "suppressed": 0, + "groups": {"k": [{"segment_hash": "h", "segment_sig": "s"}]}, + } + ) + is None + ) + + assert ( + _coerce_segment_report_projection( + { + "digest": "d", + "suppressed": 0, + "groups": {"k": ["bad-item"]}, + } + ) + is None + ) + + +def test_pipeline_coerce_segment_projection_valid_group_items() -> None: + projection = _coerce_segment_report_projection( + { + "digest": "digest", + "suppressed": 2, + "groups": { + "sig-1": [ + { + "segment_hash": "hash-1", + "segment_sig": "sig-1", + "filepath": "pkg/mod.py", + "qualname": "pkg.mod:run", + "start_line": 10, + "end_line": 16, + "size": 6, + } + ] + }, + } + ) + + assert projection == { + "digest": "digest", + "suppressed": 2, + "groups": { + "sig-1": [ + { + "segment_hash": "hash-1", + "segment_sig": "sig-1", + "filepath": "pkg/mod.py", + "qualname": "pkg.mod:run", + "start_line": 10, + "end_line": 16, + "size": 6, + } + ] + }, + } + def test_pipeline_analyze_tracks_suppressed_dead_code_candidates() -> None: - boot = pipeline.BootstrapResult( + boot = BootstrapResult( root=Path("."), config=NormalizationConfig(), args=Namespace( @@ -609,10 +688,10 @@ def test_pipeline_analyze_tracks_suppressed_dead_code_candidates() -> None: min_stmt=1, processes=1, ), - output_paths=pipeline.OutputPaths(), + output_paths=OutputPaths(), cache_path=Path("cache.json"), ) - discovery = pipeline.DiscoveryResult( + discovery = DiscoveryResult( files_found=1, cache_hits=0, files_skipped=0, @@ -627,7 +706,7 @@ def test_pipeline_analyze_tracks_suppressed_dead_code_candidates() -> None: files_to_process=(), skipped_warnings=(), ) - processing = pipeline.ProcessingResult( + processing = ProcessingResult( units=(), blocks=(), segments=(), @@ -655,7 +734,7 @@ def test_pipeline_analyze_tracks_suppressed_dead_code_candidates() -> None: source_read_failures=(), ) - result = pipeline.analyze(boot=boot, discovery=discovery, processing=processing) + 
result = analyze(boot=boot, discovery=discovery, processing=processing) assert result.project_metrics is not None assert result.project_metrics.dead_code == () assert result.suppressed_dead_code_items == 1 @@ -672,7 +751,7 @@ def test_pipeline_analyze_tracks_suppressed_dead_code_candidates() -> None: def test_pipeline_decode_cached_structural_group() -> None: - decoded = pipeline._decode_cached_structural_finding_group( + decoded = decode_cached_structural_finding_group( { "finding_kind": "duplicated_branches", "finding_key": "k", @@ -690,7 +769,7 @@ def _discover_with_single_cached_entry( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, cached_entry: dict[str, object], -) -> pipeline.DiscoveryResult: +) -> DiscoveryResult: source = tmp_path / "a.py" source.write_text("def f():\n return 1\n", "utf-8") filepath = str(source) @@ -701,16 +780,68 @@ class _FakeCache: def get_file_entry(self, _path: str) -> dict[str, object]: return cache_entry - boot = pipeline.BootstrapResult( + def prune_file_entries(self, existing_filepaths: object) -> int: + return 0 + + boot = BootstrapResult( root=tmp_path, config=NormalizationConfig(), args=Namespace(skip_metrics=False, min_loc=1, min_stmt=1, processes=1), - output_paths=pipeline.OutputPaths(), + output_paths=OutputPaths(), cache_path=tmp_path / "cache.json", ) - monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: [filepath]) - monkeypatch.setattr(pipeline, "file_stat_signature", lambda _path: stat) - return pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) + monkeypatch.setattr(core_discovery, "iter_py_files", lambda _root: [filepath]) + monkeypatch.setattr(core_discovery, "file_stat_signature", lambda _path: stat) + return discover(boot=boot, cache=cast(Cache, _FakeCache())) + + +def test_discover_prunes_deleted_cache_entries(tmp_path: Path) -> None: + live = tmp_path / "a.py" + stale = tmp_path / "stale.py" + live.write_text("def f():\n return 1\n", "utf-8") + + cache_path = tmp_path / "cache.json" + cache = Cache(cache_path, root=tmp_path) + cache.put_file_entry( + str(live), + file_stat_signature(str(live)), + [], + [], + [], + source_stats=SourceStatsDict(lines=2, functions=1, methods=0, classes=0), + ) + cache.put_file_entry( + str(stale), + {"mtime_ns": 1, "size": 1}, + [], + [], + [], + source_stats=SourceStatsDict(lines=0, functions=0, methods=0, classes=0), + ) + cache.save() + + loaded = Cache(cache_path, root=tmp_path) + loaded.load() + boot = BootstrapResult( + root=tmp_path, + config=NormalizationConfig(), + args=Namespace(skip_metrics=False, min_loc=1, min_stmt=1, processes=1), + output_paths=OutputPaths(), + cache_path=cache_path, + ) + + result = discover(boot=boot, cache=loaded) + + assert result.files_found == 1 + assert result.cache_hits == 1 + assert result.files_to_process == () + assert str(stale) not in loaded.data["files"] + + loaded.save() + + reloaded = Cache(cache_path, root=tmp_path) + reloaded.load() + assert str(stale) not in reloaded.data["files"] @pytest.mark.parametrize( @@ -830,9 +961,9 @@ def test_pipeline_discover_cache_admission_branches( def test_pipeline_cached_source_stats_helper_invalid_shapes() -> None: - assert pipeline._cache_entry_source_stats(cast(CacheEntry, {})) is None + assert _cache_entry_source_stats(cast(CacheEntry, {})) is None assert ( - pipeline._cache_entry_source_stats( + _cache_entry_source_stats( cast( CacheEntry, { @@ -850,40 +981,53 @@ def test_pipeline_cached_source_stats_helper_invalid_shapes() -> None: def test_cli_metric_reason_parser_and_policy_context() -> None: - 
assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "New high-risk functions vs metrics baseline: 1." ) == ("new_high_risk_functions", "1") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "New high-coupling classes vs metrics baseline: 2." ) == ("new_high_coupling_classes", "2") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "New dependency cycles vs metrics baseline: 3." ) == ("new_dependency_cycles", "3") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "New dead code items vs metrics baseline: 4." ) == ("new_dead_code_items", "4") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "Health score regressed vs metrics baseline: delta=-7." ) == ("health_delta", "-7") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( + "Typing coverage regressed vs metrics baseline: " + "params_delta=-2, returns_delta=-1." + ) == ("typing_coverage_delta", "-2 (returns_delta=-1)") + assert cli_console._parse_metric_reason_entry( + "Docstring coverage regressed vs metrics baseline: delta=-3." + ) == ("docstring_coverage_delta", "-3") + assert cli_console._parse_metric_reason_entry( + "Public API breaking changes vs metrics baseline: 5." + ) == ("api_breaking_changes", "5") + assert cli_console._parse_metric_reason_entry( + "Coverage hotspots detected: hotspots=2, threshold=50." + ) == ("coverage_hotspots", "2 (threshold=50)") + assert cli_console._parse_metric_reason_entry( "Dependency cycles detected: 3 cycle(s)." ) == ("dependency_cycles", "3") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "Dead code detected (high confidence): 2 item(s)." ) == ("dead_code_items", "2") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "Complexity threshold exceeded: max=11, threshold=10." ) == ("complexity_max", "11 (threshold=10)") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "Coupling threshold exceeded: max=12, threshold=9." ) == ("coupling_max", "12 (threshold=9)") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "Cohesion threshold exceeded: max=13, threshold=8." ) == ("cohesion_max", "13 (threshold=8)") - assert cli._parse_metric_reason_entry( + assert cli_console._parse_metric_reason_entry( "Health score below threshold: score=70, threshold=80." 
) == ("health_score", "70 (threshold=80)") - assert cli._parse_metric_reason_entry("custom reason.") == ( + assert cli_console._parse_metric_reason_entry("custom reason.") == ( "detail", "custom reason", ) @@ -924,18 +1068,18 @@ def test_cli_run_analysis_stages_handles_cache_save_error( monkeypatch: pytest.MonkeyPatch, ) -> None: args = Namespace(quiet=False, no_progress=False, skip_metrics=True) - boot = pipeline.BootstrapResult( + boot = BootstrapResult( root=Path("."), config=NormalizationConfig(), args=args, - output_paths=pipeline.OutputPaths(), + output_paths=OutputPaths(), cache_path=Path("cache.json"), ) monkeypatch.setattr( cli, "discover", - lambda **_kwargs: pipeline.DiscoveryResult( + lambda **_kwargs: DiscoveryResult( files_found=0, cache_hits=0, files_skipped=0, @@ -954,7 +1098,7 @@ def test_cli_run_analysis_stages_handles_cache_save_error( monkeypatch.setattr( cli, "process", - lambda **_kwargs: pipeline.ProcessingResult( + lambda **_kwargs: ProcessingResult( units=(), blocks=(), segments=(), @@ -975,7 +1119,7 @@ def test_cli_run_analysis_stages_handles_cache_save_error( monkeypatch.setattr( cli, "analyze", - lambda **_kwargs: pipeline.AnalysisResult( + lambda **_kwargs: AnalysisResult( func_groups={}, block_groups={}, block_groups_report={}, diff --git a/tests/test_coverage_edges.py b/tests/test_coverage_edges.py new file mode 100644 index 0000000..16b3d44 --- /dev/null +++ b/tests/test_coverage_edges.py @@ -0,0 +1,249 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import argparse +import ast +import operator +from typing import Any, cast + +import pytest + +import codeclone.analysis as analysis_mod +import codeclone.analysis.units as units_mod +import codeclone.config.argparse_builder as argparse_builder_mod +import codeclone.config.spec as spec_mod +import codeclone.report.gates.evaluator as evaluator_mod +import codeclone.surfaces.cli.console as cli_console_mod +import codeclone.surfaces.cli.state as cli_state_mod +from codeclone.analysis.normalizer import NormalizationConfig +from codeclone.config.spec import OptionSpec +from codeclone.contracts.errors import ParseError +from codeclone.report.gates.evaluator import MetricGateConfig +from codeclone.utils.git_diff import validate_git_diff_ref + + +def _report_document() -> dict[str, object]: + return { + "findings": { + "groups": { + "clones": { + "functions": [{"id": "clone:function:new", "novelty": "new"}], + "blocks": [], + } + } + }, + "metrics": { + "families": { + "complexity": {"summary": {"max": 30}}, + "coupling": {"summary": {"max": 12}}, + "cohesion": {"summary": {"max": 4}}, + "dependencies": {"summary": {"cycles": 0}}, + "dead_code": {"summary": {"high_confidence": 1}}, + "health": {"summary": {"score": 90}}, + "coverage_adoption": { + "summary": { + "param_permille": 1000, + "docstring_permille": 1000, + "param_delta": 0, + "return_delta": 0, + "docstring_delta": 0, + } + }, + "api_surface": {"summary": {"breaking": 0}}, + "coverage_join": {"summary": {"status": "", "coverage_hotspots": 0}}, + } + }, + } + + +def test_analysis_module_exports_extract_units_directly() -> None: + assert ( + analysis_mod.extract_units_and_stats_from_source + is units_mod.extract_units_and_stats_from_source + ) + with pytest.raises(AttributeError, match="has no 
attribute 'missing'"): + operator.attrgetter("missing")(analysis_mod) + + +def test_extract_units_rejects_non_module_ast_root( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(units_mod, "_parse_with_limits", lambda *_args: ast.Pass()) + with pytest.raises(ParseError, match="expected module AST root"): + units_mod.extract_units_and_stats_from_source( + source="pass\n", + filepath="pkg/mod.py", + module_name="pkg.mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + +def test_cli_state_initializes_console_once_and_allows_override( + monkeypatch: pytest.MonkeyPatch, +) -> None: + sentinel = object() + monkeypatch.setattr(cli_state_mod, "console", None) + monkeypatch.setattr(cli_console_mod, "make_plain_console", lambda: sentinel) + + assert cli_state_mod.get_console() is sentinel + assert cli_state_mod.get_console() is sentinel + + replacement = object() + cli_state_mod.set_console(replacement) + assert cli_state_mod.get_console() is replacement + + +def test_validate_git_diff_ref_rejects_control_whitespace_characters() -> None: + with pytest.raises(ValueError, match="whitespace and control characters"): + validate_git_diff_ref("main\tHEAD") + + +def test_validate_git_diff_ref_rejects_empty_value() -> None: + with pytest.raises(ValueError, match="must not be empty"): + validate_git_diff_ref("") + + +def test_add_option_rejects_unsupported_cli_kind() -> None: + parser = argparse.ArgumentParser() + group = parser.add_argument_group("Example") + option = OptionSpec( + dest="broken", + group="Example", + cli_kind=cast(Any, "broken-kind"), + flags=("--broken",), + ) + + with pytest.raises(RuntimeError, match="Unsupported CLI option kind"): + argparse_builder_mod._add_option(group, option=option, version="2.0.0") + + +def test_config_spec_option_supports_explicit_pyproject_key_and_conflict_guard( + monkeypatch: pytest.MonkeyPatch, +) -> None: + explicit = spec_mod._option( + dest="baseline_path", + group="Example", + pyproject_type=str, + pyproject_key="baseline-file", + ) + assert explicit.pyproject_key == "baseline-file" + + monkeypatch.setattr( + spec_mod, + "OPTIONS", + ( + spec_mod._option( + dest="first", + group="Example", + pyproject_type=str, + pyproject_key="shared", + ), + spec_mod._option( + dest="second", + group="Example", + pyproject_type=int, + pyproject_key="shared", + ), + ), + ) + with pytest.raises(RuntimeError, match="Conflicting pyproject spec for shared"): + spec_mod._build_pyproject_specs() + + +def test_summarize_metrics_diff_accepts_mapping_payload() -> None: + summary = evaluator_mod.summarize_metrics_diff( + { + "new_high_risk_functions": 2, + "new_high_coupling_classes": 3, + "new_cycles": 4, + "new_dead_code": 5, + "health_delta": -2, + "typing_param_permille_delta": -100, + "typing_return_permille_delta": -200, + "docstring_permille_delta": -300, + "new_api_breaking_changes": 7, + } + ) + + assert summary == { + "new_high_risk_functions": 2, + "new_high_coupling_classes": 3, + "new_cycles": 4, + "new_dead_code": 5, + "health_delta": -2, + "typing_param_permille_delta": -100, + "typing_return_permille_delta": -200, + "docstring_permille_delta": -300, + "new_api_symbols": 0, + "api_breaking_changes": 7, + } + + +def test_metric_gate_reasons_wrapper_uses_report_document_snapshot() -> None: + reasons = evaluator_mod.metric_gate_reasons( + report_document=_report_document(), + config=MetricGateConfig( + fail_complexity=20, + fail_coupling=-1, + fail_cohesion=-1, + fail_cycles=False, + fail_dead_code=True, + fail_health=-1, + 
fail_on_new_metrics=False, + fail_on_typing_regression=False, + fail_on_docstring_regression=False, + fail_on_api_break=False, + fail_on_untested_hotspots=False, + min_typing_coverage=-1, + min_docstring_coverage=-1, + coverage_min=50, + fail_on_new=True, + fail_threshold=0, + ), + metrics_diff={"new_dead_code": 1, "health_delta": -1}, + ) + + assert "Complexity threshold exceeded: max CC=30, threshold=20." in reasons + assert "Dead code detected (high confidence): 1 item(s)." in reasons + assert "New dead code items vs metrics baseline: 1." not in reasons + + +def test_metric_gate_reasons_for_state_skips_missing_builder( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delitem( + evaluator_mod._GATE_REASON_BUILDERS, + "complexity_threshold", + raising=False, + ) + + reasons = evaluator_mod.metric_gate_reasons_for_state( + state=evaluator_mod.GateState(complexity_max=10), + config=MetricGateConfig( + fail_complexity=5, + fail_coupling=-1, + fail_cohesion=-1, + fail_cycles=False, + fail_dead_code=False, + fail_health=-1, + fail_on_new_metrics=False, + fail_on_typing_regression=False, + fail_on_docstring_regression=False, + fail_on_api_break=False, + fail_on_untested_hotspots=False, + min_typing_coverage=-1, + min_docstring_coverage=-1, + coverage_min=50, + fail_on_new=False, + fail_threshold=-1, + ), + ) + + assert reasons == () diff --git a/tests/test_defaults_contract.py b/tests/test_defaults_contract.py new file mode 100644 index 0000000..0515e04 --- /dev/null +++ b/tests/test_defaults_contract.py @@ -0,0 +1,138 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import inspect +from pathlib import Path + +from codeclone.baseline.trust import MAX_BASELINE_SIZE_BYTES +from codeclone.cache.versioning import MAX_CACHE_SIZE_BYTES +from codeclone.config import spec as spec_mod +from codeclone.config.argparse_builder import build_parser +from codeclone.contracts import ( + DEFAULT_BASELINE_PATH, + DEFAULT_BLOCK_MIN_LOC, + DEFAULT_BLOCK_MIN_STMT, + DEFAULT_COVERAGE_MIN, + DEFAULT_MAX_BASELINE_SIZE_MB, + DEFAULT_MAX_CACHE_SIZE_MB, + DEFAULT_MIN_LOC, + DEFAULT_MIN_STMT, + DEFAULT_PROCESSES, + DEFAULT_ROOT, + DEFAULT_SEGMENT_MIN_LOC, + DEFAULT_SEGMENT_MIN_STMT, + HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER, + HEALTH_DEPENDENCY_DEPTH_P95_MARGIN, +) +from codeclone.core._types import DEFAULT_RUNTIME_PROCESSES +from codeclone.metrics import health as health_mod +from codeclone.report.gates.evaluator import MetricGateConfig +from codeclone.report.html.sections import _dependencies as html_dependencies_mod +from codeclone.surfaces.mcp import server as mcp_server +from codeclone.surfaces.mcp.service import CodeCloneMCPService +from codeclone.surfaces.mcp.session import MCPAnalysisRequest, MCPGateRequest + + +def test_config_spec_reexports_shared_runtime_defaults() -> None: + assert spec_mod.DEFAULT_ROOT == DEFAULT_ROOT + assert spec_mod.DEFAULT_MIN_LOC == DEFAULT_MIN_LOC + assert spec_mod.DEFAULT_MIN_STMT == DEFAULT_MIN_STMT + assert spec_mod.DEFAULT_BLOCK_MIN_LOC == DEFAULT_BLOCK_MIN_LOC + assert spec_mod.DEFAULT_BLOCK_MIN_STMT == DEFAULT_BLOCK_MIN_STMT + assert spec_mod.DEFAULT_SEGMENT_MIN_LOC == DEFAULT_SEGMENT_MIN_LOC + assert spec_mod.DEFAULT_SEGMENT_MIN_STMT == DEFAULT_SEGMENT_MIN_STMT + assert spec_mod.DEFAULT_PROCESSES == 
DEFAULT_PROCESSES + assert spec_mod.DEFAULT_MAX_CACHE_SIZE_MB == DEFAULT_MAX_CACHE_SIZE_MB + assert spec_mod.DEFAULT_MAX_BASELINE_SIZE_MB == DEFAULT_MAX_BASELINE_SIZE_MB + assert spec_mod.DEFAULT_BASELINE_PATH == DEFAULT_BASELINE_PATH + assert spec_mod.DEFAULTS_BY_DEST["coverage_min"] == DEFAULT_COVERAGE_MIN + + +def test_cli_parser_defaults_follow_contract_defaults() -> None: + args = build_parser("2.0.0").parse_args([]) + + assert args.root == DEFAULT_ROOT + assert args.min_loc == DEFAULT_MIN_LOC + assert args.min_stmt == DEFAULT_MIN_STMT + assert args.block_min_loc == DEFAULT_BLOCK_MIN_LOC + assert args.block_min_stmt == DEFAULT_BLOCK_MIN_STMT + assert args.segment_min_loc == DEFAULT_SEGMENT_MIN_LOC + assert args.segment_min_stmt == DEFAULT_SEGMENT_MIN_STMT + assert args.processes == DEFAULT_PROCESSES + assert args.max_cache_size_mb == DEFAULT_MAX_CACHE_SIZE_MB + assert args.baseline == DEFAULT_BASELINE_PATH + assert args.max_baseline_size_mb == DEFAULT_MAX_BASELINE_SIZE_MB + assert args.metrics_baseline == DEFAULT_BASELINE_PATH + assert args.coverage_min == DEFAULT_COVERAGE_MIN + + +def test_size_byte_limits_derive_from_contract_megabyte_defaults() -> None: + assert MAX_CACHE_SIZE_BYTES == DEFAULT_MAX_CACHE_SIZE_MB * 1024 * 1024 + assert MAX_BASELINE_SIZE_BYTES == DEFAULT_MAX_BASELINE_SIZE_MB * 1024 * 1024 + + +def test_runtime_and_gate_defaults_follow_contract_defaults(tmp_path: Path) -> None: + service = CodeCloneMCPService() + args = service._build_args( + root_path=tmp_path, + request=MCPAnalysisRequest(respect_pyproject=False), + ) + + assert DEFAULT_RUNTIME_PROCESSES == DEFAULT_PROCESSES + assert args.min_loc == DEFAULT_MIN_LOC + assert args.min_stmt == DEFAULT_MIN_STMT + assert args.block_min_loc == DEFAULT_BLOCK_MIN_LOC + assert args.block_min_stmt == DEFAULT_BLOCK_MIN_STMT + assert args.segment_min_loc == DEFAULT_SEGMENT_MIN_LOC + assert args.segment_min_stmt == DEFAULT_SEGMENT_MIN_STMT + assert args.max_cache_size_mb == DEFAULT_MAX_CACHE_SIZE_MB + assert args.max_baseline_size_mb == DEFAULT_MAX_BASELINE_SIZE_MB + assert args.baseline == DEFAULT_BASELINE_PATH + assert args.metrics_baseline == DEFAULT_BASELINE_PATH + assert args.coverage_min == DEFAULT_COVERAGE_MIN + assert MCPGateRequest().coverage_min == DEFAULT_COVERAGE_MIN + assert ( + MetricGateConfig( + fail_complexity=-1, + fail_coupling=-1, + fail_cohesion=-1, + fail_cycles=False, + fail_dead_code=False, + fail_health=-1, + fail_on_new_metrics=False, + ).coverage_min + == DEFAULT_COVERAGE_MIN + ) + + +def test_mcp_parser_and_builder_defaults_stay_in_sync() -> None: + args = mcp_server.build_parser().parse_args([]) + signature = inspect.signature(mcp_server.build_mcp_server) + + assert signature.parameters["history_limit"].default == args.history_limit + assert signature.parameters["host"].default == args.host + assert signature.parameters["port"].default == args.port + assert signature.parameters["json_response"].default == args.json_response + assert signature.parameters["stateless_http"].default == args.stateless_http + assert signature.parameters["debug"].default == args.debug + assert signature.parameters["log_level"].default == args.log_level + + +def test_dependency_depth_profile_contract_stays_shared_between_health_and_html() -> ( + None +): + health_source = inspect.getsource(health_mod._dependency_expected_tail) + html_source = inspect.getsource(html_dependencies_mod.render_dependencies_panel) + + assert "HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER" in health_source + assert "HEALTH_DEPENDENCY_DEPTH_P95_MARGIN" in 
health_source + assert HEALTH_DEPENDENCY_DEPTH_AVG_MULTIPLIER == 2.0 + assert HEALTH_DEPENDENCY_DEPTH_P95_MARGIN == 1 + assert "avg depth" in html_source + assert "p95 depth" in html_source + assert "HEALTH_DEPENDENCY_MAX_DEPTH_SAFE_ZONE" not in html_source diff --git a/tests/test_detector_golden.py b/tests/test_detector_golden.py index d03e103..fc1bebb 100644 --- a/tests/test_detector_golden.py +++ b/tests/test_detector_golden.py @@ -12,10 +12,10 @@ import pytest -from codeclone import extractor +from codeclone.analysis.normalizer import NormalizationConfig +from codeclone.analysis.units import extract_units_and_stats_from_source from codeclone.baseline import current_python_tag -from codeclone.normalize import NormalizationConfig -from codeclone.report import build_block_groups, build_groups +from codeclone.findings.clones.grouping import build_block_groups, build_groups from codeclone.scanner import module_name_from_path from tests._assertions import snapshot_python_tag @@ -29,7 +29,7 @@ def _detect_group_keys(project_root: Path) -> tuple[list[str], list[str]]: source = path.read_text("utf-8") module_name = module_name_from_path(str(project_root), str(path)) units, blocks, _segments, _source_stats, _file_metrics, _sf = ( - extractor.extract_units_and_stats_from_source( + extract_units_and_stats_from_source( source=source, filepath=str(path), module_name=module_name, diff --git a/tests/test_docs_example_report.py b/tests/test_docs_example_report.py new file mode 100644 index 0000000..eee1230 --- /dev/null +++ b/tests/test_docs_example_report.py @@ -0,0 +1,69 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0
+# Copyright (c) 2026 Den Rozhnovskiy
+
+from __future__ import annotations
+
+import runpy
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+
+def _load_docs_report_namespace() -> dict[str, object]:
+    script_path = (
+        Path(__file__).resolve().parents[1] / "scripts" / "build_docs_example_report.py"
+    )
+    return runpy.run_path(str(script_path))
+
+
+def test_docs_example_report_uses_main_entrypoint(
+    tmp_path: Path,
+) -> None:
+    module = _load_docs_report_namespace()
+    observed: dict[str, object] = {}
+
+    def _fake_run(
+        cmd: list[str],
+        *,
+        cwd: Path,
+        check: bool,
+    ) -> None:
+        observed["cmd"] = cmd
+        observed["cwd"] = cwd
+        observed["check"] = check
+
+    report_artifacts_type = module["ReportArtifacts"]
+    assert callable(report_artifacts_type)
+    artifacts = report_artifacts_type(
+        html=tmp_path / "index.html",
+        json=tmp_path / "report.json",
+        sarif=tmp_path / "report.sarif",
+        manifest=tmp_path / "manifest.json",
+    )
+    run_codeclone = module["_run_codeclone"]
+    assert callable(run_codeclone)
+
+    with patch("subprocess.run", side_effect=_fake_run):
+        run_codeclone(tmp_path, artifacts)
+
+    assert observed == {
+        "cmd": [
+            sys.executable,
+            "-m",
+            "codeclone.main",
+            str(tmp_path),
+            "--html",
+            str(artifacts.html),
+            "--json",
+            str(artifacts.json),
+            "--sarif",
+            str(artifacts.sarif),
+            "--no-progress",
+            "--quiet",
+        ],
+        "cwd": tmp_path,
+        "check": True,
+    }
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
index 33e89fe..82f327f 100644
--- a/tests/test_extractor.py
+++ b/tests/test_extractor.py
@@ -15,11 +15,14 @@
 import pytest
 
-from codeclone import extractor, qualnames
-from codeclone.errors import ParseError
-from codeclone.metrics import find_unused
-from codeclone.models import BlockUnit, ClassMetrics, ModuleDep, SegmentUnit
-from codeclone.normalize import NormalizationConfig
+import codeclone.analysis._module_walk as module_walk_mod
+import codeclone.analysis.parser as parser_mod
+import codeclone.analysis.units as units_mod
+from codeclone import qualnames
+from codeclone.analysis.normalizer import NormalizationConfig
+from codeclone.contracts.errors import ParseError
+from codeclone.metrics.dead_code import find_unused
+from codeclone.models import BlockUnit, ClassMetrics, ModuleDep, SegmentUnit, Unit
 from codeclone.qualnames import FunctionNode, QualnameCollector
 
 
@@ -36,12 +39,12 @@ def extract_units_from_source(
     segment_min_loc: int = 20,
     segment_min_stmt: int = 10,
 ) -> tuple[
-    list[extractor.Unit],
+    list[Unit],
     list[BlockUnit],
     list[SegmentUnit],
 ]:
     units, blocks, segments, _source_stats, _file_metrics, _sf = (
-        extractor.extract_units_and_stats_from_source(
+        units_mod.extract_units_and_stats_from_source(
             source=source,
             filepath=filepath,
             module_name=module_name,
@@ -71,9 +74,9 @@ def _collect_module_walk(
     *,
     module_name: str = "pkg.mod",
     collect_referenced_names: bool = True,
-) -> tuple[ast.Module, QualnameCollector, extractor._ModuleWalkResult]:
+) -> tuple[ast.Module, QualnameCollector, module_walk_mod._ModuleWalkResult]:
     tree, collector = _parse_tree_and_collector(source)
-    walk = extractor._collect_module_walk_data(
+    walk = module_walk_mod._collect_module_walk_data(
         tree=tree,
         module_name=module_name,
         collector=collector,
@@ -88,7 +91,7 @@ def _dead_qualnames_from_source(
     filepath: str = "pkg/mod.py",
     module_name: str = "pkg.mod",
 ) -> tuple[str, ...]:
-    _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, file_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source=source,
         filepath=filepath,
         module_name=module_name,
@@ -131,13 +134,13 @@ def foo():
 
 
 def test_source_tokens_returns_empty_on_tokenize_error() -> None:
-    assert extractor._source_tokens('"""') == ()
+    assert parser_mod._source_tokens('"""') == ()
 
 
 def test_declaration_token_index_returns_none_when_start_token_is_missing() -> None:
-    tokens = extractor._source_tokens("value = 1\n")
+    tokens = parser_mod._source_tokens("value = 1\n")
     assert (
-        extractor._declaration_token_index(
+        parser_mod._declaration_token_index(
             source_tokens=tokens,
             start_line=1,
             start_col=0,
@@ -148,11 +151,11 @@ def test_declaration_token_index_returns_none_when_start_token_is_missing() -> N
 
 
 def test_declaration_token_index_uses_prebuilt_index() -> None:
-    tokens = extractor._source_tokens("async def demo():\n return 1\n")
-    token_index = extractor._build_declaration_token_index(tokens)
+    tokens = parser_mod._source_tokens("async def demo():\n return 1\n")
+    token_index = parser_mod._build_declaration_token_index(tokens)
     assert (
-        extractor._declaration_token_index(
+        parser_mod._declaration_token_index(
             source_tokens=tokens,
             start_line=1,
             start_col=0,
@@ -171,11 +174,11 @@ async def demo():
 """
     ).body[0]
     assert isinstance(async_node, ast.AsyncFunctionDef)
-    assert extractor._declaration_token_name(async_node) == "async"
+    assert parser_mod._declaration_token_name(async_node) == "async"
 
-    tokens = extractor._source_tokens("def demo():\n return 1\n")
+    tokens = parser_mod._source_tokens("def demo():\n return 1\n")
     assert (
-        extractor._declaration_token_index(
+        parser_mod._declaration_token_index(
             source_tokens=tokens,
             start_line=1,
             start_col=0,
@@ -184,22 +187,22 @@ async def demo():
         == 0
     )
 
-    nested_tokens = extractor._source_tokens(
+    nested_tokens = parser_mod._source_tokens(
         "def demo(arg: tuple[int, int]) -> tuple[int, int]:\n return arg\n"
     )
     assert (
-        extractor._scan_declaration_colon_line(
+        parser_mod._scan_declaration_colon_line(
            source_tokens=nested_tokens,
            start_index=0,
         )
         == 1
     )
 
-    default_tokens = extractor._source_tokens(
+    default_tokens = parser_mod._source_tokens(
         "def demo(arg=(1, [2])):\n return arg\n"
     )
     assert (
-        extractor._scan_declaration_colon_line(
+        parser_mod._scan_declaration_colon_line(
            source_tokens=default_tokens,
            start_index=0,
         )
@@ -212,7 +215,7 @@ async def demo():
         tokenize.TokenInfo(tokenize.OP, "(", (1, 8), (1, 9), "def demo("),
     )
     assert (
-        extractor._scan_declaration_colon_line(
+        parser_mod._scan_declaration_colon_line(
            source_tokens=eof_tokens,
            start_index=0,
         )
@@ -224,7 +227,7 @@ async def demo():
         tokenize.TokenInfo(tokenize.OP, ")", (1, 8), (1, 9), "def demo)"),
     )
     assert (
-        extractor._scan_declaration_colon_line(
+        parser_mod._scan_declaration_colon_line(
            source_tokens=unmatched_close_tokens,
            start_index=0,
         )
@@ -233,9 +236,9 @@ async def demo():
 
 
 def test_scan_declaration_colon_line_returns_none_when_header_is_incomplete() -> None:
-    tokens = extractor._source_tokens("def broken\n")
+    tokens = parser_mod._source_tokens("def broken\n")
     assert (
-        extractor._scan_declaration_colon_line(
+        parser_mod._scan_declaration_colon_line(
            source_tokens=tokens,
            start_index=0,
         )
@@ -251,7 +254,7 @@ class Demo:
 """
     ).body[0]
     assert isinstance(node, ast.ClassDef)
-    assert extractor._declaration_end_line(node, source_tokens=()) == 2
+    assert parser_mod._declaration_end_line(node, source_tokens=()) == 2
 
 
 def test_declaration_end_line_returns_zero_for_invalid_start_line() -> None:
@@ -263,7 +266,7 @@ def broken():
     ).body[0]
     assert isinstance(node, ast.FunctionDef)
     node.lineno = 0
-    assert extractor._declaration_end_line(node, source_tokens=()) == 0
+    assert parser_mod._declaration_end_line(node, source_tokens=()) == 0
 
 
 def test_declaration_fallback_helpers_cover_empty_and_same_line_bodies() -> None:
@@ -275,7 +278,7 @@ def demo():
     ).body[0]
     assert isinstance(empty_body_node, ast.FunctionDef)
     empty_body_node.body = []
-    assert extractor._fallback_declaration_end_line(empty_body_node, start_line=2) == 2
+    assert parser_mod._fallback_declaration_end_line(empty_body_node, start_line=2) == 2
 
     inline_body_node = ast.parse(
         """
@@ -285,14 +288,16 @@ def demo():
     ).body[0]
     assert isinstance(inline_body_node, ast.FunctionDef)
     inline_body_node.body[0].lineno = 2
-    assert extractor._fallback_declaration_end_line(inline_body_node, start_line=2) == 2
+    assert (
+        parser_mod._fallback_declaration_end_line(inline_body_node, start_line=2) == 2
+    )
 
     no_colon_tokens = (
         tokenize.TokenInfo(tokenize.NAME, "def", (2, 0), (2, 3), "def demo"),
         tokenize.TokenInfo(tokenize.NAME, "demo", (2, 4), (2, 8), "def demo"),
     )
     assert (
-        extractor._declaration_end_line(
+        parser_mod._declaration_end_line(
             inline_body_node,
             source_tokens=no_colon_tokens,
         )
@@ -351,7 +356,7 @@ def test_extract_units_skips_suppression_tokenization_without_inline_directives(
     source: str,
 ) -> None:
     monkeypatch.setattr(
-        extractor,
+        module_walk_mod,
         "_source_tokens",
         lambda _source: (_ for _ in ()).throw(
             AssertionError("_source_tokens should not be called")
@@ -376,14 +381,17 @@ def test_extract_units_tokenizes_when_inline_suppressions_exist(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     calls = 0
-    original_source_tokens = extractor._source_tokens
+    original_source_tokens = cast(
+        "Callable[[str], tuple[tokenize.TokenInfo, ...]]",
+        module_walk_mod.__dict__["_source_tokens"],
+    )
 
     def _record_tokens(source: str) -> tuple[tokenize.TokenInfo, ...]:
         nonlocal calls
         calls += 1
         return original_source_tokens(source)
 
-    monkeypatch.setattr(extractor, "_source_tokens", _record_tokens)
+    monkeypatch.setattr(module_walk_mod, "_source_tokens", _record_tokens)
 
     units, blocks, segments = extract_units_from_source(
         source="""
@@ -424,7 +432,7 @@ def foo(x):
     return a + b + c + d + e
 """
     _units, _blocks, _segments, _source_stats, _file_metrics, sf = (
-        extractor.extract_units_and_stats_from_source(
+        units_mod.extract_units_and_stats_from_source(
             source=src,
             filepath="x.py",
             module_name="mod",
@@ -440,19 +448,19 @@ def foo(x):
 def test_parse_timeout_raises(monkeypatch: pytest.MonkeyPatch) -> None:
     @contextmanager
     def _boom(_timeout_s: int) -> Iterator[None]:
-        raise extractor._ParseTimeoutError("AST parsing timeout")
+        raise parser_mod._ParseTimeoutError("AST parsing timeout")
         if False:
             yield
 
-    monkeypatch.setattr(extractor, "_parse_limits", _boom)
+    monkeypatch.setattr(parser_mod, "_parse_limits", _boom)
     with pytest.raises(ParseError, match="AST parsing timeout"):
-        extractor._parse_with_limits("x = 1", 1)
+        parser_mod._parse_with_limits("x = 1", 1)
 
 
 def test_parse_limits_no_timeout() -> None:
-    with extractor._parse_limits(0):
-        tree = extractor._parse_with_limits("x = 1", 0)
+    with parser_mod._parse_limits(0):
+        tree = parser_mod._parse_with_limits("x = 1", 0)
     assert tree is not None
@@ -481,8 +489,8 @@ def setrlimit(_key: int, _val: tuple[int, int]) -> None:
     _patch_posix_parse_limits(monkeypatch, _DummyResource)
 
-    with extractor._parse_limits(1):
-        tree = extractor._parse_with_limits("x = 1", 1)
+    with parser_mod._parse_limits(1):
+        tree = parser_mod._parse_with_limits("x = 1", 1)
     assert tree is not None
@@ -505,7 +513,7 @@ def setrlimit(_key: int, val: tuple[int, int]) -> None:
     _patch_posix_parse_limits(monkeypatch, _DummyResource)
 
-    with extractor._parse_limits(5):
+    with parser_mod._parse_limits(5):
         pass
 
     assert calls
@@ -546,7 +554,7 @@ def getrusage(_who: int) -> _DummyUsage:
     _patch_posix_parse_limits(monkeypatch, _DummyResource)
 
-    with extractor._parse_limits(5):
+    with parser_mod._parse_limits(5):
         pass
 
     assert calls
@@ -586,7 +594,7 @@ def getrusage(_who: int) -> _DummyUsage:
     _patch_posix_parse_limits(monkeypatch, _DummyResource)
 
-    with extractor._parse_limits(5):
+    with parser_mod._parse_limits(5):
         pass
 
     # Raised from 2 to ceil(10)+5 to avoid immediate SIGXCPU.
@@ -613,7 +621,7 @@ def setrlimit(_key: int, val: tuple[int, int]) -> None:
     _patch_posix_parse_limits(monkeypatch, _DummyResource)
 
-    with extractor._parse_limits(5):
+    with parser_mod._parse_limits(5):
         pass
 
     # Finite soft limits are never lowered.
@@ -646,22 +654,26 @@ def setrlimit(_key: int, _val: tuple[int, int]) -> None:
     monkeypatch.setitem(sys.modules, "resource", _DummyResource)
 
     # Should not raise even if restoring old limits fails.
-    with extractor._parse_limits(5):
+    with parser_mod._parse_limits(5):
         pass
 
 
 def test_resolve_import_target_absolute_and_relative() -> None:
     absolute = ast.ImportFrom(module="pkg.util", names=[], level=0)
-    assert extractor._resolve_import_target("root.mod.sub", absolute) == "pkg.util"
+    assert (
+        module_walk_mod._resolve_import_target("root.mod.sub", absolute) == "pkg.util"
+    )
 
     relative = ast.ImportFrom(module="helpers", names=[], level=1)
     assert (
-        extractor._resolve_import_target("root.mod.sub", relative) == "root.mod.helpers"
+        module_walk_mod._resolve_import_target("root.mod.sub", relative)
+        == "root.mod.helpers"
     )
 
     relative_no_module = ast.ImportFrom(module=None, names=[], level=2)
     assert (
-        extractor._resolve_import_target("root.mod.sub", relative_no_module) == "root"
+        module_walk_mod._resolve_import_target("root.mod.sub", relative_no_module)
+        == "root"
     )
@@ -680,7 +692,7 @@ def test_collect_module_walk_data_imports_and_references() -> None:
     )
     collector = QualnameCollector()
     collector.visit(tree)
-    walk = extractor._collect_module_walk_data(
+    walk = module_walk_mod._collect_module_walk_data(
         tree=tree,
         module_name="root.mod.sub",
         collector=collector,
@@ -720,7 +732,7 @@ def test_collect_module_walk_data_edge_branches() -> None:
     tree = ast.parse("from .... import parent")
     collector = QualnameCollector()
     collector.visit(tree)
-    walk = extractor._collect_module_walk_data(
+    walk = module_walk_mod._collect_module_walk_data(
         tree=tree,
         module_name="pkg.mod",
         collector=collector,
@@ -733,7 +745,7 @@ def test_collect_module_walk_data_edge_branches() -> None:
     lambda_call_tree = ast.parse("(lambda x: x)(1)")
     lambda_collector = QualnameCollector()
     lambda_collector.visit(lambda_call_tree)
-    lambda_walk = extractor._collect_module_walk_data(
+    lambda_walk = module_walk_mod._collect_module_walk_data(
         tree=lambda_call_tree,
         module_name="pkg.mod",
         collector=lambda_collector,
@@ -752,7 +764,7 @@ def test_collect_module_walk_data_without_referenced_name_collection() -> None:
     )
     collector = QualnameCollector()
     collector.visit(tree)
-    walk = extractor._collect_module_walk_data(
+    walk = module_walk_mod._collect_module_walk_data(
         tree=tree,
         module_name="root.mod.sub",
         collector=collector,
@@ -777,12 +789,12 @@ def test_collect_module_walk_data_without_referenced_name_collection() -> None:
 
 
 def test_module_walk_helpers_cover_import_and_reference_branches() -> None:
-    state = extractor._ModuleWalkState()
+    state = module_walk_mod._ModuleWalkState()
     import_node = cast(
         ast.Import,
         ast.parse("import typing_extensions as te").body[0],
     )
-    extractor._collect_import_node(
+    module_walk_mod._collect_import_node(
         node=import_node,
         module_name="pkg.mod",
         state=state,
@@ -796,7 +808,7 @@ def test_module_walk_helpers_cover_import_and_reference_branches() -> None:
         ast.ImportFrom,
         ast.parse("from typing import Protocol as Proto, Thing as Alias").body[0],
     )
-    extractor._collect_import_from_node(
+    module_walk_mod._collect_import_from_node(
         node=import_from_node,
         module_name="pkg.mod",
         state=state,
@@ -810,7 +822,7 @@ def test_module_walk_helpers_cover_import_and_reference_branches() -> None:
         names=[ast.alias(name="parent", asname=None)],
         level=4,
     )
-    extractor._collect_import_from_node(
+    module_walk_mod._collect_import_from_node(
         node=unresolved_import,
         module_name="pkg.mod",
         state=state,
@@ -820,9 +832,9 @@ def test_module_walk_helpers_cover_import_and_reference_branches() -> None:
 
     name_node = cast(ast.Name, ast.parse("value", mode="eval").body)
     attr_node = cast(ast.Attribute, ast.parse("obj.attr", mode="eval").body)
-    extractor._collect_load_reference_node(node=name_node, state=state)
-    extractor._collect_load_reference_node(node=attr_node, state=state)
-    extractor._collect_load_reference_node(
+    module_walk_mod._collect_load_reference_node(node=name_node, state=state)
+    module_walk_mod._collect_load_reference_node(node=attr_node, state=state)
+    module_walk_mod._collect_load_reference_node(
         node=cast(ast.Constant, ast.parse("1", mode="eval").body),
         state=state,
     )
@@ -832,8 +844,11 @@ def test_module_walk_helpers_cover_import_and_reference_branches() -> None:
 
 def test_dotted_expr_protocol_detection_and_runtime_candidate_edges() -> None:
     dotted_expr = ast.parse("pkg.helpers.decorate", mode="eval").body
-    assert extractor._dotted_expr_name(dotted_expr) == "pkg.helpers.decorate"
-    assert extractor._dotted_expr_name(ast.parse("custom()", mode="eval").body) is None
+    assert module_walk_mod._dotted_expr_name(dotted_expr) == "pkg.helpers.decorate"
+    assert (
+        module_walk_mod._dotted_expr_name(ast.parse("custom()", mode="eval").body)
+        is None
+    )
 
     tree = ast.parse(
         """
@@ -848,7 +863,7 @@ class B(te.Protocol[int]):
     )
     collector = QualnameCollector()
     collector.visit(tree)
-    walk = extractor._collect_module_walk_data(
+    walk = module_walk_mod._collect_module_walk_data(
         tree=tree,
         module_name="pkg.mod",
         collector=collector,
@@ -859,12 +874,12 @@ class B(te.Protocol[int]):
     assert "te" in protocol_module_aliases
     classes = [node for node in tree.body if isinstance(node, ast.ClassDef)]
     class_a, class_b = classes
-    assert extractor._is_protocol_class(
+    assert module_walk_mod._is_protocol_class(
         class_a,
         protocol_symbol_aliases=protocol_symbol_aliases,
         protocol_module_aliases=protocol_module_aliases,
     )
-    assert not extractor._is_protocol_class(
+    assert not module_walk_mod._is_protocol_class(
         class_b,
         protocol_symbol_aliases=protocol_symbol_aliases,
         protocol_module_aliases=protocol_module_aliases,
@@ -880,7 +895,7 @@ def f(x):
 """.strip()
     ).body[0]
     assert isinstance(runtime_candidate, ast.FunctionDef)
-    assert extractor._is_non_runtime_candidate(runtime_candidate)
+    assert module_walk_mod._is_non_runtime_candidate(runtime_candidate)
 
 
 def test_resolve_referenced_qualnames_covers_module_class_and_attr_branches() -> None:
@@ -899,26 +914,26 @@ def hook(self) -> int:
 dynamic = factory().attr
 """
     tree, collector = _parse_tree_and_collector(src)
-    state = extractor._ModuleWalkState()
+    state = module_walk_mod._ModuleWalkState()
     for node in ast.walk(tree):
         if isinstance(node, ast.Import):
-            extractor._collect_import_node(
+            module_walk_mod._collect_import_node(
                 node=node,
                 module_name="pkg.mod",
                 state=state,
                 collect_referenced_names=True,
             )
         elif isinstance(node, ast.ImportFrom):
-            extractor._collect_import_from_node(
+            module_walk_mod._collect_import_from_node(
                 node=node,
                 module_name="pkg.mod",
                 state=state,
                 collect_referenced_names=True,
            )
         else:
-            extractor._collect_load_reference_node(node=node, state=state)
+            module_walk_mod._collect_load_reference_node(node=node, state=state)
 
-    resolved = extractor._resolve_referenced_qualnames(
+    resolved = module_walk_mod._resolve_referenced_qualnames(
         module_name="pkg.mod",
         collector=collector,
         state=state,
@@ -961,7 +976,7 @@ def test_extractor_private_helper_branches_cover_invalid_protocol_and_declaratio
         attr="method",
         ctx=ast.Load(),
     )
-    assert extractor._dotted_expr_name(expr) is None
+    assert module_walk_mod._dotted_expr_name(expr) is None
 
     protocol_class = ast.parse(
         """
@@ -971,7 +986,7 @@ class Demo(Unknown, alias.Protocol):
     ).body[0]
     assert isinstance(protocol_class, ast.ClassDef)
     assert (
-        extractor._is_protocol_class(
+        module_walk_mod._is_protocol_class(
             protocol_class,
             protocol_symbol_aliases=frozenset({"Protocol"}),
             protocol_module_aliases=frozenset({"typing"}),
@@ -988,7 +1003,7 @@ def demo():
     assert isinstance(bad_span_node, ast.FunctionDef)
     bad_span_node.lineno = 3
     bad_span_node.end_lineno = 2
-    assert extractor._eligible_unit_shape(bad_span_node, min_loc=1, min_stmt=1) is None
+    assert units_mod._eligible_unit_shape(bad_span_node, min_loc=1, min_stmt=1) is None
 
     _, missing_method_collector, missing_method_walk = _collect_module_walk(
         """
@@ -1012,7 +1027,7 @@ def work(self) -> int:
     declaration_collector.units[0][1].end_lineno = 0
     declaration_collector.class_nodes[0][1].end_lineno = 0
     assert (
-        extractor._collect_declaration_targets(
+        module_walk_mod._collect_declaration_targets(
             filepath="pkg/mod.py",
             module_name="pkg.mod",
             collector=declaration_collector,
@@ -1025,8 +1040,8 @@ def demo():  # codeclone: ignore[dead-code]
     return 1
 """
     _, suppression_collector = _parse_tree_and_collector(suppression_source)
-    monkeypatch.setattr(extractor, "_source_tokens", lambda _source: ())
-    suppression_index = extractor._build_suppression_index_for_source(
+    monkeypatch.setattr(module_walk_mod, "_source_tokens", lambda _source: ())
+    suppression_index = module_walk_mod._build_suppression_index_for_source(
         source=suppression_source,
         filepath="pkg/mod.py",
         module_name="pkg.mod",
@@ -1041,7 +1056,7 @@ def test_extract_stats_drops_referenced_names_for_test_filepaths() -> None:
     live()
 """
-    _, _, _, _, test_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, test_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source=src,
         filepath="pkg/tests/test_usage.py",
         module_name="pkg.tests.test_usage",
@@ -1049,7 +1064,7 @@ def test_extract_stats_drops_referenced_names_for_test_filepaths() -> None:
         min_loc=1,
         min_stmt=1,
     )
-    _, _, _, _, regular_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, regular_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source=src,
         filepath="pkg/usage.py",
         module_name="pkg.usage",
@@ -1086,7 +1101,7 @@ def verify(self):
 def make():
     return Service()
 """
-    _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, file_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source=src,
         filepath="pkg/service.py",
         module_name="pkg.service",
@@ -1123,7 +1138,7 @@ def test_orphan_usage():
     assert orphan() == 1
 """
 
-    _, _, _, _, prod_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, prod_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source=src_prod,
         filepath="pkg/mod.py",
         module_name="pkg.mod",
@@ -1131,7 +1146,7 @@ def test_orphan_usage():
         min_loc=1,
         min_stmt=1,
     )
-    _, _, _, _, test_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, test_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source=src_test,
         filepath="pkg/tests/test_mod.py",
         module_name="pkg.tests.test_mod",
@@ -1264,7 +1279,7 @@ def used():
     broken_class.lineno = 0
     broken_class.end_lineno = 0
     collector.class_nodes.append(("Broken", broken_class))
-    dead = extractor._collect_dead_candidates(
+    dead = module_walk_mod._collect_dead_candidates(
         filepath="pkg/mod.py",
         module_name="pkg.mod",
         collector=collector,
@@ -1283,7 +1298,7 @@ def visit(self, _tree: ast.AST) -> None:
             return None
 
     monkeypatch.setattr(qualnames, "QualnameCollector", _CollectorNoClassMetrics)
-    _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, file_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source="class Broken:\n pass\n",
         filepath="pkg/mod.py",
         module_name="pkg.mod",
@@ -1303,7 +1318,7 @@ def wrapper():
     value = _run_impl()
     return helpers.decorate(value)
 """
-    _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source(
+    _, _, _, _, file_metrics, _ = units_mod.extract_units_and_stats_from_source(
         source=src,
         filepath="pkg/cli.py",
         module_name="pkg.cli",
@@ -1335,7 +1350,7 @@ def parse_value(value: object) -> str:
     return str(value)
 """
     _tree, collector, walk = _collect_module_walk(src)
-    dead = extractor._collect_dead_candidates(
+    dead = module_walk_mod._collect_dead_candidates(
         filepath="pkg/mod.py",
         module_name="pkg.mod",
         collector=collector,
@@ -1515,7 +1530,7 @@ def _extract_with_thresholds(
         stmt_count: int,
         lines_per_stmt: int,
         **thresholds: int,
-    ) -> tuple[list[extractor.Unit], list[BlockUnit], list[SegmentUnit]]:
+    ) -> tuple[list[Unit], list[BlockUnit], list[SegmentUnit]]:
         return extract_units_from_source(
             source=self._make_func(
                 stmt_count=stmt_count,
@@ -1691,10 +1706,10 @@ def _fake_extract_segments(
         captured_hashes["value"] = precomputed_hashes
         return []
 
-    monkeypatch.setattr(extractor, "_parse_with_limits", _fake_parse)
-    monkeypatch.setattr(extractor, "_stmt_count", lambda _node: 12)
-    monkeypatch.setattr(extractor, "_cfg_fingerprint_and_complexity", _fake_fingerprint)
-    monkeypatch.setattr(extractor, "extract_segments", _fake_extract_segments)
+    monkeypatch.setattr(units_mod, "_parse_with_limits", _fake_parse)
+    monkeypatch.setattr(units_mod, "_stmt_count", lambda _node: 12)
+    monkeypatch.setattr(units_mod, "_cfg_fingerprint_and_complexity", _fake_fingerprint)
+    monkeypatch.setattr(units_mod, "extract_segments", _fake_extract_segments)
 
     units, blocks, segments = extract_units_from_source(
         source="def f():\n pass\n",
@@ -1725,7 +1740,7 @@ def f():
     def _fake_parse(_source: str, _timeout_s: int) -> ast.AST:
         return tree
 
-    monkeypatch.setattr(extractor, "_parse_with_limits", _fake_parse)
+    monkeypatch.setattr(units_mod, "_parse_with_limits", _fake_parse)
     units, blocks, segments = extract_units_from_source(
         source="def f():\n return 1\n",
         filepath="x.py",
@@ -1771,4 +1786,4 @@ def _fake_signal(_sig: int, handler: Callable[[int, object], None] | None) -> No
     monkeypatch.setattr(signal, "setitimer", lambda *_args, **_kwargs: None)
 
     with pytest.raises(ParseError, match="AST parsing timeout"):
-        extractor._parse_with_limits("x = 1", 1)
+        parser_mod._parse_with_limits("x = 1", 1)
diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py
index a785d15..278107e 100644
--- a/tests/test_fingerprint.py
+++ b/tests/test_fingerprint.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: MPL-2.0
 # Copyright (c) 2026 Den Rozhnovskiy
 
-from codeclone.fingerprint import bucket_loc, sha1
+from codeclone.analysis.fingerprint import bucket_loc, sha1
 
 
 def test_sha1_stable() -> None:
diff --git a/tests/test_gating.py b/tests/test_gating.py
new file mode 100644
index 0000000..641c24d
--- /dev/null
+++ b/tests/test_gating.py
@@ -0,0 +1,218 @@
+from __future__ import annotations
+
+from argparse import Namespace
+from pathlib import Path
+
+from codeclone.analysis.normalizer import NormalizationConfig
+from codeclone.core._types import AnalysisResult, BootstrapResult, OutputPaths
+from codeclone.core.reporting import gate as cli_gate
+from codeclone.models import (
+    DeadItem,
+    HealthScore,
+    MetricsDiff,
+    ModuleDep,
+    ProjectMetrics,
+)
+from codeclone.report.gates.evaluator import MetricGateConfig, evaluate_gates
+from codeclone.surfaces.mcp.service import CodeCloneMCPService
+from codeclone.surfaces.mcp.session import (
+    MCPAnalysisRequest,
+    MCPGateRequest,
+    MCPRunRecord,
+)
+
+
+def _project_metrics() -> ProjectMetrics:
+    return ProjectMetrics(
+        complexity_avg=10.0,
+        complexity_max=30,
+        high_risk_functions=("pkg.mod:hot",),
+        coupling_avg=5.0,
+        coupling_max=12,
+        high_risk_classes=("pkg.mod:Service",),
+        cohesion_avg=2.5,
+        cohesion_max=4,
+        low_cohesion_classes=("pkg.mod:Service",),
+        dependency_modules=2,
+        dependency_edges=1,
+        dependency_edge_list=(
+            ModuleDep(source="pkg.mod", target="pkg.dep", import_type="import", line=1),
+        ),
+        dependency_cycles=(),
+        dependency_max_depth=1,
+        dependency_longest_chains=(),
+        dead_code=(
+            DeadItem(
+                qualname="pkg.mod:unused",
+                filepath="pkg/mod.py",
+                start_line=1,
+                end_line=2,
+                kind="function",
+                confidence="high",
+            ),
+        ),
+        health=HealthScore(total=90, grade="A", dimensions={"health": 90}),
+    )
+
+
+def _report_document() -> dict[str, object]:
+    return {
+        "meta": {"baseline": {"status": "ok"}},
+        "findings": {
+            "groups": {
+                "clones": {
+                    "functions": [{"id": "clone:function:new", "novelty": "new"}],
+                    "blocks": [],
+                    "segments": [],
+                }
+            }
+        },
+        "metrics": {
+            "families": {
+                "complexity": {"summary": {"max": 30}},
+                "coupling": {"summary": {"max": 12}},
+                "cohesion": {"summary": {"max": 4}},
+                "dependencies": {"summary": {"cycles": 0}},
+                "dead_code": {"summary": {"high_confidence": 1}},
+                "health": {"summary": {"score": 90}},
+                "coverage_adoption": {
+                    "summary": {
+                        "param_permille": 1000,
+                        "docstring_permille": 1000,
+                        "param_delta": 0,
+                        "return_delta": 0,
+                        "docstring_delta": 0,
+                    }
+                },
+                "api_surface": {"summary": {"breaking": 0}},
+                "coverage_join": {"summary": {"status": "", "coverage_hotspots": 0}},
+            }
+        },
+    }
+
+
+def test_cli_and_mcp_gate_results_match_for_same_inputs(tmp_path: Path) -> None:
+    report_document = _report_document()
+    project_metrics = _project_metrics()
+    metrics_diff = MetricsDiff(
+        new_high_risk_functions=(),
+        new_high_coupling_classes=(),
+        new_cycles=(),
+        new_dead_code=("pkg.mod:unused",),
+        health_delta=-1,
+    )
+    config = MetricGateConfig(
+        fail_complexity=20,
+        fail_coupling=-1,
+        fail_cohesion=-1,
+        fail_cycles=False,
+        fail_dead_code=True,
+        fail_health=-1,
+        fail_on_new_metrics=True,
+        fail_on_new=True,
+        fail_threshold=0,
+    )
+
+    args = Namespace(
+        fail_complexity=config.fail_complexity,
+        fail_coupling=config.fail_coupling,
+        fail_cohesion=config.fail_cohesion,
+        fail_cycles=config.fail_cycles,
+        fail_dead_code=config.fail_dead_code,
+        fail_health=config.fail_health,
+        fail_on_new_metrics=config.fail_on_new_metrics,
+        fail_on_typing_regression=config.fail_on_typing_regression,
+        fail_on_docstring_regression=config.fail_on_docstring_regression,
+        fail_on_api_break=config.fail_on_api_break,
+        fail_on_untested_hotspots=config.fail_on_untested_hotspots,
+        min_typing_coverage=config.min_typing_coverage,
+        min_docstring_coverage=config.min_docstring_coverage,
+        coverage_min=config.coverage_min,
+        fail_on_new=config.fail_on_new,
+        fail_threshold=config.fail_threshold,
+    )
+    boot = BootstrapResult(
+        root=tmp_path,
+        config=NormalizationConfig(),
+        args=args,
+        output_paths=OutputPaths(),
+        cache_path=tmp_path / "cache.json",
+    )
+    analysis = AnalysisResult(
+        func_groups={},
+        block_groups={},
+        block_groups_report={},
+        segment_groups={},
+        suppressed_segment_groups=0,
+        block_group_facts={},
+        func_clones_count=1,
+        block_clones_count=0,
+        segment_clones_count=0,
+        files_analyzed_or_cached=1,
+        project_metrics=project_metrics,
+        metrics_payload=None,
+        suggestions=(),
+        segment_groups_raw_digest="",
+    )
+
+    cli_result = cli_gate(
+        boot=boot,
+        analysis=analysis,
+        new_func={"clone:function:new"},
+        new_block=set(),
+        metrics_diff=metrics_diff,
+    )
+
+    service = CodeCloneMCPService(history_limit=2)
+    request = MCPAnalysisRequest(root=str(tmp_path), respect_pyproject=False)
+    record = MCPRunRecord(
+        run_id="gate-parity",
+        root=tmp_path,
+        request=request,
+        comparison_settings=(),
+        report_document=report_document,
+        summary={},
+        changed_paths=(),
+        changed_projection=None,
+        warnings=(),
+        failures=(),
+        func_clones_count=1,
+        block_clones_count=0,
+        project_metrics=project_metrics,
+        coverage_join=None,
+        suggestions=(),
+        new_func=frozenset({"clone:function:new"}),
+        new_block=frozenset(),
+        metrics_diff=metrics_diff,
+    )
+    mcp_result = service._evaluate_gate_snapshot(
+        record=record,
+        request=MCPGateRequest(
+            fail_complexity=20,
+            fail_dead_code=True,
+            fail_on_new_metrics=True,
+            fail_on_new=True,
+            fail_threshold=0,
+        ),
+    )
+
+    evaluator_result = evaluate_gates(
+        report_document=report_document,
+        config=config,
+        baseline_status="ok",
+        metrics_diff=metrics_diff,
+        clone_new_count=1,
+        clone_total=1,
+    )
+
+    expected_reasons = (
+        "metric:Complexity threshold exceeded: max CC=30, threshold=20.",
+        "metric:Dead code detected (high confidence): 1 item(s).",
+        "metric:New dead code items vs metrics baseline: 1.",
+        "metric:Health score regressed vs metrics baseline: delta=-1.",
+        "clone:new",
+        "clone:threshold:1:0",
+    )
+
+    assert cli_result == mcp_result == evaluator_result
+    assert cli_result.reasons == expected_reasons
diff --git a/tests/test_golden_fixtures.py b/tests/test_golden_fixtures.py
index 398e52d..46d52d1 100644
--- a/tests/test_golden_fixtures.py
+++ b/tests/test_golden_fixtures.py
@@ -8,7 +8,7 @@
 
 import pytest
 
-from codeclone.golden_fixtures import (
+from codeclone.findings.clones.golden_fixtures import (
     GoldenFixturePatternError,
     build_suppressed_clone_groups,
     normalize_golden_fixture_patterns,
diff --git a/tests/test_golden_v2.py b/tests/test_golden_v2.py
index 3de17ea..4ef283c 100644
--- a/tests/test_golden_v2.py
+++ b/tests/test_golden_v2.py
@@ -16,16 +16,22 @@
 
 import pytest
 
-import codeclone.pipeline as pipeline
-from codeclone import cli
+import codeclone.core.parallelism as core_parallelism
+import codeclone.main as cli
+from codeclone.analysis.normalizer import NormalizationConfig
+from codeclone.analysis.units import extract_units_and_stats_from_source
 from codeclone.baseline import current_python_tag
-from codeclone.extractor import extract_units_and_stats_from_source
-from codeclone.grouping import build_block_groups, build_groups, build_segment_groups
+from codeclone.core.pipeline import compute_project_metrics
+from codeclone.findings.clones.grouping import (
+    build_block_groups,
+    build_groups,
+    build_segment_groups,
+)
+from codeclone.findings.structural.detectors import (
+    build_clone_cohort_structural_findings,
+)
 from codeclone.models import ClassMetrics, DeadCandidate, ModuleDep
-from codeclone.normalize import NormalizationConfig
-from codeclone.pipeline import compute_project_metrics
 from codeclone.scanner import iter_py_files, module_name_from_path
-from codeclone.structural_findings import build_clone_cohort_structural_findings
 from tests._assertions import snapshot_python_tag
 
 _GOLDEN_V2_ROOT = Path("tests/fixtures/golden_v2").resolve()
@@ -59,8 +65,12 @@ def _dummy_process_pool_executor(
 
 
 def _patch_parallel(monkeypatch: pytest.MonkeyPatch) -> None:
-    monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _dummy_process_pool_executor)
-    monkeypatch.setattr(pipeline, "as_completed", lambda futures: futures)
+    monkeypatch.setattr(
+        core_parallelism,
+        "ProcessPoolExecutor",
+        _dummy_process_pool_executor,
+    )
+    monkeypatch.setattr(core_parallelism, "as_completed", lambda futures: futures)
 
 
 def _relative_to_root(path: str, root: Path) -> str:
diff --git a/tests/test_html_report.py b/tests/test_html_report.py
index 45dfd3b..7f72e61 100644
--- a/tests/test_html_report.py
+++ b/tests/test_html_report.py
@@ -6,13 +6,16 @@
 
 import importlib
 import json
+import re
 from collections.abc import Callable
+from itertools import pairwise
 from pathlib import Path
+from types import SimpleNamespace
 from typing import Any, cast
 
 import pytest
 
-from codeclone._html_badges import _tab_empty_info
+from codeclone.baseline.trust import current_python_tag
 from codeclone.contracts import (
     CACHE_VERSION,
     DOCS_URL,
@@ -20,29 +23,40 @@
     REPORT_SCHEMA_VERSION,
     REPOSITORY_URL,
 )
-from codeclone.errors import FileProcessingError
-from codeclone.html_report import (
-    _FileCache,
-    _pygments_css,
-    _render_code_block,
-    _try_pygments,
-)
-from codeclone.html_report import (
-    build_html_report as _core_build_html_report,
-)
+from codeclone.contracts.errors import FileProcessingError
+from codeclone.findings.ids import clone_group_id, structural_group_id
 from codeclone.models import (
     StructuralFindingGroup,
     StructuralFindingOccurrence,
     Suggestion,
     SuppressedCloneGroup,
 )
-from codeclone.report import build_block_group_facts
-from codeclone.report.json_contract import (
-    build_report_document,
-    clone_group_id,
-    structural_group_id,
+from codeclone.report.document.builder import build_report_document
+from codeclone.report.explain import build_block_group_facts
+from codeclone.report.html import (
+    build_html_report as _core_build_html_report,
+)
+from codeclone.report.html.primitives.location import (
+    location_file_target,
+    relative_location_path,
 )
-from codeclone.report.serialize import render_json_report_document
+from codeclone.report.html.sections._security_surfaces import (
+    _coverage_join_review_text,
+    _coverage_review_cues,
+    _coverage_review_index,
+    _coverage_review_item_key,
+    _coverage_review_key,
+    _pluralize,
+    _review_cell_text,
+)
+from codeclone.report.html.widgets.badges import _tab_empty_info
+from codeclone.report.html.widgets.snippets import (
+    _FileCache,
+    _pygments_css,
+    _render_code_block,
+    _try_pygments,
+)
+from codeclone.report.renderers.json import render_json_report_document
 from tests._assertions import assert_contains_all
 from tests._report_fixtures import (
     REPEATED_ASSERT_SOURCE,
@@ -1276,7 +1290,7 @@ def test_try_pygments_ok() -> None:
 def test_render_code_block_without_pygments_uses_escaped_fallback(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:
-    import codeclone._html_snippets as snippets
+    import codeclone.report.html.widgets.snippets as snippets
 
     src = tmp_path / "a.py"
     src.write_text("x = ''\n", "utf-8")
@@ -1327,14 +1341,14 @@ def test_html_report_with_blocks(tmp_path: Path) -> None:
 
 
 def test_html_report_pygments_fallback(monkeypatch: pytest.MonkeyPatch) -> None:
-    import codeclone.html_report as hr
+    import codeclone.report.html.widgets.snippets as snippets
 
     def _fake_css(name: str) -> str:
         if name in ("github-dark", "github-light"):
             return ""
         return "x"
 
-    monkeypatch.setattr(hr, "_pygments_css", _fake_css)
+    monkeypatch.setattr(snippets, "_pygments_css", _fake_css)
     html = build_html_report(
         func_groups={}, block_groups={}, segment_groups={}, title="Pygments"
     )
@@ -1460,11 +1474,11 @@ def test_render_code_block_truncates_and_fallback(
     f = tmp_path / "a.py"
     f.write_text("\n".join([f"line{i}" for i in range(1, 30)]), "utf-8")
 
-    import codeclone.html_report as hr
+    import codeclone.report.html.widgets.snippets as snippets
 
-    monkeypatch.setattr(hr, "_try_pygments", lambda _text: None)
+    monkeypatch.setattr(snippets, "_try_pygments", lambda _text: None)
     cache = _FileCache(maxsize=2)
-    snippet = hr._render_code_block(
+    snippet = snippets._render_code_block(
         filepath=str(f),
         start_line=1,
         end_line=20,
@@ -1511,6 +1525,8 @@ def _metrics_payload(
     dep_max_depth: object,
     dead_total: object,
     dead_critical: object,
+    dep_avg_depth: object = 2.5,
+    dep_p95_depth: object = 3,
     dead_suppressed: object = 0,
 ) -> dict[str, object]:
     suppressed_items: list[dict[str, object]] = []
@@ -1597,6 +1613,8 @@ def _metrics_payload(
             "modules": 4,
             "edges": 4,
             "max_depth": dep_max_depth,
+            "avg_depth": dep_avg_depth,
+            "p95_depth": dep_p95_depth,
             "cycles": dep_cycles,
             "longest_chains": [["pkg.a", "pkg.b", "pkg.c"]],
             "edge_list": [
@@ -1693,6 +1711,7 @@ def test_html_report_metrics_warn_branches_and_dependency_svg() -> None:
     assert "insight-warn" in html
     assert "dep-graph-svg" in html
     assert "Grade B" in html
+    assert "Cycles: 0; avg depth: 2.5; p95 depth: 3; max dependency depth: 9." in html
     assert "pkg.mod.func" in html
     assert "outside/project/pkg/mod.py" in html
@@ -1720,7 +1739,7 @@ def test_html_report_metrics_risk_branches() -> None:
         html,
         "insight-risk",
         'stroke="var(--error)"',
-        "Cycles: 1; max dependency depth: 4.",
+        "Cycles: 1; avg depth: 2.5; p95 depth: 3; max dependency depth: 4.",
         "5 candidates total; 2 high-confidence items; 0 suppressed.",
         '